diff --git a/examples/pom.xml b/examples/pom.xml index f5ab2a7fdc09..6852ba93cc1d 100644 --- a/examples/pom.xml +++ b/examples/pom.xml @@ -102,6 +102,11 @@ spark-streaming-kafka_${scala.binary.version} ${project.version} + + org.apache.spark + spark-streaming-kafka-v09_${scala.binary.version} + ${project.version} + org.apache.hbase hbase-testing-util diff --git a/examples/src/main/scala/org/apache/spark/examples/streaming/v09DirectKafkaWordCount.scala b/examples/src/main/scala/org/apache/spark/examples/streaming/v09DirectKafkaWordCount.scala new file mode 100644 index 000000000000..7dca188dd7a1 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/streaming/v09DirectKafkaWordCount.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.streaming + +import kafka.serializer.StringDecoder +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.common.serialization.StringDeserializer + +import org.apache.spark.streaming._ +import org.apache.spark.streaming.kafka.v09._ +import org.apache.spark.SparkConf + +/** + * Consumes messages from one or more topics in Kafka and does wordcount. 
+ * Usage: v09DirectKafkaWordCount + * is a list of one or more Kafka brokers + * is a list of one or more kafka topics to consume from + * is the name of kafka consumer group + * What to do when there is no initial offset in Kafka or + * if the current offset does not exist any more on the server + * earliest: automatically reset the offset to the earliest offset + * latest: automatically reset the offset to the latest offset + * is the time interval at which streaming data will be divided into batches + * is time, in milliseconds, spent waiting in Kafka consumer poll + * if data is not available + * Example: + * $ bin/run-example streaming.v09DirectKafkaWordCount broker1-host:port,broker2-host:port \ + * topic1,topic2 my-consumer-group latest batch-interval pollTimeout + */ +object v09DirectKafkaWordCount { + def main(args: Array[String]) { + if (args.length < 2) { + System.err.println(s""" + |Usage: v09DirectKafkaWordCount + | is a list of one or more Kafka brokers + | is a list of one or more kafka topics to consume from + | is the name of kafka consumer group + | What to do when there is no initial offset + | in Kafka or if the current offset does not exist + | any more on the server + | earliest: automatically reset the offset + | to the earliest offset + | latest: automatically reset the offset + | to the latest offset + | is the time interval at which + | streaming data will be divided into batches + | is time, in milliseconds, spent waiting in + | Kafka consumer poll if data is not available + | + """.stripMargin) + System.exit(1) + } + + StreamingExamples.setStreamingLogLevels() + + val Array(brokers, topics, groupId, offsetReset, batchInterval, pollTimeout) = args + + // Create context with 2 second batch interval + val sparkConf = new SparkConf().setAppName("v09DirectKafkaWordCount") + val ssc = new StreamingContext(sparkConf, Seconds(batchInterval.toInt)) + + // Create direct kafka stream with brokers and topics + val topicsSet = topics.split(",").toSet + val kafkaParams = Map[String, String]( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers, + ConsumerConfig.GROUP_ID_CONFIG -> groupId, + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> offsetReset, + ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false", + "spark.kafka.poll.time" -> pollTimeout) + val messages = KafkaUtils.createDirectStream[String, String](ssc, kafkaParams, topicsSet) + + // Get the lines, split them into words, count the words and print + val lines = messages.map(_._2) + val words = lines.flatMap(_.split(" ")) + val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _) + wordCounts.print() + + // Start the computation + ssc.start() + ssc.awaitTermination() + } +} +// scalastyle:on println diff --git a/external/kafka-assembly/pom.xml b/external/kafka-assembly/pom.xml index a9ed39ef8c9a..65f65b91182b 100644 --- a/external/kafka-assembly/pom.xml +++ b/external/kafka-assembly/pom.xml @@ -41,6 +41,11 @@ spark-streaming-kafka_${scala.binary.version} ${project.version} + + org.apache.spark + spark-streaming-kafka-v09_${scala.binary.version} + ${project.version} + org.apache.spark spark-streaming_${scala.binary.version} diff --git a/external/kafka-v09/pom.xml b/external/kafka-v09/pom.xml new file mode 100644 index 000000000000..8392b12459af --- /dev/null +++ b/external/kafka-v09/pom.xml @@ -0,0 
+1,144 @@ + + + + + 4.0.0 + + + org.apache.spark + spark-parent_2.10 + 1.6.0-SNAPSHOT + ../../pom.xml + + + org.apache.spark + spark-streaming-kafka-v09_2.10 + + streaming-kafka-v09 + + jar + Spark Project External Kafka v09 + http://spark.apache.org/ + + + + com.101tec + zkclient + 0.6 + + + com.yammer.metrics + metrics-core + 2.2.0 + + + + org.eclipse.jetty + jetty-util + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + provided + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.kafka + kafka_${scala.binary.version} + 0.9.0.0 + + + com.sun.jmx + jmxri + + + com.sun.jdmk + jmxtools + + + net.sf.jopt-simple + jopt-simple + + + org.slf4j + slf4j-simple + + + org.apache.zookeeper + zookeeper + + + + + org.apache.kafka + kafka-clients + 0.9.0.0 + + + com.sun.jmx + jmxri + + + com.sun.jdmk + jmxtools + + + net.sf.jopt-simple + jopt-simple + + + org.slf4j + slf4j-simple + + + org.apache.zookeeper + zookeeper + + + + + net.sf.jopt-simple + jopt-simple + 3.2 + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.apache.spark + spark-test-tags_${scala.binary.version} + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/Broker.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/Broker.scala new file mode 100644 index 000000000000..0826cd027e50 --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/Broker.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import org.apache.spark.annotation.Experimental + +/** + * Represents the host and port info for a Kafka broker. + * Differs from the Kafka project's internal kafka.cluster.Broker, which contains a server ID. + */ +final class Broker private( + /** Broker's hostname */ + val host: String, + /** Broker's port */ + val port: Int) extends Serializable { + override def equals(obj: Any): Boolean = obj match { + case that: Broker => + this.host == that.host && + this.port == that.port + case _ => false + } + + override def hashCode: Int = { + 41 * (41 + host.hashCode) + port + } + + override def toString(): String = { + s"Broker($host, $port)" + } +} + +/** + * :: Experimental :: + * Companion object that provides methods to create instances of [[Broker]]. 
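+ *
+ * A minimal usage sketch (the host name below is illustrative, not taken from this patch):
+ * {{{
+ *   val broker = Broker("broker1-host", 9092)
+ *   val Broker(host, port) = broker  // pattern match via the unapply extractor
+ * }}}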
+ */ +@Experimental +object Broker { + def create(host: String, port: Int): Broker = + new Broker(host, port) + + def apply(host: String, port: Int): Broker = + new Broker(host, port) + + def unapply(broker: Broker): Option[(String, Int)] = { + if (broker == null) { + None + } else { + Some((broker.host, broker.port)) + } + } +} diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/DirectKafkaInputDStream.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/DirectKafkaInputDStream.scala new file mode 100644 index 000000000000..448baf867795 --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/DirectKafkaInputDStream.scala @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import kafka.common.TopicAndPartition +import org.apache.kafka.clients.consumer.ConsumerRecord +import org.apache.kafka.common.TopicPartition +import org.apache.spark.Logging +import org.apache.spark.streaming.dstream._ +import org.apache.spark.streaming.kafka.v09.KafkaCluster.LeaderOffset +import org.apache.spark.streaming.scheduler.rate.RateEstimator +import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo} +import org.apache.spark.streaming.{StreamingContext, Time} + +import scala.collection.mutable +import scala.reflect.ClassTag + +/** + * A stream of {@link org.apache.spark.streaming.kafka.KafkaRDD} where + * each given Kafka topic/partition corresponds to an RDD partition. + * The spark configuration spark.streaming.kafka.maxRatePerPartition gives the maximum number + * of messages + * per second that each '''partition''' will accept. + * Starting offsets are specified in advance, + * and this DStream is not responsible for committing offsets, + * so that you can control exactly-once semantics. + * @param kafkaParams Kafka + * configuration parameters. + * Requires "metadata.broker.list" or "bootstrap.servers" to be set + * with Kafka broker(s), + * NOT zookeeper servers, specified in host1:port1,host2:port2 form. + * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive) + * starting point of the stream + */ +private[streaming] +class DirectKafkaInputDStream[ + K: ClassTag, + V: ClassTag, + R: ClassTag]( + @transient ssc_ : StreamingContext, + val kafkaParams: Map[String, String], + @transient val fromOffsets: Map[TopicPartition, Long], + messageHandler: ConsumerRecord[K, V] => R + ) extends InputDStream[R](ssc_) with Logging { + + val maxRetries = context.sparkContext.getConf.getInt( + "spark.streaming.kafka.maxRetries", 1) + + // Keep this consistent with how other streams are named (e.g. 
"Flume polling stream [2]") + private[streaming] override def name: String = s"Kafka 0.9 direct stream [$id]" + + protected[streaming] override val checkpointData = + new DirectKafkaInputDStreamCheckpointData + + + /** + * Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker. + */ + override protected[streaming] val rateController: Option[RateController] = { + if (RateController.isBackPressureEnabled(ssc.conf)) { + Some(new DirectKafkaRateController(id, + RateEstimator.create(ssc.conf, ssc_.graph.batchDuration))) + } else { + None + } + } + + protected var kafkaCluster = new KafkaCluster[K, V](kafkaParams) + + private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt( + "spark.streaming.kafka.maxRatePerPartition", 0) + + protected def maxMessagesPerPartition: Option[Long] = { + val estimatedRateLimit = rateController.map(_.getLatestRate().toInt) + val numPartitions = currentOffsets.keys.size + + val effectiveRateLimitPerPartition = estimatedRateLimit + .filter(_ > 0) + .map { limit => + if (maxRateLimitPerPartition > 0) { + Math.min(maxRateLimitPerPartition, (limit / numPartitions)) + } else { + limit / numPartitions + } + }.getOrElse(maxRateLimitPerPartition) + + if (effectiveRateLimitPerPartition > 0) { + val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000 + Some((secsPerBatch * effectiveRateLimitPerPartition).toLong) + } else { + None + } + } + + // temp fix for serialization issue of TopicPartition + protected var serCurrentOffsets = fromOffsets.map { case(tp, l) => + (tp.topic, tp.partition, l); + } + + @transient + protected var currentOffsets: Map[TopicPartition, Long] = null + + protected final def latestLeaderOffsets(): Map[TopicPartition, LeaderOffset] = { + kafkaCluster.getLatestOffsetsWithLeaders(currentOffsets.keySet) + } + + // limits the maximum number of messages per partition + protected def clamp( + leaderOffsets: Map[TopicPartition, LeaderOffset] + ): Map[TopicPartition, LeaderOffset] = { + maxMessagesPerPartition.map { mmp => + leaderOffsets.map { case (tp, lo) => + tp -> lo.copy(offset = Math.min(currentOffsets(tp) + mmp, lo.offset)) + } + }.getOrElse(leaderOffsets) + } + + override def compute(validTime: Time): Option[KafkaRDD[K, V, R]] = { + currentOffsets = serCurrentOffsets.map { i => new TopicPartition(i._1, i._2) -> i._3 }.toMap + val untilOffsets = clamp(latestLeaderOffsets()) + val rdd = KafkaRDD[K, V, R]( + context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler) + + // Report the record number and metadata of this batch interval to InputInfoTracker. + val offsetRanges = currentOffsets.map { case (tp, fo) => + val uo = untilOffsets(tp) + OffsetRange(tp.topic, tp.partition, fo, uo.offset) + } + val description = offsetRanges.filter { offsetRange => + // Don't display empty ranges. 
+ offsetRange.fromOffset != offsetRange.untilOffset + }.map { offsetRange => + s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" + + s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}" + }.mkString("\n") + // Copy offsetRanges to immutable.List to prevent from being modified by the user + val metadata = Map( + "offsets" -> offsetRanges.toList, + StreamInputInfo.METADATA_KEY_DESCRIPTION -> description) + val inputInfo = StreamInputInfo(id, rdd.count, metadata) + ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo) + + serCurrentOffsets = untilOffsets.map { kv => (kv._1.topic, kv._1.partition, kv._2.offset) } + Some(rdd) + } + + override def start(): Unit = { + } + + def stop(): Unit = { + if (kafkaCluster != null) { + kafkaCluster.close() + kafkaCluster = null + } + } + + private[streaming] + class DirectKafkaInputDStreamCheckpointData extends DStreamCheckpointData(this) { + def batchForTime: mutable.HashMap[Time, Array[(String, Int, Long, Long)]] = { + data.asInstanceOf[mutable.HashMap[Time, Array[OffsetRange.OffsetRangeTuple]]] + } + + override def update(time: Time) { + batchForTime.clear() + generatedRDDs.foreach { kv => + val a = kv._2.asInstanceOf[KafkaRDD[K, V, R]].offsetRanges.map(_.toTuple).toArray + batchForTime += kv._1 -> a + } + } + + override def cleanup(time: Time) {} + + override def restore() { + // this is assuming that the topics don't change during execution, which is true currently + + batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) => + logInfo(s"Restoring KafkaRDD for time $t ${b.mkString("[", ", ", "]")}") + generatedRDDs += t -> new KafkaRDD[K, V, R]( + context.sparkContext, kafkaParams, b.map(OffsetRange(_)), messageHandler) + } + } + } + + /** + * A RateController to retrieve the rate from RateEstimator. + */ + private[streaming] class DirectKafkaRateController(id: Int, estimator: RateEstimator) + extends RateController(id, estimator) { + override def publish(rate: Long): Unit = () + } + +} diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaCluster.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaCluster.scala new file mode 100644 index 000000000000..c4ff717329d6 --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaCluster.scala @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09 + +import java.util +import java.util.{Collections} + +import org.apache.kafka.clients.consumer.{OffsetResetStrategy, KafkaConsumer, OffsetAndMetadata} +import org.apache.kafka.common.{PartitionInfo, TopicPartition} +import org.apache.spark.SparkException + +import scala.collection.JavaConverters._ +import scala.reflect._ + +/** + * @param kafkaParams Kafka + * configuration parameters. + * Requires "bootstrap.servers" to be set with Kafka broker(s), + * NOT zookeeper servers, specified in host1:port1,host2:port2 form + */ +private[spark] +class KafkaCluster[K: ClassTag, V: ClassTag](val kafkaParams: Map[String, String]) + extends Serializable { + + import KafkaCluster.LeaderOffset + + @transient + protected var consumer: KafkaConsumer[K, V] = null + + def getLatestOffsets(topicPartitions: Set[TopicPartition]): Map[TopicPartition, Long] = { + getOffsetsWithoutLeaders(topicPartitions, OffsetResetStrategy.LATEST) + } + + def getEarliestOffsets(topicPartitions: Set[TopicPartition]): Map[TopicPartition, Long] = { + getOffsetsWithoutLeaders(topicPartitions, OffsetResetStrategy.EARLIEST) + } + + def getPartitions(topics: Set[String]): Set[TopicPartition] = { + withConsumer { consumer => { + val partInfo = topics.flatMap { + topic => Option(consumer.partitionsFor(topic)) match { + case None => throw new SparkException("Topic doesn't exist " + topic) + case Some(partInfoList) => partInfoList.asScala.toList + } + } + val topicPartitions: Set[TopicPartition] = partInfo.map { partition => + new TopicPartition(partition.topic(), partition.partition()) + } + topicPartitions + } + }.asInstanceOf[Set[TopicPartition]] + } + + def getPartitionsLeader(topics: Set[String]): Map[TopicPartition, String] = { + getPartitionInfo(topics).map { pi => + new TopicPartition(pi.topic, pi.partition) -> pi.leader.host + }.toMap + } + + def getPartitionInfo(topics: Set[String]): Set[PartitionInfo] = { + withConsumer { consumer => + topics.flatMap { topic => + Option(consumer.partitionsFor(topic)) match { + case None => + throw new SparkException("Topic doesn't exist " + topic) + case Some(piList) => piList.asScala.toList + } + } + }.asInstanceOf[Set[PartitionInfo]] + } + + def setConsumerOffsets(offsets: Map[TopicPartition, Long]): Unit = { + val topicPartOffsets = new util.HashMap[TopicPartition, OffsetAndMetadata]() + val topicPartition = offsets.map(tpl => tpl._1).toSeq + + withConsumer(consumer => { + consumer.assign(Collections.emptyList[TopicPartition]) + consumer.assign(topicPartition.asJava) + + for ((topicAndPart, offset) <- offsets) { + val topicPartition = topicAndPart + val offsetAndMetadata = new OffsetAndMetadata(offset) + topicPartOffsets.put(topicPartition, offsetAndMetadata) + } + + consumer.commitSync(topicPartOffsets) + }) + } + + def getCommittedOffsets(topicPartitions: Set[TopicPartition]): + Map[TopicPartition, Long] = { + withConsumer(consumer => { + consumer.assign(topicPartitions.toList.asJava) + topicPartitions.map( tp => { + val offsetAndMetadata = consumer.committed(tp) + Option(offsetAndMetadata) match { + case None => throw new SparkException(s"Topic $tp hasn't committed offsets") + case Some(om) => tp -> om.offset() + } + } + ).toMap + }).asInstanceOf[Map[TopicPartition, Long]] + } + + def getLatestOffsetsWithLeaders( + topicPartitions: Set[TopicPartition] + ): Map[TopicPartition, LeaderOffset] = { + getOffsets(topicPartitions, OffsetResetStrategy.LATEST) + } + + private def getOffsetsWithoutLeaders( + topicPartitions: Set[TopicPartition], 
+ offsetResetType: OffsetResetStrategy + ): Map[TopicPartition, Long] = { + getOffsets(topicPartitions, offsetResetType) + .map { t => (t._1, t._2.offset) } + } + + def getOffsets(topicPartitions: Set[TopicPartition], resetStrategy: OffsetResetStrategy): + Map[TopicPartition, LeaderOffset] = { + val topics = topicPartitions.map { _.topic } + withConsumer{ consumer => + val tplMap = topics.flatMap { topic => + Option(consumer.partitionsFor(topic)) match { + case None => + throw new SparkException("Topic doesnt exist " + topic) + case Some(piList) => piList.asScala.toList + } + }.map { pi => + new TopicPartition(pi.topic, pi.partition) -> pi.leader.host + }.toMap + + consumer.assign(topicPartitions.toList.asJava) + resetStrategy match { + case OffsetResetStrategy.EARLIEST => consumer.seekToBeginning(topicPartitions.toList: _*) + case OffsetResetStrategy.LATEST => consumer.seekToEnd(topicPartitions.toList: _*) + case _ => throw new SparkException("Unknown OffsetResetStrategy " + resetStrategy) + } + topicPartitions.map { tp => + val pos = consumer.position(tp) + tp -> new LeaderOffset(tplMap(tp), pos) + }.toMap + + }.asInstanceOf[Map[TopicPartition, LeaderOffset]] + } + + private def withConsumer(fn: KafkaConsumer[K, V] => Any): Any = { + if (consumer == null) { + consumer = new KafkaConsumer[K, V](kafkaParams.asInstanceOf[Map[String, Object]].asJava) + } + fn(consumer) + } + + def close(): Unit = { + if (consumer != null) { + consumer.close() + consumer = null + } + } + +} + +private[spark] +object KafkaCluster { + + private[spark] + case class LeaderOffset(host: String, offset: Long) +} diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaRDD.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaRDD.scala new file mode 100644 index 000000000000..debbc9c3352e --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaRDD.scala @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import java.util.{ Collections, Properties } + +import org.apache.kafka.clients.consumer.{ ConsumerRecord, KafkaConsumer } +import org.apache.kafka.common.TopicPartition +import org.apache.spark.partial.{ BoundedDouble, PartialResult } +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.kafka.v09.KafkaCluster.{LeaderOffset} +import org.apache.spark.util.NextIterator +import org.apache.spark.{ Logging, Partition, SparkContext, TaskContext } + +import scala.collection.mutable.ArrayBuffer +import scala.reflect.ClassTag +import scala.collection.JavaConverters._ + +/** + * A batch-oriented interface for consuming from Kafka. 
+ * Starting and ending offsets are specified in advance, + * so that you can control exactly-once semantics. + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" to be set + * with Kafka broker(s) specified in host1:port1,host2:port2 form. + * @param offsetRanges offset ranges that define the Kafka data belonging to this RDD + */ +private[kafka] +class KafkaRDD[K: ClassTag, V: ClassTag, R: ClassTag] private[spark] ( + sc: SparkContext, + kafkaParams: Map[String, String], + val offsetRanges: Array[OffsetRange], + messageHandler: ConsumerRecord[K, V] => R + ) extends RDD[R](sc, Nil) with Logging with HasOffsetRanges { + + private val KAFKA_DEFAULT_POLL_TIME: String = "0" + private val pollTime = kafkaParams.get("spark.kafka.poll.time") + .getOrElse(KAFKA_DEFAULT_POLL_TIME).toInt + private val cluster = new KafkaCluster[K, V](kafkaParams) + + override def getPartitions: Array[Partition] = { + offsetRanges.zipWithIndex.map { + case (o, i) => + new KafkaRDDPartition(i, o.topic, o.partition, o.fromOffset, o.untilOffset, o.leaderHost) + }.toArray + } + + override def count(): Long = offsetRanges.map(_.count).sum + + override def countApprox( + timeout: Long, + confidence: Double = 0.95): PartialResult[BoundedDouble] = { + val c = count + new PartialResult(new BoundedDouble(c, 1.0, c, c), true) + } + + override def isEmpty(): Boolean = count == 0L + + override def take(num: Int): Array[R] = { + val nonEmptyPartitions = this.partitions + .map(_.asInstanceOf[KafkaRDDPartition]) + .filter(_.count > 0) + + if (num < 1 || nonEmptyPartitions.size < 1) { + return new Array[R](0) + } + + // Determine in advance how many messages need to be taken from each partition + val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => + val remain = num - result.values.sum + if (remain > 0) { + val taken = Math.min(remain, part.count) + result + (part.index -> taken.toInt) + } else { + result + } + } + + val buf = new ArrayBuffer[R] + val res = context.runJob( + this, + (tc: TaskContext, it: Iterator[R]) => it.take(parts(tc.partitionId)).toArray, + parts.keys.toArray) + res.foreach(buf ++= _) + buf.toArray + } + + override def getPreferredLocations(thePart: Partition): Seq[String] = { + val part = thePart.asInstanceOf[KafkaRDDPartition] + // TODO is additional hostname resolution necessary here + if (part.host != null ) { + Seq(part.host) + } + else { + Seq() + } + } + + private def errBeginAfterEnd(part: KafkaRDDPartition): String = + s"Beginning offset ${part.fromOffset} is after the ending offset ${part.untilOffset} " + + s"for topic ${part.topic} partition ${part.partition}. " + + "You either provided an invalid fromOffset, or the Kafka topic has been damaged" + + private def errRanOutBeforeEnd(part: KafkaRDDPartition): String = + s"Ran out of messages before reaching ending offset ${part.untilOffset} " + + s"for topic ${part.topic} partition ${part.partition} start ${part.fromOffset}." + + " This should not happen, and indicates that messages may have been lost" + + private def errOvershotEnd(itemOffset: Long, part: KafkaRDDPartition): String = + s"Got ${itemOffset} > ending offset ${part.untilOffset} " + + s"for topic ${part.topic} partition ${part.partition} start ${part.fromOffset}." 
+ + " This should not happen, and indicates a message may have been skipped" + + override def compute(thePart: Partition, context: TaskContext): Iterator[R] = { + val part = thePart.asInstanceOf[KafkaRDDPartition] + assert(part.fromOffset <= part.untilOffset, errBeginAfterEnd(part)) + if (part.fromOffset == part.untilOffset) { + log.info(s"Beginning offset ${part.fromOffset} is the same as ending offset " + + s"skipping ${part.topic} ${part.partition}") + Iterator.empty + } else { + new KafkaRDDIterator(part, context) + } + } + + private class KafkaRDDIterator( + part: KafkaRDDPartition, + context: TaskContext) extends NextIterator[R] { + + context.addTaskCompletionListener { context => closeIfNeeded() } + + log.info(s"Computing topic ${part.topic}, partition ${part.partition} " + + s"offsets ${part.fromOffset} -> ${part.untilOffset}") + + val props = new Properties() + kafkaParams.foreach(param => props.put(param._1, param._2)) + + val consumer = new KafkaConsumer[K, V](props) + val tp = new TopicPartition(part.topic, part.partition) + consumer.assign(Collections.singletonList[TopicPartition](tp)) + + var requestOffset = part.fromOffset + var iter: Iterator[ConsumerRecord[K, V]] = null + consumer.seek(tp, requestOffset) + + override def close(): Unit = { + if (consumer != null) { + consumer.close() + } + } + + private def fetchBatch: Iterator[ConsumerRecord[K, V]] = { + consumer.seek(new TopicPartition(part.topic, part.partition), requestOffset) + val recs = consumer.poll(pollTime) + recs.records(new TopicPartition(part.topic, part.partition)).iterator().asScala + } + + override def getNext(): R = { + if ( requestOffset == part.untilOffset ) { + finished = true + null.asInstanceOf[R] + } + + if (iter == null || !iter.hasNext) { + iter = fetchBatch + } + + if (!iter.hasNext) { + if ( requestOffset < part.untilOffset ) { + return getNext() + } + assert(requestOffset == part.untilOffset, errRanOutBeforeEnd(part)) + finished = true + null.asInstanceOf[R] + } else { + val item: ConsumerRecord[K, V] = iter.next() + if (item.offset >= part.untilOffset) { + assert(item.offset == part.untilOffset, errOvershotEnd(item.offset, part)) + finished = true + null.asInstanceOf[R] + } else { + requestOffset = item.offset() + 1 + messageHandler(item) + } + } + } + } + +} + +private[kafka] +object KafkaRDD { + + /** + * @param kafkaParams Kafka + * configuration parameters. + * Requires "bootstrap.servers" to be set with Kafka broker(s), + * NOT zookeeper servers, specified in host1:port1,host2:port2 form. 
+ * @param fromOffsets per-topic/partition Kafka offsets defining the (inclusive) + * starting point of the batch + * @param untilOffsets per-topic/partition Kafka offsets defining the (exclusive) + * ending point of the batch + */ + def apply[K: ClassTag, V: ClassTag, R: ClassTag]( + sc: SparkContext, + kafkaParams: Map[String, String], + fromOffsets: Map[TopicPartition, Long], + untilOffsets: Map[TopicPartition, LeaderOffset], + messageHandler: ConsumerRecord[K, V] => R): KafkaRDD[K, V, R] = { + val offsetRanges = fromOffsets.map { + case (tp, fo) => + val uo = untilOffsets(tp) + OffsetRange(tp.topic, tp.partition, fo, uo.offset, uo.host) + }.toArray + + new KafkaRDD[K, V, R](sc, kafkaParams, offsetRanges, messageHandler) + } +} diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaRDDPartition.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaRDDPartition.scala new file mode 100644 index 000000000000..b3889aae9063 --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaRDDPartition.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import org.apache.spark.Partition + +/** @param topic kafka topic name + * @param partition kafka partition id + * @param fromOffset inclusive starting offset + * @param untilOffset exclusive ending offset + */ +private[kafka] +class KafkaRDDPartition( + val index: Int, + val topic: String, + val partition: Int, + val fromOffset: Long, + val untilOffset: Long, + val host: String + ) extends Partition { + /** Number of messages this partition refers to */ + def count(): Long = untilOffset - fromOffset +} diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaTestUtils.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaTestUtils.scala new file mode 100644 index 000000000000..c5baaeca9e18 --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaTestUtils.scala @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import java.io.File +import java.lang.{Integer => JInt} +import java.net.InetSocketAddress +import java.util.concurrent.TimeoutException +import java.util.{Map => JMap, Properties} + +import kafka.admin.AdminUtils +import kafka.api.Request +import kafka.producer.{KeyedMessage, Producer, ProducerConfig} +import kafka.serializer.StringEncoder +import kafka.server.{KafkaConfig, KafkaServer} +import kafka.utils.ZkUtils +import org.I0Itec.zkclient.ZkClient +import org.apache.kafka.common.security.JaasUtils +import org.apache.spark.streaming.Time +import org.apache.spark.util.Utils +import org.apache.spark.{Logging, SparkConf} +import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} + +import scala.annotation.tailrec +import scala.collection.JavaConverters._ +import scala.language.postfixOps +import scala.util.control.NonFatal + +/** + * This is a helper class for Kafka test suites. This has the functionality to set up + * and tear down local Kafka servers, and to push data using Kafka producers. + * + * The reason to put Kafka test utility class in src is to test Python related Kafka APIs. + */ +private[kafka] class KafkaTestUtils extends Logging { + + // Zookeeper related configurations + private val zkHost = "localhost" + private var zkPort: Int = 0 + private val zkConnectionTimeout = 60000 + private val zkSessionTimeout = 6000 + + private var zookeeper: EmbeddedZookeeper = _ + + private var zkClient: ZkClient = _ + private var zkUtils: ZkUtils = _ + + // Kafka broker related configurations + private val brokerHost = "localhost" + private var brokerPort = 9092 + private var brokerConf: KafkaConfig = _ + + // Kafka broker server + private var server: KafkaServer = _ + + // Kafka producer + private var producer: Producer[String, String] = _ + + // Flag to test whether the system is correctly started + private var zkReady = false + private var brokerReady = false + + def zkAddress: String = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper address") + s"$zkHost:$zkPort" + } + + def brokerAddress: String = { + assert(brokerReady, "Kafka not setup yet or already torn down, cannot get broker address") + s"$brokerHost:$brokerPort" + } + + def zookeeperClient: ZkClient = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper client") + Option(zkClient).getOrElse( + throw new IllegalStateException("Zookeeper client is not yet initialized")) + } + + // Set up the Embedded Zookeeper server and get the proper Zookeeper port + private def setupEmbeddedZookeeper(): Unit = { + // Zookeeper server startup + zookeeper = new EmbeddedZookeeper(s"$zkHost:$zkPort") + // Get the actual zookeeper binding port + zkPort = zookeeper.actualPort + zkClient = ZkUtils.createZkClient(s"$zkHost:$zkPort", zkSessionTimeout, zkConnectionTimeout) + zkUtils = ZkUtils(zkClient, JaasUtils.isZkSecurityEnabled()) + zkReady = true + } + + // Set up the Embedded Kafka server + private def setupEmbeddedKafkaServer(): Unit = { + assert(zkReady, "Zookeeper should be set up 
beforehand") + + // Kafka broker startup + Utils.startServiceOnPort(brokerPort, port => { + brokerPort = port + brokerConf = new KafkaConfig(brokerConfiguration) + server = new KafkaServer(brokerConf) + server.startup() + (server, port) + }, new SparkConf(), "KafkaBroker") + + brokerReady = true + } + + /** setup the whole embedded servers, including Zookeeper and Kafka brokers */ + def setup(): Unit = { + setupEmbeddedZookeeper() + setupEmbeddedKafkaServer() + } + + /** Teardown the whole servers, including Kafka broker and Zookeeper */ + def teardown(): Unit = { + brokerReady = false + zkReady = false + + if (producer != null) { + producer.close() + producer = null + } + + if (server != null) { + server.shutdown() + server = null + } + + brokerConf.logDirs.foreach { f => Utils.deleteRecursively(new File(f)) } + + if (zkClient != null) { + zkClient.close() + zkClient = null + } + + if (zkUtils != null) { + zkUtils.close() + zkUtils = null + } + + if (zookeeper != null) { + zookeeper.shutdown() + zookeeper = null + } + } + + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String): Unit = { + AdminUtils.createTopic(zkUtils, topic, 1, 1) + // wait until metadata is propagated + waitUntilMetadataIsPropagated(topic, 0) + } + + /** Java-friendly function for sending messages to the Kafka broker */ + def sendMessages(topic: String, messageToFreq: JMap[String, JInt]): Unit = { + sendMessages(topic, Map(messageToFreq.asScala.mapValues(_.intValue()).toSeq: _*)) + } + + /** Send the messages to the Kafka broker */ + def sendMessages(topic: String, messageToFreq: Map[String, Int]): Unit = { + val messages = messageToFreq.flatMap { case (s, freq) => Seq.fill(freq)(s) }.toArray + sendMessages(topic, messages) + } + + /** Send the array of messages to the Kafka broker */ + def sendMessages(topic: String, messages: Array[String]): Unit = { + producer = new Producer[String, String](new ProducerConfig(producerConfiguration)) + producer.send(messages.map { + new KeyedMessage[String, String](topic, _) + }: _*) + producer.close() + producer = null + } + + private def brokerConfiguration: Properties = { + val props = new Properties() + props.put("broker.id", "0") + props.put("host.name", "localhost") + props.put("port", brokerPort.toString) + props.put("log.dir", Utils.createTempDir().getAbsolutePath) + props.put("zookeeper.connect", zkAddress) + props.put("log.flush.interval.messages", "1") + props.put("replica.socket.timeout.ms", "1500") + props + } + + private def producerConfiguration: Properties = { + val props = new Properties() + props.put("metadata.broker.list", brokerAddress) + props.put("serializer.class", classOf[StringEncoder].getName) + // wait for all in-sync replicas to ack sends + props.put("request.required.acks", "-1") + props + } + + // A simplified version of scalatest eventually, rewritten here to avoid adding extra test + // dependency + def eventually[T](timeout: Time, interval: Time)(func: => T): T = { + def makeAttempt(): Either[Throwable, T] = { + try { + Right(func) + } catch { + case e if NonFatal(e) => Left(e) + } + } + + val startTime = System.currentTimeMillis() + @tailrec + def tryAgain(attempt: Int): T = { + makeAttempt() match { + case Right(result) => result + case Left(e) => + val duration = System.currentTimeMillis() - startTime + if (duration < timeout.milliseconds) { + Thread.sleep(interval.milliseconds) + } else { + throw new TimeoutException(e.getMessage) + } + + tryAgain(attempt + 1) + } + } + + tryAgain(1) + } 
+ + private def waitUntilMetadataIsPropagated(topic: String, partition: Int): Unit = { + def isPropagated = server.apis.metadataCache.getPartitionInfo(topic, partition) match { + case Some(partitionState) => + val leaderAndInSyncReplicas = partitionState.leaderIsrAndControllerEpoch.leaderAndIsr + + zkUtils.getLeaderForPartition(topic, partition).isDefined && + Request.isValidBrokerId(leaderAndInSyncReplicas.leader) && + leaderAndInSyncReplicas.isr.size >= 1 + + case _ => + false + } + eventually(Time(10000), Time(100)) { + assert(isPropagated, s"Partition [$topic, $partition] metadata not propagated after timeout") + } + } + + private class EmbeddedZookeeper(val zkConnect: String) { + val snapshotDir = Utils.createTempDir() + val logDir = Utils.createTempDir() + + val zookeeper = new ZooKeeperServer(snapshotDir, logDir, 500) + val (ip, port) = { + val splits = zkConnect.split(":") + (splits(0), splits(1).toInt) + } + val factory = new NIOServerCnxnFactory() + factory.configure(new InetSocketAddress(ip, port), 16) + factory.startup(zookeeper) + + val actualPort = factory.getLocalPort + + def shutdown() { + factory.shutdown() + Utils.deleteRecursively(snapshotDir) + Utils.deleteRecursively(logDir) + } + } + +} + diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaUtils.scala b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaUtils.scala new file mode 100644 index 000000000000..00bafa2495b8 --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/KafkaUtils.scala @@ -0,0 +1,590 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09 + +import java.io.OutputStream +import java.lang.{Integer => JInt, Long => JLong} + +import java.util.{List => JList} +import java.util.{Map => JMap} +import java.util.{Set => JSet} + +import org.apache.kafka.clients.CommonClientConfigs +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.config.SslConfigs + +import scala.reflect.ClassTag + +import com.google.common.base.Charsets.UTF_8 +import kafka.common.TopicAndPartition +import kafka.serializer.Decoder +import net.razorvine.pickle.{Opcodes, Pickler, IObjectPickler} +import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord} +import org.apache.spark.api.java.function.{Function => JFunction} +import org.apache.spark.{SSLOptions, SparkContext, SparkException} + +import scala.collection.JavaConverters._ +import scala.reflect._ +import org.apache.spark.api.java.{JavaSparkContext, JavaPairRDD, JavaRDD} +import org.apache.spark.api.python.SerDeUtil +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.api.java._ +import org.apache.spark.streaming.dstream.{DStream, InputDStream} + +object KafkaUtils { + + def addSSLOptions( + kafkaParams: Map[String, String], + sc: SparkContext + ): Map[String, String] = { + + val sparkConf = sc.getConf + val defaultSSLOptions = SSLOptions.parse(sparkConf, "spark.ssl", None) + val kafkaSSLOptions = SSLOptions.parse(sparkConf, "spark.ssl.kafka", Some(defaultSSLOptions)) + + if (kafkaSSLOptions.enabled) { + val sslParams = Map[String, Option[_]]( + CommonClientConfigs.SECURITY_PROTOCOL_CONFIG -> Some("SSL"), + SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG -> kafkaSSLOptions.trustStore, + SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG -> kafkaSSLOptions.trustStorePassword, + SslConfigs.SSL_KEYSTORE_LOCATION_CONFIG -> kafkaSSLOptions.keyStore, + SslConfigs.SSL_KEYSTORE_PASSWORD_CONFIG -> kafkaSSLOptions.keyStorePassword, + SslConfigs.SSL_KEY_PASSWORD_CONFIG -> kafkaSSLOptions.keyPassword + ) + kafkaParams ++ sslParams.filter(_._2.isDefined).mapValues(_.get.toString) + } else { + kafkaParams + } + + } + + /** Make sure offsets are available in kafka, or throw an exception */ + private def checkOffsets( + kafkaParams: Map[String, String], + offsetRanges: Array[OffsetRange]): Array[OffsetRange] = { + val kc = new KafkaCluster(kafkaParams) + try { + val topics = offsetRanges.map(_.topicPartition).toSet + val low = kc.getEarliestOffsets(topics) + val high = kc.getLatestOffsetsWithLeaders(topics) + + val result = offsetRanges.filterNot { o => + low(o.topicPartition()) <= o.fromOffset && + o.untilOffset <= high(o.topicPartition()).offset + } + + if (!result.isEmpty) { + throw new SparkException("Offsets not available in Kafka: " + result.mkString(",")) + } + + offsetRanges.map { o => + OffsetRange(o.topic, o.partition, o.fromOffset, o.untilOffset, + high(o.topicPartition()).host) + } + } + finally { + kc.close() + } + } + + + def createRDD[K: ClassTag, V: ClassTag]( + sc: SparkContext, + kafkaParams: Map[String, String], + offsetRanges: Array[OffsetRange] + ): RDD[(K, V)] = sc.withScope { + val messageHandler = (cr: ConsumerRecord[K, V]) => (cr.key, cr.value) + new KafkaRDD[K, V, (K, V)]( + sc, + addSSLOptions(kafkaParams, sc), + checkOffsets(kafkaParams, offsetRanges), + messageHandler + ) + } + + /** + * Create a RDD from Kafka using offset ranges for each topic and partition. 
This allows you + * specify the Kafka leader to connect to (to optimize fetching) and access the message as well + * as the metadata. + * + * @param sc SparkContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. + * @param offsetRanges Each OffsetRange in the batch corresponds to a + * range of offsets for a given Kafka topic/partition + * @param messageHandler Function for translating each message and metadata into the desired type + */ + def createRDD[K: ClassTag, V: ClassTag, R: ClassTag]( + sc: SparkContext, + kafkaParams: Map[String, String], + offsetRanges: Array[OffsetRange], + messageHandler: ConsumerRecord[K, V] => R + ): RDD[R] = sc.withScope { + val kc = new KafkaCluster[K, V](addSSLOptions(kafkaParams, sc)) + val cleanedHandler = sc.clean(messageHandler) + new KafkaRDD[K, V, R](sc, + addSSLOptions(kafkaParams, sc), + checkOffsets(kafkaParams, offsetRanges), + cleanedHandler + ) + } + + /** + * Create a RDD from Kafka using offset ranges for each topic and partition. + * + * @param jsc JavaSparkContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * specified in host1:port1,host2:port2 form. + * @param offsetRanges Each OffsetRange in the batch corresponds to a + * range of offsets for a given Kafka topic/partition + */ + def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V]]( + jsc: JavaSparkContext, + keyClass: Class[K], + valueClass: Class[V], + kafkaParams: JMap[String, String], + offsetRanges: Array[OffsetRange] + ): JavaPairRDD[K, V] = jsc.sc.withScope { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + new JavaPairRDD(createRDD[K, V]( + jsc.sc, Map(kafkaParams.asScala.toSeq: _*), offsetRanges)) + } + + /** + * Create a RDD from Kafka using offset ranges for each topic and partition. This allows you + * specify the Kafka leader to connect to (to optimize fetching) and access the message as well + * as the metadata. + * + * @param jsc JavaSparkContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * specified in host1:port1,host2:port2 form. + * @param offsetRanges Each OffsetRange in the batch corresponds to a + * range of offsets for a given Kafka topic/partition + * @param messageHandler Function for translating each message and metadata into the desired type + */ + def createRDD[K, V, KD <: Decoder[K], VD <: Decoder[V], R]( + jsc: JavaSparkContext, + keyClass: Class[K], + valueClass: Class[V], + recordClass: Class[R], + kafkaParams: JMap[String, String], + offsetRanges: Array[OffsetRange], + messageHandler: JFunction[ConsumerRecord[K, V], R] + ): JavaRDD[R] = jsc.sc.withScope { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + implicit val recordCmt: ClassTag[R] = ClassTag(recordClass) + createRDD[K, V, R]( + jsc.sc, Map(kafkaParams.asScala.toSeq: _*), offsetRanges, messageHandler.call _) + } + + /** + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). + * + * Points to note: + * - No receivers: This stream does not use any receiver. 
It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.v09.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. + * + * @param ssc StreamingContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers) specified in + * host1:port1,host2:port2 form. + * @param fromOffsets Per-topic/partition Kafka offsets defining the (inclusive) + * starting point of the stream + * @param messageHandler Function for translating each message and metadata into the desired type + */ + def createDirectStream[K: ClassTag, V: ClassTag, R: ClassTag]( + ssc: StreamingContext, + kafkaParams: Map[String, String], + fromOffsets: Map[TopicPartition, Long], + messageHandler: ConsumerRecord[K, V] => R + ): InputDStream[R] = { + val cleanedHandler = ssc.sc.clean(messageHandler) + new DirectKafkaInputDStream[K, V, R]( + ssc, addSSLOptions(kafkaParams, ssc.sparkContext), fromOffsets, messageHandler) + } + + /** + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). + * + * Points to note: + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.v09.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. + * + * @param ssc StreamingContext object + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers), specified in + * host1:port1,host2:port2 form. 
+ * If not starting from a checkpoint, "auto.offset.reset" may be set to + * "earliest" or "latest" to determine where the stream starts + * (defaults to "latest") + * @param topics Names of the topics to consume + */ + def createDirectStream[K: ClassTag, V: ClassTag]( + ssc: StreamingContext, + kafkaParams: Map[String, String], + topics: Set[String] + ): InputDStream[(K, V)] = { + val messageHandler = (cr: ConsumerRecord[K, V]) => (cr.key, cr.value) + val fromOffsets = getFromOffsets(kafkaParams, topics) + + new DirectKafkaInputDStream[K, V, (K, V)]( + ssc, addSSLOptions(kafkaParams, ssc.sparkContext), fromOffsets, messageHandler) + } + + /** + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). + * + * Points to note: + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.v09.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. + * + * @param jssc JavaStreamingContext object + * @param keyClass Class of the keys in the Kafka records + * @param valueClass Class of the values in the Kafka records + * @param recordClass Class of the records in DStream + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * specified in host1:port1,host2:port2 form. + * @param fromOffsets Per-topic/partition Kafka offsets defining the (inclusive) + * starting point of the stream + * @param messageHandler Function for translating each message and metadata into the desired type + */ + def createDirectStream[K, V, R]( + jssc: JavaStreamingContext, + keyClass: Class[K], + valueClass: Class[V], + recordClass: Class[R], + kafkaParams: JMap[String, String], + fromOffsets: JMap[TopicPartition, JLong], + messageHandler: JFunction[ConsumerRecord[K, V], R] + ): JavaInputDStream[R] = { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + implicit val recordCmt: ClassTag[R] = ClassTag(recordClass) + val cleanedHandler = jssc.sparkContext.clean(messageHandler.call _) + createDirectStream[K, V, R]( + jssc.ssc, + Map(kafkaParams.asScala.toSeq: _*), + Map(fromOffsets.asScala.mapValues { + _.longValue() + }.toSeq: _*), + cleanedHandler + ) + } + + /** + * Create an input stream that directly pulls messages from Kafka Brokers + * without using any receiver. This stream can guarantee that each message + * from Kafka is included in transformations exactly once (see points below). 
+ * + * Points to note: + * - No receivers: This stream does not use any receiver. It directly queries Kafka + * - Offsets: This does not use Zookeeper to store offsets. The consumed offsets are tracked + * by the stream itself. + * You can access the offsets used in each batch from the generated RDDs (see + * [[org.apache.spark.streaming.kafka.v09.HasOffsetRanges]]). + * - Failure Recovery: To recover from driver failures, you have to enable checkpointing + * in the [[StreamingContext]]. The information on consumed offset can be + * recovered from the checkpoint. See the programming guide for details (constraints, etc.). + * - End-to-end semantics: This stream ensures that every records is effectively received and + * transformed exactly once, but gives no guarantees on whether the transformed data are + * outputted exactly once. For end-to-end exactly-once semantics, you have to either ensure + * that the output operation is idempotent, or use transactions to output records atomically. + * See the programming guide for more details. + * + * @param jssc JavaStreamingContext object + * @param keyClass Class of the keys in the Kafka records + * @param valueClass Class of the values in the Kafka records + * @param kafkaParams Kafka + * configuration parameters. Requires "bootstrap.servers" + * to be set with Kafka broker(s) (NOT zookeeper servers), specified in + * host1:port1,host2:port2 form. + * If not starting from a checkpoint, "auto.offset.reset" may be set + * to "latest" or "earliest" to determine where the stream starts + * (defaults to "latest") + * @param topics Names of the topics to consume + */ + def createDirectStream[K, V]( + jssc: JavaStreamingContext, + keyClass: Class[K], + valueClass: Class[V], + kafkaParams: JMap[String, String], + topics: JSet[String] + ): JavaPairInputDStream[K, V] = { + implicit val keyCmt: ClassTag[K] = ClassTag(keyClass) + implicit val valueCmt: ClassTag[V] = ClassTag(valueClass) + createDirectStream[K, V]( + jssc.ssc, + Map(kafkaParams.asScala.toSeq: _*), + Set(topics.asScala.toSeq: _*) + ) + } + + def createOffsetRange( + topic: String, + partition: JInt, + fromOffset: JLong, + untilOffset: JLong + ): OffsetRange = OffsetRange.create(topic, partition, fromOffset, untilOffset) + + def createTopicAndPartition(topic: String, partition: JInt): TopicAndPartition = + TopicAndPartition(topic, partition) + + private[kafka] def getFromOffsets( + kafkaParams: Map[String, String], + topics: Set[String] + ): Map[TopicPartition, Long] = { + val kc = new KafkaCluster(kafkaParams) + try { + val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) + if (reset == Some("earliest")) { + kc.getEarliestOffsets(kc.getPartitions(topics)) + } else { + kc.getLatestOffsets(kc.getPartitions(topics)) + } + } + finally { + kc.close() + } + } +} + +/** + * This is a helper class that wraps the KafkaUtils.createStream() into more + * Python-friendly class and function so that it can be easily + * instantiated and called from Python's KafkaUtils (see SPARK-6027). 
+ * + * The zero-arg constructor helps instantiate this class from the Class object + * classOf[KafkaUtilsPythonHelper].newInstance(), and the createStream() + * takes care of known parameters instead of passing them from Python + */ +private[kafka] class KafkaUtilsPythonHelper { + import KafkaUtilsPythonHelper._ + + def createRDDWithoutMessageHandler( + jsc: JavaSparkContext, + kafkaParams: JMap[String, String], + offsetRanges: JList[OffsetRange]): JavaRDD[(Array[Byte], Array[Byte])] = { + val messageHandler = + (cr: ConsumerRecord[Array[Byte], Array[Byte]]) => (cr.key, cr.value) + new JavaRDD(createRDD(jsc, kafkaParams, offsetRanges, messageHandler)) + } + + def createRDDWithMessageHandler( + jsc: JavaSparkContext, + kafkaParams: JMap[String, String], + offsetRanges: JList[OffsetRange]): JavaRDD[Array[Byte]] = { + val messageHandler = (cr: ConsumerRecord[Array[Byte], Array[Byte]]) => + new PythonConsumerRecord( + cr.topic, cr.partition, cr.offset, cr.key(), cr.value()) + val rdd = createRDD(jsc, kafkaParams, offsetRanges, messageHandler). + mapPartitions(picklerIterator) + new JavaRDD(rdd) + } + + private def createRDD[V: ClassTag]( + jsc: JavaSparkContext, + kafkaParams: JMap[String, String], + offsetRanges: JList[OffsetRange], + messageHandler: ConsumerRecord[Array[Byte], Array[Byte]] => V): RDD[V] = { + kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "ByteArrayDeserializer" ) + kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "ByteArrayDeserializer") + KafkaUtils.createRDD[Array[Byte], Array[Byte], V]( + jsc.sc, + kafkaParams.asScala.toMap, + offsetRanges.toArray(new Array[OffsetRange](offsetRanges.size())), + messageHandler + ) + } + + def createDirectStreamWithoutMessageHandler( + jssc: JavaStreamingContext, + kafkaParams: JMap[String, String], + topics: JSet[String], + fromOffsets: JMap[TopicPartition, JLong]): JavaDStream[(Array[Byte], Array[Byte])] = { + val messageHandler = + (cr: ConsumerRecord[Array[Byte], Array[Byte]]) => (cr.key, cr.value) + new JavaDStream(createDirectStream(jssc, kafkaParams, topics, fromOffsets, messageHandler)) + } + + def createDirectStreamWithMessageHandler( + jssc: JavaStreamingContext, + kafkaParams: JMap[String, String], + topics: JSet[String], + fromOffsets: JMap[TopicPartition, JLong]): JavaDStream[Array[Byte]] = { + val messageHandler = (cr: ConsumerRecord[Array[Byte], Array[Byte]]) => + new PythonConsumerRecord(cr.topic, cr.partition, cr.offset, cr.key(), cr.value()) + val stream = createDirectStream(jssc, kafkaParams, topics, fromOffsets, messageHandler). 
+ mapPartitions(picklerIterator) + new JavaDStream(stream) + } + + private def createDirectStream[V: ClassTag]( + jssc: JavaStreamingContext, + kafkaParams: JMap[String, String], + topics: JSet[String], + fromOffsets: JMap[TopicPartition, JLong], + messageHandler: ConsumerRecord[Array[Byte], Array[Byte]] => V): DStream[V] = { + + val currentFromOffsets = if (!fromOffsets.isEmpty) { + val topicsFromOffsets = fromOffsets.keySet().asScala.map(_.topic) + if (topicsFromOffsets != topics.asScala.toSet) { + throw new IllegalStateException( + s"The specified topics: ${topics.asScala.toSet.mkString(" ")} " + + s"do not equal to the topic from offsets: ${topicsFromOffsets.mkString(" ")}") + } + Map(fromOffsets.asScala.mapValues { _.longValue() }.toSeq: _*) + } else { + KafkaUtils.getFromOffsets( + Map(kafkaParams.asScala.toSeq: _*), Set(topics.asScala.toSeq: _*)) + } + + kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "ByteArrayDeserializer" ) + kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "ByteArrayDeserializer") + KafkaUtils.createDirectStream[Array[Byte], Array[Byte], V]( + jssc.ssc, + Map(kafkaParams.asScala.toSeq: _*), + Map(currentFromOffsets.toSeq: _*), + messageHandler) + } + + def createOffsetRange( + topic: String, + partition: JInt, + fromOffset: JLong, + untilOffset: JLong + ): OffsetRange = OffsetRange.create(topic, partition, fromOffset, untilOffset) + + def createTopicAndPartition(topic: String, partition: JInt): TopicPartition = + new TopicPartition(topic, partition) + + def offsetRangesOfKafkaRDD(rdd: RDD[_]): JList[OffsetRange] = { + val parentRDDs = rdd.getNarrowAncestors + val kafkaRDDs = parentRDDs.filter(rdd => rdd.isInstanceOf[KafkaRDD[_, _, _]]) + + require( + kafkaRDDs.length == 1, + "Cannot get offset ranges, as there may be multiple Kafka RDDs or no Kafka RDD associated" + + "with this RDD, please call this method only on a Kafka RDD.") + + val kafkaRDD = kafkaRDDs.head.asInstanceOf[KafkaRDD[_, _, _]] + kafkaRDD.offsetRanges.toSeq.asJava + } +} + +private object KafkaUtilsPythonHelper { + private var initialized = false + + def initialize(): Unit = { + SerDeUtil.initialize() + synchronized { + if (!initialized) { + new PythonConsumerRecordPickler().register() + initialized = true + } + } + } + + initialize() + + def picklerIterator(iter: Iterator[Any]): Iterator[Array[Byte]] = { + new SerDeUtil.AutoBatchedPickler(iter) + } + + case class PythonConsumerRecord( + topic: String, + partition: JInt, + offset: JLong, + key: Array[Byte], + message: Array[Byte]) + + class PythonConsumerRecordPickler extends IObjectPickler { + private val module = "pyspark.streaming.kafka" + + def register(): Unit = { + Pickler.registerCustomPickler(classOf[PythonConsumerRecord], this) + Pickler.registerCustomPickler(this.getClass, this) + } + + def pickle(obj: Object, out: OutputStream, pickler: Pickler) { + if (obj == this) { + out.write(Opcodes.GLOBAL) + out.write(s"$module\nKafkaMessageAndMetadata\n".getBytes(UTF_8)) + } else { + pickler.save(this) + val msgAndMetaData = obj.asInstanceOf[PythonConsumerRecord] + out.write(Opcodes.MARK) + pickler.save(msgAndMetaData.topic) + pickler.save(msgAndMetaData.partition) + pickler.save(msgAndMetaData.offset) + pickler.save(msgAndMetaData.key) + pickler.save(msgAndMetaData.message) + out.write(Opcodes.TUPLE) + out.write(Opcodes.REDUCE) + } + } + } +} diff --git a/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/OffsetRange.scala 
b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/OffsetRange.scala new file mode 100644 index 000000000000..20672ac4745c --- /dev/null +++ b/external/kafka-v09/src/main/scala/org/apache/spark/streaming/kafka/v09/OffsetRange.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import kafka.common.TopicAndPartition +import org.apache.kafka.common.TopicPartition + +/** + * Represents any object that has a collection of [[OffsetRange]]s. This can be used to access the + * offset ranges in RDDs generated by the direct Kafka DStream (see + * [[KafkaUtils.createDirectStream()]]). + * {{{ + * KafkaUtils.createDirectStream(...).foreachRDD { rdd => + * val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + * ... + * } + * }}} + */ +trait HasOffsetRanges { + def offsetRanges: Array[OffsetRange] +} + +/** + * Represents a range of offsets from a single Kafka TopicAndPartition. Instances of this class + * can be created with `OffsetRange.create()`. + * @param topic Kafka topic name + * @param partition Kafka partition id + * @param fromOffset Inclusive starting offset + * @param untilOffset Exclusive ending offset + */ +final class OffsetRange private( + val topic: String, + val partition: Int, + val fromOffset: Long, + val untilOffset: Long, + val leaderHost: String) extends Serializable { + import OffsetRange.OffsetRangeTuple + + def this( + topic: String, + partition: Int, + fromOffset: Long, + untilOffset: Long + ) = { + this(topic, partition, fromOffset, untilOffset, null) + } + + /** Kafka TopicAndPartition object, for convenience */ + def topicAndPartition(): TopicAndPartition = TopicAndPartition(topic, partition) + + def topicPartition(): TopicPartition = new TopicPartition(topic, partition) + + /** Number of messages this OffsetRange refers to */ + def count(): Long = untilOffset - fromOffset + + override def equals(obj: Any): Boolean = obj match { + case that: OffsetRange => + this.topic == that.topic && + this.partition == that.partition && + this.fromOffset == that.fromOffset && + this.untilOffset == that.untilOffset + case _ => false + } + + override def hashCode(): Int = { + toTuple.hashCode() + } + + override def toString(): String = { + s"OffsetRange(topic: '$topic', partition: $partition, range: [$fromOffset -> $untilOffset])" + } + + /** this is to avoid ClassNotFoundException during checkpoint restore */ + private[streaming] + def toTuple: OffsetRangeTuple = (topic, partition, fromOffset, untilOffset) +} + +/** + * Companion object that provides methods to create instances of [[OffsetRange]].
+ */ +object OffsetRange { + def create(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange = + new OffsetRange(topic, partition, fromOffset, untilOffset) + + def create( + topicAndPartition: TopicAndPartition, + fromOffset: Long, + untilOffset: Long): OffsetRange = + new OffsetRange(topicAndPartition.topic, topicAndPartition.partition, fromOffset, untilOffset) + + def apply(topic: String, partition: Int, fromOffset: Long, untilOffset: Long): OffsetRange = + new OffsetRange(topic, partition, fromOffset, untilOffset) + + + def apply( + topic: String, + partition: Int, + fromOffset: Long, + untilOffset: Long, + leaderHost: String): OffsetRange = + new OffsetRange(topic, partition, fromOffset, untilOffset, leaderHost) + + def apply( + topicPartition: TopicPartition, + fromOffset: Long, + untilOffset: Long): OffsetRange = + new OffsetRange(topicPartition.topic, topicPartition.partition, fromOffset, untilOffset) + + /** this is to avoid ClassNotFoundException during checkpoint restore */ + private[kafka] + type OffsetRangeTuple = (String, Int, Long, Long) + + private[kafka] + def apply(t: OffsetRangeTuple) = + new OffsetRange(t._1, t._2, t._3, t._4) +} diff --git a/external/kafka-v09/src/test/java/org/apache/spark/streaming/kafka/v09/JavaDirectKafkaStreamSuite.java b/external/kafka-v09/src/test/java/org/apache/spark/streaming/kafka/v09/JavaDirectKafkaStreamSuite.java new file mode 100644 index 000000000000..71e0e9203d64 --- /dev/null +++ b/external/kafka-v09/src/test/java/org/apache/spark/streaming/kafka/v09/JavaDirectKafkaStreamSuite.java @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09; + + +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.common.TopicPartition; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.streaming.Durations; +import org.apache.spark.streaming.api.java.JavaDStream; +import org.apache.spark.streaming.api.java.JavaStreamingContext; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.*; +import java.util.concurrent.atomic.AtomicReference; + +public class JavaDirectKafkaStreamSuite implements Serializable { + private transient JavaStreamingContext ssc = null; + private transient KafkaTestUtils kafkaTestUtils = null; + + @Before + public void setUp() { + kafkaTestUtils = new KafkaTestUtils(); + kafkaTestUtils.setup(); + SparkConf sparkConf = new SparkConf() + .setMaster("local[4]").setAppName(this.getClass().getSimpleName()); + ssc = new JavaStreamingContext(sparkConf, Durations.milliseconds(200)); + } + + @After + public void tearDown() { + if (ssc != null) { + ssc.stop(); + ssc = null; + } + + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown(); + kafkaTestUtils = null; + } + } + + @Test + public void testKafkaStream() throws InterruptedException { + final String topic1 = "topic1_testKafkaDirectStream"; + final String topic2 = "topic2_testKafkaDirectStream"; + // hold a reference to the current offset ranges, so it can be used downstream + final AtomicReference offsetRanges = new AtomicReference<>(); + + String[] topic1data = createTopicAndSendData(topic1); + String[] topic2data = createTopicAndSendData(topic2); + + Set sent = new HashSet<>(); + sent.addAll(Arrays.asList(topic1data)); + sent.addAll(Arrays.asList(topic2data)); + + Map kafkaParams = new HashMap<>(); + kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaTestUtils.brokerAddress()); + kafkaParams.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); + kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); + kafkaParams.put("spark.kafka.poll.time", "1000"); + + JavaDStream stream1 = KafkaUtils.createDirectStream( + ssc, + String.class, + String.class, + kafkaParams, + topicToSet(topic1) + ).transformToPair( + // Make sure you can get offset ranges from the rdd + new Function, JavaPairRDD>() { + @Override + public JavaPairRDD call(JavaPairRDD rdd) { + OffsetRange[] offsets = ((HasOffsetRanges) rdd.rdd()).offsetRanges(); + offsetRanges.set(offsets); + Assert.assertEquals(topic1, offsets[0].topic()); + return rdd; + } + } + ).map( + new Function, String>() { + @Override + public String call(Tuple2 kv) { + return kv._2(); + } + } + ); + + JavaDStream stream2 = KafkaUtils.createDirectStream( + ssc, + String.class, + String.class, + String.class, + kafkaParams, + topicOffsetToMap(topic2, 0L), + new Function, String>() { + @Override + public String call(ConsumerRecord consumerRecord) throws Exception { + return consumerRecord.value(); + } + } + ); + JavaDStream unifiedStream = stream1.union(stream2); + + final Set result = Collections.synchronizedSet(new HashSet()); + 
unifiedStream.foreachRDD( + new Function, Void>() { + @Override + public Void call(JavaRDD rdd) { + result.addAll(rdd.collect()); + for (OffsetRange o : offsetRanges.get()) { + System.out.println( + o.topic() + " " + o.partition() + " " + o.fromOffset() + " " + o.untilOffset() + ); + } + return null; + } + } + ); + ssc.start(); + long startTime = System.currentTimeMillis(); + boolean matches = false; + while (!matches && System.currentTimeMillis() - startTime < 20000) { + matches = sent.size() == result.size(); + Thread.sleep(50); + } + Assert.assertEquals(sent, result); + ssc.stop(); + } + + private static Set topicToSet(String topic) { + Set topicSet = new HashSet<>(); + topicSet.add(topic); + return topicSet; + } + + private static Map topicOffsetToMap(String topic, Long offsetToStart) { + Map topicMap = new HashMap<>(); + topicMap.put(new TopicPartition(topic, 0), offsetToStart); + return topicMap; + } + + private String[] createTopicAndSendData(String topic) { + String[] data = {topic + "-1", topic + "-2", topic + "-3"}; + kafkaTestUtils.createTopic(topic); + kafkaTestUtils.sendMessages(topic, data); + return data; + } +} diff --git a/external/kafka-v09/src/test/java/org/apache/spark/streaming/kafka/v09/JavaKafkaRDDSuite.java b/external/kafka-v09/src/test/java/org/apache/spark/streaming/kafka/v09/JavaKafkaRDDSuite.java new file mode 100644 index 000000000000..e020e436518f --- /dev/null +++ b/external/kafka-v09/src/test/java/org/apache/spark/streaming/kafka/v09/JavaKafkaRDDSuite.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; + +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.clients.consumer.ConsumerRecord; +import scala.Tuple2; + +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; + +public class JavaKafkaRDDSuite implements Serializable { + private transient JavaSparkContext sc = null; + private transient KafkaTestUtils kafkaTestUtils = null; + + @Before + public void setUp() { + kafkaTestUtils = new KafkaTestUtils(); + kafkaTestUtils.setup(); + SparkConf sparkConf = new SparkConf() + .setMaster("local[4]").setAppName(this.getClass().getSimpleName()); + sc = new JavaSparkContext(sparkConf); + } + + @After + public void tearDown() { + if (sc != null) { + sc.stop(); + sc = null; + } + + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown(); + kafkaTestUtils = null; + } + } + + @Test + public void testKafkaRDD() throws InterruptedException { + String topic1 = "topic1_testKafkaRDD"; + String topic2 = "topic2_testKafkaRDD"; + + createTopicAndSendData(topic1); + createTopicAndSendData(topic2); + + Map kafkaParams = new HashMap<>(); + kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, kafkaTestUtils.brokerAddress()); + kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); + kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"); + kafkaParams.put("spark.kafka.poll.time", "1000"); + + OffsetRange[] offsetRanges = { + OffsetRange.create(topic1, 0, 0, 1), + OffsetRange.create(topic2, 0, 0, 1) + }; + + JavaRDD rdd1 = KafkaUtils.createRDD( + sc, + String.class, + String.class, + kafkaParams, + offsetRanges + ).map( + new Function, String>() { + @Override + public String call(Tuple2 kv) { + return kv._2(); + } + } + ); + + JavaRDD rdd2 = KafkaUtils.createRDD( + sc, + String.class, + String.class, + String.class, + kafkaParams, + offsetRanges, + new Function, String>() { + @Override + public String call(ConsumerRecord consumerRecord) throws Exception { + return consumerRecord.value(); + } + } + ); + + JavaRDD rdd3 = KafkaUtils.createRDD( + sc, + String.class, + String.class, + String.class, + kafkaParams, + offsetRanges, + new Function, String>() { + @Override + public String call(ConsumerRecord consumerRecord) throws Exception { + return consumerRecord.value(); + } + } + ); + + // just making sure the java user apis work; the scala tests handle logic corner cases + long count1 = rdd1.count(); + long count2 = rdd2.count(); + long count3 = rdd3.count(); + Assert.assertTrue(count1 > 0); + Assert.assertEquals(count1, count2); + Assert.assertEquals(count1, count3); + } + + private String[] createTopicAndSendData(String topic) { + String[] data = { topic + "-1", topic + "-2", topic + "-3"}; + kafkaTestUtils.createTopic(topic); + kafkaTestUtils.sendMessages(topic, data); + return data; + } +} diff --git a/external/kafka-v09/src/test/resources/log4j.properties b/external/kafka-v09/src/test/resources/log4j.properties new file mode 100644 index 000000000000..75e3b53a093f --- /dev/null +++ b/external/kafka-v09/src/test/resources/log4j.properties @@ -0,0 +1,28 @@ +# +# Licensed to the 
Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file target/unit-tests.log +log4j.rootCategory=INFO, file +log4j.appender.file=org.apache.log4j.FileAppender +log4j.appender.file.append=true +log4j.appender.file.file=target/unit-tests.log +log4j.appender.file.layout=org.apache.log4j.PatternLayout +log4j.appender.file.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n + +# Ignore messages below warning level from Jetty, because it's a bit verbose +log4j.logger.org.spark-project.jetty=WARN + diff --git a/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/DirectKafkaStreamSuite.scala b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/DirectKafkaStreamSuite.scala new file mode 100644 index 000000000000..cb4dab6a2cf4 --- /dev/null +++ b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/DirectKafkaStreamSuite.scala @@ -0,0 +1,492 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09 + +import java.io.File +import java.util.concurrent.atomic.AtomicLong + +import kafka.common.TopicAndPartition +import org.apache.kafka.clients.consumer.{ ConsumerRecord, ConsumerConfig } +import org.apache.kafka.common.TopicPartition +import org.apache.spark.rdd.RDD +import org.apache.spark.streaming.dstream.DStream +import org.apache.spark.streaming.scheduler.rate.RateEstimator +import org.apache.spark.streaming.scheduler.{ StreamingListenerBatchCompleted, StreamingListenerBatchStarted, StreamingListenerBatchSubmitted, StreamingListener } +import org.apache.spark.streaming.{ Time, Milliseconds, StreamingContext } +import org.apache.spark.util.Utils +import org.apache.spark.{ SparkContext, SparkConf, Logging, SparkFunSuite } +import org.scalatest.concurrent.Eventually +import org.scalatest.{ BeforeAndAfter, BeforeAndAfterAll } + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.duration._ +import scala.language.postfixOps + +class DirectKafkaStreamSuite + extends SparkFunSuite + with BeforeAndAfter + with BeforeAndAfterAll + with Eventually + with Logging { + val sparkConf = new SparkConf() + .setMaster("local[4]") + .setAppName(this.getClass.getSimpleName) + + private var sc: SparkContext = _ + private var ssc: StreamingContext = _ + private var testDir: File = _ + + private var kafkaTestUtils: KafkaTestUtils = _ + + override def beforeAll { + kafkaTestUtils = new KafkaTestUtils + kafkaTestUtils.setup() + } + + override def afterAll { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } + + after { + if (ssc != null) { + ssc.stop() + sc = null + } + if (sc != null) { + sc.stop() + } + if (testDir != null) { + Utils.deleteRecursively(testDir) + } + } + + test("basic stream receiving with multiple topics and earliest starting offset") { + val topics = Set("new_basic1", "new_basic2", "new_basic3") + val data = Map("a" -> 7, "b" -> 9) + topics.foreach { t => + kafkaTestUtils.createTopic(t) + kafkaTestUtils.sendMessages(t, data) + } + val totalSent = data.values.sum * topics.size + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "1000") + + ssc = new StreamingContext(sparkConf, Milliseconds(200)) + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String]( + ssc, kafkaParams, topics) + } + + val allReceived = + new ArrayBuffer[(String, String)] with mutable.SynchronizedBuffer[(String, String)] + + // hold a reference to the current offset ranges, so it can be used downstream + var offsetRanges = Array[OffsetRange]() + + stream.transform { rdd => + // Get the offset ranges in the RDD + offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges + rdd + }.foreachRDD { rdd => + for (o <- offsetRanges) { + log.info(s"${rdd.id} | ${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}") + } + val collected = rdd.mapPartitionsWithIndex { (i, iter) => + // For each partition, get size of the range in the partition, + // and the number of items in the partition + val off = offsetRanges(i) + val all = iter.toSeq + val partSize = all.size + val rangeSize 
= off.untilOffset - off.fromOffset + Iterator((partSize, rangeSize)) + }.collect + + // Verify whether number of elements in each partition + // matches with the corresponding offset range + collected.foreach { + case (partSize, rangeSize) => + assert(partSize === rangeSize, "offset ranges are wrong") + } + } + stream.foreachRDD { rdd => allReceived ++= rdd.collect() } + ssc.start() + eventually(timeout(20000.milliseconds), interval(200.milliseconds)) { + assert(allReceived.size === totalSent, + "didn't get expected number of messages, messages:\n" + allReceived.mkString("\n")) + } + ssc.stop() + } + + test("receiving from latest starting offset") { + val topic = "new_latest" + val topicPartition = new TopicPartition(topic, 0) + val data = Map("a" -> 10) + kafkaTestUtils.createTopic(topic) + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "100") + val kc = new KafkaCluster(kafkaParams) + def getLatestOffset(): Long = { + kc.getLatestOffsets(Set(topicPartition)).get(topicPartition).getOrElse(0) + } + + // Send some initial messages before starting context + kafkaTestUtils.sendMessages(topic, data) + eventually(timeout(10 seconds), interval(20 milliseconds)) { + assert(getLatestOffset() > 3) + } + val offsetBeforeStart = getLatestOffset() + + // Setup context and kafka stream with largest offset + ssc = new StreamingContext(sparkConf, Milliseconds(200)) + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String]( + ssc, kafkaParams, Set(topic)) + } + assert( + stream.asInstanceOf[DirectKafkaInputDStream[_, _, _]] + .fromOffsets(topicPartition) >= offsetBeforeStart, + "Start offset not from latest" + ) + + val collectedData = new mutable.ArrayBuffer[String]() with mutable.SynchronizedBuffer[String] + stream.map { _._2 }.foreachRDD { rdd => collectedData ++= rdd.collect() } + ssc.start() + val newData = Map("b" -> 10) + kafkaTestUtils.sendMessages(topic, newData) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + collectedData.contains("b") + } + assert(!collectedData.contains("a")) + } + + test("creating stream by offset") { + val topic = "new_offset" + val topicPartition = new TopicPartition(topic, 0) + val data = Map("a" -> 10) + kafkaTestUtils.createTopic(topic) + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "100") + val kc = new KafkaCluster(kafkaParams) + def getLatestOffset(): Long = { + kc.getLatestOffsets(Set(topicPartition)).get(topicPartition).getOrElse(0) + } + + // Send some initial messages before starting context + kafkaTestUtils.sendMessages(topic, data) + eventually(timeout(10 seconds), interval(20 milliseconds)) { + assert(getLatestOffset() >= 10) + } + val offsetBeforeStart = getLatestOffset() + + // Setup context and kafka stream with largest offset + ssc = new StreamingContext(sparkConf, 
Milliseconds(200)) + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String, String]( + ssc, kafkaParams, Map(topicPartition -> 11L), + (m: ConsumerRecord[String, String]) => m.value()) + } + assert( + stream.asInstanceOf[DirectKafkaInputDStream[_, _, _]] + .fromOffsets(topicPartition) >= offsetBeforeStart, + "Start offset not from latest") + + val collectedData = new mutable.ArrayBuffer[String]() with mutable.SynchronizedBuffer[String] + stream.foreachRDD { rdd => collectedData ++= rdd.collect() } + ssc.start() + val newData = Map("b" -> 10) + kafkaTestUtils.sendMessages(topic, newData) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + collectedData.contains("b") + } + assert(!collectedData.contains("a")) + } + + // Test to verify the offset ranges can be recovered from the checkpoints + test("offset recovery") { + val topic = "new_recovery" + kafkaTestUtils.createTopic(topic) + testDir = Utils.createTempDir() + + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "1000") + + // Send data to Kafka and wait for it to be received + def sendDataAndWaitForReceive(data: Seq[Int]) { + val strings = data.map { _.toString } + kafkaTestUtils.sendMessages(topic, strings.map { _ -> 1 }.toMap) + eventually(timeout(60 seconds), interval(200 milliseconds)) { + assert(strings.forall { + DirectKafkaStreamSuite.collectedData.contains + }) + } + } + + // Setup the streaming context + ssc = new StreamingContext(sparkConf, Milliseconds(100)) + val kafkaStream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String]( + ssc, kafkaParams, Set(topic)) + } + val keyedStream = kafkaStream.map { v => "key" -> v._2.toInt } + val stateStream = keyedStream.updateStateByKey { (values: Seq[Int], state: Option[Int]) => + Some(values.sum + state.getOrElse(0)) + } + ssc.checkpoint(testDir.getAbsolutePath) + + // This is to collect the raw data received from Kafka + kafkaStream.foreachRDD { (rdd: RDD[(String, String)], time: Time) => + val data = rdd.map { _._2 }.collect() + DirectKafkaStreamSuite.collectedData.appendAll(data) + } + + // This is ensure all the data is eventually receiving only once + stateStream.foreachRDD { (rdd: RDD[(String, Int)]) => + rdd.collect().headOption.foreach { x => DirectKafkaStreamSuite.total = x._2 } + } + ssc.start() + + // Send some data and wait for them to be received + for (i <- (1 to 10).grouped(4)) { + sendDataAndWaitForReceive(i) + } + + // Verify that offset ranges were generated + val offsetRangesBeforeStop = getOffsetRanges(kafkaStream) + assert(offsetRangesBeforeStop.size >= 1, "No offset ranges generated") + assert( + offsetRangesBeforeStop.head._2.forall { _.fromOffset === 0 }, + "starting offset not zero" + ) + ssc.stop() + logInfo("====== RESTARTING ========") + + // Recover context from checkpoints + ssc = new StreamingContext(testDir.getAbsolutePath) + val recoveredStream = ssc.graph.getInputStreams().head.asInstanceOf[DStream[(String, String)]] + + // Verify offset ranges have been recovered + val recoveredOffsetRanges = getOffsetRanges(recoveredStream) + assert(recoveredOffsetRanges.size > 0, "No offset ranges recovered") + 
val earlierOffsetRangesAsSets = offsetRangesBeforeStop.map { x => (x._1, x._2.toSet) } + assert( + recoveredOffsetRanges.forall { or => + earlierOffsetRangesAsSets.contains((or._1, or._2.toSet)) + }, + "Recovered ranges are not the same as the ones generated" + ) + // Restart context, give more data and verify the total at the end + // If the total is right, that means each record has been received only once + ssc.start() + sendDataAndWaitForReceive(11 to 20) + eventually(timeout(10 seconds), interval(50 milliseconds)) { + assert(DirectKafkaStreamSuite.total === (1 to 20).sum) + } + ssc.stop() + } + + test("Direct Kafka stream report input information") { + val topic = "new_report-test" + val data = Map("a" -> 7, "b" -> 9) + kafkaTestUtils.createTopic(topic) + kafkaTestUtils.sendMessages(topic, data) + + val totalSent = data.values.sum + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "1000") + + import DirectKafkaStreamSuite._ + ssc = new StreamingContext(sparkConf, Milliseconds(200)) + val collector = new InputInfoCollector + ssc.addStreamingListener(collector) + + val stream = withClue("Error creating direct stream") { + KafkaUtils.createDirectStream[String, String]( + ssc, kafkaParams, Set(topic)) + } + + val allReceived = + new ArrayBuffer[(String, String)] with mutable.SynchronizedBuffer[(String, String)] + + stream.foreachRDD { rdd => allReceived ++= rdd.collect() } + ssc.start() + eventually(timeout(20000.milliseconds), interval(200.milliseconds)) { + assert(allReceived.size === totalSent, + "didn't get expected number of messages, messages:\n" + allReceived.mkString("\n")) + + // Check the record counts collected by the StreamingListener. + assert(collector.numRecordsSubmitted.get() === totalSent) + assert(collector.numRecordsStarted.get() === totalSent) + assert(collector.numRecordsCompleted.get() === totalSent) + } + ssc.stop() + } + + test("using rate controller") { + val topic = "new_backpressure" + val topicPartition = new TopicPartition(topic, 0) + kafkaTestUtils.createTopic(topic) + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "1000") + + val batchIntervalMilliseconds = 100 + val estimator = new ConstantEstimator(100) + val messageKeys = (1 to 200).map(_.toString) + val messages = messageKeys.map((_, 1)).toMap + + val sparkConf = new SparkConf() + // Safe, even with streaming, because we're using the direct API. + // Using 1 core is useful to make the test more predictable.
+ .setMaster("local[1]") + .setAppName(this.getClass.getSimpleName) + .set("spark.streaming.kafka.maxRatePerPartition", "100") + + // Setup the streaming context + ssc = new StreamingContext(sparkConf, Milliseconds(batchIntervalMilliseconds)) + + val kafkaStream = withClue("Error creating direct stream") { + val kc = new KafkaCluster(kafkaParams) + val messageHandler = (mmd: ConsumerRecord[String, String]) => (mmd.key(), mmd.value()) + val m = kc.getEarliestOffsets(Set(topicPartition)) + + new DirectKafkaInputDStream[String, String, (String, String)]( + ssc, kafkaParams, m, messageHandler) { + override protected[streaming] val rateController = + Some(new DirectKafkaRateController(id, estimator)) + } + } + + val collectedData = + new mutable.ArrayBuffer[Array[String]]() with mutable.SynchronizedBuffer[Array[String]] + + // Used for assertion failure messages. + def dataToString: String = + collectedData.map(_.mkString("[", ",", "]")).mkString("{", ", ", "}") + + // This is to collect the raw data received from Kafka + kafkaStream.foreachRDD { (rdd: RDD[(String, String)], time: Time) => + val data = rdd.map { _._2 }.collect() + collectedData += data + } + + ssc.start() + + // Try different rate limits. + // Send data to Kafka and wait for arrays of data to appear matching the rate. + Seq(100, 50, 20).foreach { rate => + collectedData.clear() // Empty this buffer on each pass. + estimator.updateRate(rate) // Set a new rate. + // Expect blocks of data equal to "rate", scaled by the interval length in secs. + val expectedSize = Math.round(rate * batchIntervalMilliseconds * 0.001) + kafkaTestUtils.sendMessages(topic, messages) + eventually(timeout(10.seconds), interval(batchIntervalMilliseconds.milliseconds)) { + // Assert that rate estimator values are used to determine maxMessagesPerPartition. + // Funky "-" in message makes the complete assertion message read better. 
+ assert(collectedData.exists(_.size == expectedSize), + s" - No arrays of size $expectedSize for rate $rate found in $dataToString") + } + } + + ssc.stop() + } + + /** Get the generated offset ranges from the DirectKafkaStream */ + private def getOffsetRanges[K, V](kafkaStream: DStream[(K, V)]): + Seq[(Time, Array[OffsetRange])] = { + kafkaStream.generatedRDDs.mapValues { rdd => + rdd.asInstanceOf[KafkaRDD[K, V, (K, V)]].offsetRanges + }.toSeq.sortBy { _._1 } + } +} + +object DirectKafkaStreamSuite { + val collectedData = new mutable.ArrayBuffer[String]() with mutable.SynchronizedBuffer[String] + @volatile var total = -1L + + class InputInfoCollector extends StreamingListener { + val numRecordsSubmitted = new AtomicLong(0L) + val numRecordsStarted = new AtomicLong(0L) + val numRecordsCompleted = new AtomicLong(0L) + + override def onBatchSubmitted(batchSubmitted: StreamingListenerBatchSubmitted): Unit = { + numRecordsSubmitted.addAndGet(batchSubmitted.batchInfo.numRecords) + } + + override def onBatchStarted(batchStarted: StreamingListenerBatchStarted): Unit = { + numRecordsStarted.addAndGet(batchStarted.batchInfo.numRecords) + } + + override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = { + numRecordsCompleted.addAndGet(batchCompleted.batchInfo.numRecords) + } + } +} + +private[streaming] class ConstantEstimator(@volatile private var rate: Long) + extends RateEstimator { + + def updateRate(newRate: Long): Unit = { + rate = newRate + } + + def compute( + time: Long, + elements: Long, + processingDelay: Long, + schedulingDelay: Long): Option[Double] = Some(rate) +} + diff --git a/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaClusterSuite.scala b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaClusterSuite.scala new file mode 100644 index 000000000000..7b7c16456468 --- /dev/null +++ b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaClusterSuite.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09 + +import kafka.common.TopicAndPartition +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.common.TopicPartition +import org.apache.spark.SparkFunSuite +import org.scalatest.BeforeAndAfterAll + +import scala.util.Random + +class KafkaClusterSuite extends SparkFunSuite with BeforeAndAfterAll { + private val topic = "new_kcsuitetopic" + Random.nextInt(10000) + private val topicPartition = new TopicPartition(topic, 0) + private var newKc: KafkaCluster[_, _] = null + + private var kafkaTestUtils: KafkaTestUtils = _ + + override def beforeAll() { + kafkaTestUtils = new KafkaTestUtils + kafkaTestUtils.setup() + + kafkaTestUtils.createTopic(topic) + kafkaTestUtils.sendMessages(topic, Map("a" -> 1)) + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer") + newKc = new KafkaCluster(kafkaParams) + } + + override def afterAll() { + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } + + test("leader offset apis") { + val earliest = newKc.getEarliestOffsets(Set(topicPartition)) + assert(earliest(topicPartition) === 0, "didn't get earliest") + + val latest = newKc.getLatestOffsets(Set(topicPartition)) + assert(latest(topicPartition) === 1, "didn't get latest") + } + +} diff --git a/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaRDDSuite.scala b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaRDDSuite.scala new file mode 100644 index 000000000000..9a296f0f33ac --- /dev/null +++ b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaRDDSuite.scala @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka.v09 + +import kafka.common.TopicAndPartition +import org.apache.kafka.clients.consumer.{ ConsumerConfig, ConsumerRecord } +import org.apache.kafka.common.TopicPartition +import org.apache.spark._ +import org.scalatest.BeforeAndAfterAll + +import scala.util.Random + +class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { + + private var kafkaTestUtils: KafkaTestUtils = _ + + private val sparkConf = new SparkConf().setMaster("local[4]") + .setAppName(this.getClass.getSimpleName) + private var sc: SparkContext = _ + + override def beforeAll { + sc = new SparkContext(sparkConf) + kafkaTestUtils = new KafkaTestUtils + kafkaTestUtils.setup() + } + + override def afterAll { + if (sc != null) { + sc.stop + sc = null + } + + if (kafkaTestUtils != null) { + kafkaTestUtils.teardown() + kafkaTestUtils = null + } + } + + test("basic usage") { + val topic = s"new_topicbasic-${Random.nextInt}" + kafkaTestUtils.createTopic(topic) + val messages = Array("the", "quick", "brown", "fox") + kafkaTestUtils.sendMessages(topic, messages) + + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", + ConsumerConfig.GROUP_ID_CONFIG -> s"test-consumer-${Random.nextInt}", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "10000") + + val offsetRanges = Array(OffsetRange(topic, 0, 0, messages.size)) + + val rdd = KafkaUtils.createRDD[String, String]( + sc, kafkaParams, offsetRanges) + + val received = rdd.map(_._2).collect.toSet + assert(received === messages.toSet) + + // size-related method optimizations return sane results + assert(rdd.count === messages.size) + assert(rdd.countApprox(0).getFinalValue.mean === messages.size) + assert(!rdd.isEmpty) + assert(rdd.take(1).size === 1) + assert(rdd.take(1).head._2 === messages.head) + assert(rdd.take(messages.size + 10).size === messages.size) + + val emptyRdd = KafkaUtils.createRDD[String, String]( + sc, kafkaParams, Array(OffsetRange(topic, 0, 0, 0))) + + assert(emptyRdd.isEmpty) + + // invalid offset ranges throw exceptions + val badRanges = Array(OffsetRange(topic, 0, 0, messages.size + 1)) + intercept[SparkException] { + KafkaUtils.createRDD[String, String]( + sc, kafkaParams, badRanges) + } + } + + test("iterator boundary conditions") { + // the idea is to find e.g. 
off-by-one errors between what kafka has available and the rdd + val topic = s"new_topicboundary-${Random.nextInt}" + val sent = Map("a" -> 5, "b" -> 3, "c" -> 10) + kafkaTestUtils.createTopic(topic) + + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> kafkaTestUtils.brokerAddress, + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", + ConsumerConfig.GROUP_ID_CONFIG -> s"test-consumer-${Random.nextInt}", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "1000") + + val kc = new KafkaCluster(kafkaParams) + + // this is the "lots of messages" case + kafkaTestUtils.sendMessages(topic, sent) + val sentCount = sent.values.sum + + // rdd defined from leaders after sending messages, should get the number sent + val rdd = getRdd(kc, Set(topic)) + + assert(rdd.isDefined) + + val ranges = rdd.get.asInstanceOf[HasOffsetRanges].offsetRanges + val rangeCount = ranges.map(o => o.untilOffset - o.fromOffset).sum + + assert(rangeCount === sentCount, "offset range didn't include all sent messages") + assert(rdd.get.count === sentCount, "didn't get all sent messages") + + val rangesMap = ranges.map(o => new TopicPartition(o.topic, o.partition) -> o.untilOffset).toMap + + // make sure consumer offsets are committed before the next getRdd call + kc.setConsumerOffsets(rangesMap) + + // this is the "0 messages" case + val rdd2 = getRdd(kc, Set(topic)) + // shouldn't get anything, since message is sent after rdd was defined + val sentOnlyOne = Map("d" -> 1) + + kafkaTestUtils.sendMessages(topic, sentOnlyOne) + + assert(rdd2.isDefined) + assert(rdd2.get.count === 0, "got messages when there shouldn't be any") + + // this is the "exactly 1 message" case, namely the single message from sentOnlyOne above + val rdd3 = getRdd(kc, Set(topic)) + // send lots of messages after rdd was defined, they shouldn't show up + kafkaTestUtils.sendMessages(topic, Map("extra" -> 22)) + + assert(rdd3.isDefined) + assert(rdd3.get.count === sentOnlyOne.values.sum, "didn't get exactly one message") + + } + + // get an rdd from the committed consumer offsets until the latest leader offsets, + private def getRdd(kc: KafkaCluster[_, _], topics: Set[String]) = { + val groupId = kc.kafkaParams("group.id") + val topicPartitions = kc.getPartitions(topics) + val consumerOffsets = try { + kc.getCommittedOffsets(topicPartitions) + } catch { + case e: SparkException => kc.getEarliestOffsets(topicPartitions) + } + val latestOffsets = kc.getLatestOffsets(topicPartitions) + + val offsetRanges = consumerOffsets.map { case (tp: TopicPartition, fromOffset: Long) => + OffsetRange(tp.topic, tp.partition, fromOffset, latestOffsets(tp)) + }.toArray + + Option(KafkaUtils.createRDD[String, String, String]( + sc, kc.kafkaParams, offsetRanges, + (cr: ConsumerRecord[String, String]) => s"${cr.offset()} ${cr.value()}")) + } + +} diff --git a/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaUtilsSSLSuite.scala b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaUtilsSSLSuite.scala new file mode 100644 index 000000000000..4ffb7bd53c90 --- /dev/null +++ b/external/kafka-v09/src/test/scala/org/apache/spark/streaming/kafka/v09/KafkaUtilsSSLSuite.scala @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka.v09 + +import org.apache.kafka.clients.CommonClientConfigs +import org.apache.kafka.clients.consumer.ConsumerConfig +import org.apache.kafka.common.config.SslConfigs +import org.apache.spark.{SparkContext, SparkConf, SparkFunSuite} +import org.scalatest.BeforeAndAfterAll + +class KafkaUtilsSSLSuite extends SparkFunSuite with BeforeAndAfterAll { + private var sc: SparkContext = _ + + private val pathToKeyStore = "/path/to/ssl_keystore" + private val pathToTrustStore = "/path/to/ssl_truststore" + private val keystorePasswd = "keystore_secret_pass" + private val truststorePasswd = "truststore_secret_pass" + private val keyPasswd = "key_secret_pass" + + private val sparkSslProperties = Map[String, String] ( + "spark.ssl.kafka.enabled" -> "true", + "spark.ssl.kafka.keyStore" -> pathToKeyStore, + "spark.ssl.kafka.keyStorePassword" -> keystorePasswd, + "spark.ssl.kafka.trustStore" -> pathToTrustStore, + "spark.ssl.kafka.trustStorePassword" -> truststorePasswd, + "spark.ssl.kafka.keyPassword" -> keyPasswd + ) + + private val kafkaSslProperties = Map[String, String] ( + CommonClientConfigs.SECURITY_PROTOCOL_CONFIG -> "SSL", + SslConfigs.SSL_KEYSTORE_LOCATION_CONFIG -> pathToKeyStore, + SslConfigs.SSL_KEYSTORE_PASSWORD_CONFIG -> keystorePasswd, + SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG -> pathToTrustStore, + SslConfigs.SSL_TRUSTSTORE_PASSWORD_CONFIG -> truststorePasswd, + SslConfigs.SSL_KEY_PASSWORD_CONFIG -> keyPasswd + ) + + val sparkConf = new SparkConf() + .setMaster("local[4]") + .setAppName(this.getClass.getSimpleName) + + override def beforeAll { + sparkConf.setAll(sparkSslProperties) + sc = new SparkContext(sparkConf) + } + + override def afterAll { + if (sc != null) { + sc.stop + sc = null + } + } + + test("Check adding SSL properties to Kafka parameters") { + val kafkaParams = Map( + ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "localhost:9093", + ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest", + ConsumerConfig.GROUP_ID_CONFIG -> "test-consumer", + ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> + "org.apache.kafka.common.serialization.StringDeserializer", + "spark.kafka.poll.time" -> "100") + + val kafkaParamsWithSSL = KafkaUtils.addSSLOptions(kafkaParams, sc) + + kafkaSslProperties.foreach { + case (k, v) => assert(kafkaParamsWithSSL.get(k).get.toString == v) + } + } + +} diff --git a/pom.xml b/pom.xml index 234fd5dea1a6..f180fead7d56 100644 --- a/pom.xml +++ b/pom.xml @@ -112,6 +112,7 @@ repl launcher external/kafka + external/kafka-v09 external/kafka-assembly
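The createDirectStream scaladoc above notes that the offsets consumed in each batch can be read back from the generated RDDs through HasOffsetRanges, and DirectKafkaStreamSuite exercises that with a transform-then-foreachRDD pattern. The following is a minimal sketch of that pattern against the new v09 API; it is not part of the patch, and the application name, broker address, group id, and topic name are placeholders.

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.v09._

object OffsetTrackingSketch {
  def main(args: Array[String]): Unit = {
    // local[2] is only for trying the sketch out locally; drop it when using spark-submit.
    val conf = new SparkConf().setAppName("OffsetTrackingSketch").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Same parameter names the tests in this patch use.
    val kafkaParams = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "broker1:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "offset-tracking-sketch",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "earliest",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG ->
        "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG ->
        "org.apache.kafka.common.serialization.StringDeserializer",
      "spark.kafka.poll.time" -> "1000")

    val stream = KafkaUtils.createDirectStream[String, String](ssc, kafkaParams, Set("events"))

    // Capture the offset ranges on the driver while the RDD is still a KafkaRDD,
    // then report them alongside each batch.
    var offsetRanges = Array.empty[OffsetRange]
    stream.transform { rdd =>
      offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd
    }.foreachRDD { rdd =>
      val batchSize = rdd.count()
      offsetRanges.foreach { o =>
        println(s"batch of $batchSize records: ${o.topic} ${o.partition} " +
          s"offsets ${o.fromOffset} -> ${o.untilOffset} (${o.count()} in range)")
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}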
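OffsetRange also supports batch reads: KafkaUtils.createRDD turns an explicit array of ranges into an RDD of (key, value) pairs, as KafkaRDDSuite above exercises. A hedged sketch follows, again with placeholder broker, topic, and offsets; note that an until offset beyond what the broker actually holds makes the job fail with a SparkException, per the "invalid offset ranges throw exceptions" test.

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka.v09._

object KafkaRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("KafkaRDDSketch").setMaster("local[2]"))

    val kafkaParams = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "broker1:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "kafka-rdd-sketch",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG ->
        "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG ->
        "org.apache.kafka.common.serialization.StringDeserializer",
      "spark.kafka.poll.time" -> "1000")

    // Each range is [fromOffset, untilOffset) for one topic-partition.
    val offsetRanges = Array(
      OffsetRange.create("events", 0, 0L, 100L),
      OffsetRange.create("events", 1, 0L, 100L))

    val rdd = KafkaUtils.createRDD[String, String](sc, kafkaParams, offsetRanges)
    println(s"read ${rdd.count()} records")
    rdd.map(_._2).take(5).foreach(println)   // values only

    sc.stop()
  }
}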
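Finally, KafkaUtilsSSLSuite shows the spark.ssl.kafka.* properties that addSSLOptions maps onto the consumer's security.protocol=SSL and ssl.* keystore/truststore settings; createDirectStream applies that translation to the supplied kafkaParams internally. A sketch of the driver-side configuration, with hypothetical keystore and truststore paths and passwords:

import org.apache.spark.SparkConf

// Only the property names come from this patch; every value below is a placeholder.
val sslConf = new SparkConf()
  .setAppName("KafkaSSLConfigSketch")
  .set("spark.ssl.kafka.enabled", "true")
  .set("spark.ssl.kafka.keyStore", "/etc/ssl/kafka.client.keystore.jks")
  .set("spark.ssl.kafka.keyStorePassword", "keystore-password")
  .set("spark.ssl.kafka.keyPassword", "key-password")
  .set("spark.ssl.kafka.trustStore", "/etc/ssl/kafka.client.truststore.jks")
  .set("spark.ssl.kafka.trustStorePassword", "truststore-password")

A StreamingContext built from this SparkConf can then be passed to KafkaUtils.createDirectStream exactly as in the sketches above; no ssl.* keys need to be added to kafkaParams by hand.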