Closed
Changes from all commits (74 commits)
cff3a44
[SPARK-12177] [STREAMING] Update KafkaDStreams to new Kafka 0.9 Consu…
nikit-os Jan 10, 2016
af0c7b9
Getting rid of an extra dependency
markgrover Jan 7, 2016
ff0a427
Merge pull request #1 from markgrover/kafka09-integration
nikit-os Jan 11, 2016
e5f638f
Fix Kafka params in Java example
nikit-os Jan 11, 2016
e0f1275
Removing Broker.scala since it's not needed
markgrover Jan 19, 2016
71ea192
Scala style related changes to not exceed the line length
markgrover Jan 19, 2016
6d27bbd
Formatting changes for styling, also updating scaladoc for functions.
markgrover Jan 19, 2016
6960736
Adding missing leaderhost parameter to various methods in OffsetRange…
markgrover Jan 19, 2016
bb8cef8
Fixing a compilation failure related to previous commit
markgrover Jan 20, 2016
90b80da
Merge pull request #2 from markgrover/kafka09-integration
nikit-os Jan 20, 2016
4f56ffe
Refactoring Java/Scala code to have use the package name newapi inste…
markgrover Jan 26, 2016
d5b7b56
Fixing a scalastyle error
markgrover Jan 26, 2016
e2402f8
Merge branch 'master' into kafka09-integration
markgrover Jan 26, 2016
eedbddc
Fixing scalastyle errors related to import ordering
markgrover Jan 26, 2016
adfd335
Fixing a compilation error
markgrover Jan 27, 2016
a0f052a
More import ordering fixes
markgrover Jan 27, 2016
d2bc2a9
Function declaration formatting in KafkaUtils and other minor formatt…
markgrover Jan 27, 2016
c530fa6
Removing support for Kafka 0.8 completely. You can use the new api or…
markgrover Jan 27, 2016
0763979
Making the new api KafkaTestUtils to not use zkClient explicitly
markgrover Jan 27, 2016
b35ea65
Removing the separate copy of KafkaTestUtils
markgrover Jan 27, 2016
b3f163e
Fixing a scalastyle error
markgrover Jan 27, 2016
41a8816
Adding excludes for binary compat testing. The changes were for 2.0 a…
markgrover Jan 29, 2016
c5657f2
Implementing some of the review feedback from Mario
markgrover Jan 29, 2016
b9cde40
More feedback implemented from Mario
markgrover Jan 30, 2016
82223f8
Getting rid of some duplicate code
markgrover Jan 30, 2016
b3f90cb
Merge branch 'master' into kafka09-integration
markgrover Feb 1, 2016
d9c3b2a
Getting rid of extra whitespace
markgrover Feb 1, 2016
4d4220f
Merge branch 'master' into kafka09-integration
markgrover Feb 2, 2016
d932d75
Updating the new kafka-newapi module to be 2.11 based as well
markgrover Feb 2, 2016
6cd558a
Update KafkaRDD.scala
mariobriggs Feb 2, 2016
903a8b8
Correcting the package name in the wordcount example
markgrover Feb 2, 2016
1f77501
Getting rid of extra parameter of batch interval in examples. Paramet…
markgrover Feb 2, 2016
fd7df3e
Adding a change that will conflict with Mario's change for now
markgrover Feb 2, 2016
8fae471
Merge pull request #2 from mariobriggs/patch-1
markgrover Feb 2, 2016
7dcb848
Changes related to poll time.
markgrover Feb 2, 2016
edfca9f
Updating the import ordering
markgrover Feb 2, 2016
e20dfd0
Fixing up some minor things in the Example
markgrover Feb 3, 2016
424b8f8
Correcting comment
markgrover Feb 3, 2016
2842273
Reintroducing if loop because some times buffer may return empty
markgrover Feb 3, 2016
10a18b8
WIP regarding getting rid of spark.kafka.poll.time in Kafka Params
markgrover Feb 3, 2016
971d74e
WIP refactor of old KafkaRDD
markgrover Feb 5, 2016
b881d39
With this change, the only files that still need to be taken care of …
markgrover Feb 5, 2016
1de98c4
Everything seems to be refactored except KafkaUtils and Java code
markgrover Feb 5, 2016
58e0e01
Refactoring of Kafka Utils. Left is java code, test code, and example…
markgrover Feb 5, 2016
839be24
Complete refactor of KafkaUtils for Scala and potentially Java and ch…
markgrover Feb 5, 2016
4be1cc0
Refactor of DirectKafkaStreamSuite
markgrover Feb 5, 2016
ab69c10
Taken care of KafkaClusterSuite
markgrover Feb 5, 2016
dccb0f9
KafkaRDDSuite is done too. Now Java code and Java tests are the only …
markgrover Feb 6, 2016
0ff2b14
Fixing examples pom file to not have a reference to the old package name
markgrover Feb 6, 2016
2c4ae74
Adding Java tests to the original place
markgrover Feb 6, 2016
647ec61
Adding the Java example. It should compile but it doesn't seem to be …
markgrover Feb 6, 2016
2089b07
Fixing some test bugs and adding some improvements related to clean up
markgrover Feb 7, 2016
a4825c0
Selective revert of the previous patch since the randoms weren't need…
markgrover Feb 7, 2016
f773a32
Test improvements. Now we are only down to only the new tests failing
markgrover Feb 7, 2016
c32518e
Fixing scala style but also minor bug fix in OffsetRanges and polltime
markgrover Feb 7, 2016
cc95e38
Finally figured out how to read the configuration variable from withi…
markgrover Feb 7, 2016
f195ec2
Adding ignore for a test that was failing. This is because of a kafka…
markgrover Feb 7, 2016
f52296c
Adding some extra functions in OffsetRanges
markgrover Feb 8, 2016
1c772fd
Adding excludes for binary compatibility
markgrover Feb 8, 2016
feb6993
Getting rid of unnecessary getPartitions method in the base class, ma…
markgrover Feb 8, 2016
f0e3499
Fixing a Java test bug
markgrover Feb 8, 2016
ec4b4c5
WIP - offset recovery test is failing but this is testing to have no …
markgrover Feb 8, 2016
abd167c
Removing all the extra separate subproject code
markgrover Feb 8, 2016
15d217c
WIP - offset recovery test is failing but this is testing to have no …
markgrover Feb 8, 2016
846a85a
Merge branch 'master' into kafka09-integration
markgrover Feb 8, 2016
eb6ea49
Merge branch 'kafka09-integration' into kafka09-integration-refactor-…
markgrover Feb 9, 2016
1f45b96
Merge branch 'kafka09-integration-refactor-params' of github.com:mark…
markgrover Feb 9, 2016
b2f13ba
Fixing the test by clearing the collection
markgrover Feb 9, 2016
7081b97
Updating the excludes
markgrover Feb 9, 2016
9983e7d
Making NewKafkaCluster class public
markgrover Feb 9, 2016
c422cd5
Merge branch 'master' into kafka09-integration
markgrover Feb 12, 2016
075226e
Merge branch 'master' into kafka09-integration
markgrover Mar 2, 2016
993c6fd
Taking the SNAPSHOT out since 0.9.0.1 has now been released
markgrover Mar 2, 2016
229b773
Getting rid of TODOs since Kafka 0.9.0.1 is now released
markgrover Mar 3, 2016
125 changes: 125 additions & 0 deletions examples/src/main/java/org/apache/spark/examples/streaming/JavaNewDirectKafkaWordCount.java
@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.examples.streaming;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.regex.Pattern;

import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;

import com.google.common.collect.Lists;
import org.apache.kafka.clients.consumer.ConsumerConfig;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.Durations;

/**
* Consumes messages from one or more topics in Kafka and does wordcount.
* Usage: JavaNewDirectKafkaWordCount <brokers> <topics>
* <brokers> is a list of one or more Kafka brokers
* <topics> is a list of one or more kafka topics to consume from
*
* Example:
* $ bin/run-example streaming.JavaNewDirectKafkaWordCount
* broker1-host:port,broker2-host:port topic1,topic2
*/

public final class JavaNewDirectKafkaWordCount {
  private static final Pattern SPACE = Pattern.compile(" ");

  public static void main(String[] args) {
    if (args.length < 2) {
      System.err.println("Usage: JavaNewDirectKafkaWordCount <brokers> <topics>\n" +
          " <brokers> is a list of one or more Kafka brokers\n" +
          " <topics> is a list of one or more kafka topics to consume from\n\n");
      System.exit(1);
    }

    // StreamingExamples.setStreamingLogLevels();

    String brokers = args[0];
    String topics = args[1];

    // Create context with a 2 seconds batch interval
    SparkConf sparkConf = new SparkConf().setAppName("Direct Kafka Java Wordcount (New Consumer " +
        "API)");
    JavaStreamingContext jssc = new JavaStreamingContext(sparkConf, Durations.seconds(2));

    HashSet<String> topicsSet = new HashSet<String>(Arrays.asList(topics.split(",")));
    HashMap<String, String> kafkaParams = new HashMap<String, String>();
    kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
    kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, "testGroup");
    kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG,
        "org.apache.kafka.common.serialization.StringDeserializer");
    kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
        "org.apache.kafka.common.serialization.StringDeserializer");

    // Create direct kafka stream with brokers and topics
    JavaPairInputDStream<String, String> messages = KafkaUtils.createNewDirectStream(
        jssc,
        String.class,
        String.class,
        kafkaParams,
        topicsSet
    );

    // Get the lines, split them into words, count the words and print
    JavaDStream<String> lines = messages.map(new Function<Tuple2<String, String>, String>() {
      @Override
      public String call(Tuple2<String, String> tuple2) {
        return tuple2._2();
      }
    });
    JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
      @Override
      public Iterator<String> call(String x) {
        return Lists.newArrayList(SPACE.split(x)).iterator();
      }
    });
    JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
        new PairFunction<String, String, Integer>() {
          @Override
          public Tuple2<String, Integer> call(String s) {
            return new Tuple2<String, Integer>(s, 1);
          }
        }).reduceByKey(
        new Function2<Integer, Integer, Integer>() {
          @Override
          public Integer call(Integer i1, Integer i2) {
            return i1 + i2;
          }
        });
    wordCounts.print();

    // Start the computation
    jssc.start();
    jssc.awaitTermination();
  }
}
95 changes: 95 additions & 0 deletions examples/src/main/scala/org/apache/spark/examples/streaming/DirectNewKafkaWordCount.scala
@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.kafka.clients.consumer.ConsumerConfig

import org.apache.spark.SparkConf
import org.apache.spark.streaming._
import org.apache.spark.streaming.kafka.KafkaUtils

/**
* Consumes messages from one or more topics in Kafka and does wordcount.
* Usage: DirectNewKafkaWordCount <brokers> <topics>
* <brokers> is a list of one or more Kafka brokers
* <topics> is a list of one or more kafka topics to consume from
* <groupId> is the name of kafka consumer group
* <auto.offset.reset> What to do when there is no initial offset in Kafka or
* if the current offset does not exist any more on the server
* earliest: automatically reset the offset to the earliest offset
* latest: automatically reset the offset to the latest offset
* Example:
* $ bin/run-example streaming.DirectNewKafkaWordCount broker1-host:port,broker2-host:port \
* topic1,topic2 my-consumer-group earliest
*
*/
object DirectNewKafkaWordCount {
  def main(args: Array[String]) {
    if (args.length < 4) {
      System.err.println(s"""
        |Usage: DirectNewKafkaWordCount <brokers> <topics> <groupId> \
        |<auto.offset.reset>
        | <brokers> is a list of one or more Kafka brokers
        | <topics> is a list of one or more kafka topics to consume from
        | <groupId> is the name of kafka consumer group (can be arbitrary)
        | <auto.offset.reset> What to do when there is no initial offset
        | in Kafka or if the current offset does not exist
        | any more on the server
        | earliest: automatically reset the offset
        | to the earliest offset
        | latest: automatically reset the offset
        | to the latest offset
        """.stripMargin)
      System.exit(1)
    }

    // StreamingExamples.setStreamingLogLevels()

    val Array(brokers, topics, groupId, offsetReset) = args

    // Create context with 2 second batch interval
    val sparkConf = new SparkConf().setAppName("Direct Kafka Wordcount (New Consumer API)")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    // Create direct kafka stream with brokers and topics
    val topicsSet = topics.split(",").toSet
    val kafkaParams = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG ->
        "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG ->
        "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> offsetReset,
      ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "false"
    )
    val messages = KafkaUtils.createNewDirectStream[String, String](ssc, kafkaParams, topicsSet)

    // Get the lines, split them into words, count the words and print
    val lines = messages.map(_._2)
    val words = lines.flatMap(_.split(" "))
    val wordCounts = words.map(x => (x, 1L)).reduceByKey(_ + _)
    wordCounts.print()

    // Start the computation
    ssc.start()
    ssc.awaitTermination()
  }
}
// scalastyle:on println
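
The word-count example above never looks at which Kafka offsets each batch covered. As a hedged illustration only (this snippet is not part of the PR's diff), and assuming the new-consumer direct stream keeps exposing HasOffsetRanges on its RDDs the way the existing 0.8 direct stream does, the usual offset-tracking pattern applied to the messages stream from the example would look roughly like this:

// Sketch only: assumes the RDDs produced by the new-API direct stream implement
// HasOffsetRanges, as in the 0.8 integration; `messages` is the stream created above.
import org.apache.spark.streaming.kafka.HasOffsetRanges

messages.foreachRDD { rdd =>
  // Cast the RDD returned by the stream itself, before any shuffle, because only
  // that RDD still carries the Kafka topic/partition/offset metadata.
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  offsetRanges.foreach { o =>
    println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
  }
}

An application that manages its own offsets would persist these ranges (for example, to ZooKeeper or a database) after the batch's output action succeeds, instead of printing them.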
2 changes: 1 addition & 1 deletion external/kafka/pom.xml
@@ -51,7 +51,7 @@
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_${scala.binary.version}</artifactId>
<version>0.8.2.1</version>
<version>${kafka.version}</version>
<exclusions>
<exclusion>
<groupId>com.sun.jmx</groupId>
external/kafka/src/main/scala/org/apache/spark/streaming/kafka/DirectKafkaInputDStream.scala
@@ -28,9 +28,6 @@ import kafka.serializer.Decoder
import org.apache.spark.{Logging, SparkException}
import org.apache.spark.streaming.{StreamingContext, Time}
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.kafka.KafkaCluster.LeaderOffset
import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo}
import org.apache.spark.streaming.scheduler.rate.RateEstimator

/**
* A stream of {@link org.apache.spark.streaming.kafka.KafkaRDD} where
@@ -62,59 +59,19 @@ class DirectKafkaInputDStream[
val kafkaParams: Map[String, String],
val fromOffsets: Map[TopicAndPartition, Long],
messageHandler: MessageAndMetadata[K, V] => R
) extends InputDStream[R](_ssc) with Logging {
val maxRetries = context.sparkContext.getConf.getInt(
"spark.streaming.kafka.maxRetries", 1)
) extends DirectKafkaInputDStreamBase[K, V, U, T, R] (_ssc, kafkaParams, fromOffsets) {

// Keep this consistent with how other streams are named (e.g. "Flume polling stream [2]")
private[streaming] override def name: String = s"Kafka direct stream [$id]"

protected[streaming] override val checkpointData =
new DirectKafkaInputDStreamCheckpointData


/**
* Asynchronously maintains & sends new rate limits to the receiver through the receiver tracker.
*/
override protected[streaming] val rateController: Option[RateController] = {
if (RateController.isBackPressureEnabled(ssc.conf)) {
Some(new DirectKafkaRateController(id,
RateEstimator.create(ssc.conf, context.graph.batchDuration)))
} else {
None
}
}

protected val kc = new KafkaCluster(kafkaParams)

private val maxRateLimitPerPartition: Int = context.sparkContext.getConf.getInt(
"spark.streaming.kafka.maxRatePerPartition", 0)
protected def maxMessagesPerPartition: Option[Long] = {
val estimatedRateLimit = rateController.map(_.getLatestRate().toInt)
val numPartitions = currentOffsets.keys.size

val effectiveRateLimitPerPartition = estimatedRateLimit
.filter(_ > 0)
.map { limit =>
if (maxRateLimitPerPartition > 0) {
Math.min(maxRateLimitPerPartition, (limit / numPartitions))
} else {
limit / numPartitions
}
}.getOrElse(maxRateLimitPerPartition)

if (effectiveRateLimitPerPartition > 0) {
val secsPerBatch = context.graph.batchDuration.milliseconds.toDouble / 1000
Some((secsPerBatch * effectiveRateLimitPerPartition).toLong)
} else {
None
}
}

protected var currentOffsets = fromOffsets

@tailrec
protected final def latestLeaderOffsets(retries: Int): Map[TopicAndPartition, LeaderOffset] = {
protected final override def latestLeaderOffsets(retries: Int): Map[TopicAndPartition,
LeaderOffset] = {
val o = kc.getLatestLeaderOffsets(currentOffsets.keySet)
// Either.fold would confuse @tailrec, do it manually
if (o.isLeft) {
@@ -131,48 +88,10 @@
}
}

// limits the maximum number of messages per partition
protected def clamp(
leaderOffsets: Map[TopicAndPartition, LeaderOffset]): Map[TopicAndPartition, LeaderOffset] = {
maxMessagesPerPartition.map { mmp =>
leaderOffsets.map { case (tp, lo) =>
tp -> lo.copy(offset = Math.min(currentOffsets(tp) + mmp, lo.offset))
}
}.getOrElse(leaderOffsets)
}

override def compute(validTime: Time): Option[KafkaRDD[K, V, U, T, R]] = {
val untilOffsets = clamp(latestLeaderOffsets(maxRetries))
val rdd = KafkaRDD[K, V, U, T, R](
context.sparkContext, kafkaParams, currentOffsets, untilOffsets, messageHandler)

// Report the record number and metadata of this batch interval to InputInfoTracker.
val offsetRanges = currentOffsets.map { case (tp, fo) =>
val uo = untilOffsets(tp)
OffsetRange(tp.topic, tp.partition, fo, uo.offset)
}
val description = offsetRanges.filter { offsetRange =>
// Don't display empty ranges.
offsetRange.fromOffset != offsetRange.untilOffset
}.map { offsetRange =>
s"topic: ${offsetRange.topic}\tpartition: ${offsetRange.partition}\t" +
s"offsets: ${offsetRange.fromOffset} to ${offsetRange.untilOffset}"
}.mkString("\n")
// Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map(
"offsets" -> offsetRanges.toList,
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
val inputInfo = StreamInputInfo(id, rdd.count, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)

currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)
Some(rdd)
}

override def start(): Unit = {
}

def stop(): Unit = {
override def getRdd(untilOffsets: Map[TopicAndPartition, LeaderOffset]): KafkaRDD[K, V, U, T,
R] = {
KafkaRDD[K, V, U, T, R](context.sparkContext, kafkaParams, currentOffsets, untilOffsets,
messageHandler)
}

private[streaming]
@@ -204,11 +123,4 @@
}
}

/**
* A RateController to retrieve the rate from RateEstimator.
*/
private[streaming] class DirectKafkaRateController(id: Int, estimator: RateEstimator)
extends RateController(id, estimator) {
override def publish(rate: Long): Unit = ()
}
}
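
Read as a whole, the DirectKafkaInputDStream diff removes the rate controller, the per-partition clamping, and the compute method from the 0.8-based stream and moves that shared logic into a parent class (the PR's DirectKafkaInputDStreamBase), so that each concrete stream only has to provide latestLeaderOffsets and getRdd. The following is a minimal, self-contained sketch of that template-method split; every name and type in it is a simplified stand-in, not the PR's actual base class:

// Simplified, hypothetical sketch of the template-method refactor implied by the diff;
// these names and types are stand-ins, not the PR's DirectKafkaInputDStreamBase.
object DirectStreamRefactorSketch {
  final case class TopicPartition(topic: String, partition: Int)
  final case class LeaderOffset(offset: Long)

  abstract class DirectStreamBase(maxRetries: Int) {
    protected var currentOffsets: Map[TopicPartition, Long] = Map.empty

    // Each concrete stream (old simple consumer vs. new 0.9 consumer) supplies these two:
    protected def latestLeaderOffsets(retries: Int): Map[TopicPartition, LeaderOffset]
    protected def getRdd(untilOffsets: Map[TopicPartition, LeaderOffset]): Seq[String]

    // Shared per-batch logic: cap how far each partition may advance, build the batch,
    // then record the new positions. Rate estimation is omitted for brevity.
    private def clamp(maxMessages: Long,
        leaderOffsets: Map[TopicPartition, LeaderOffset]): Map[TopicPartition, LeaderOffset] =
      leaderOffsets.map { case (tp, lo) =>
        tp -> LeaderOffset(math.min(currentOffsets.getOrElse(tp, 0L) + maxMessages, lo.offset))
      }

    final def compute(maxMessages: Long): Seq[String] = {
      val untilOffsets = clamp(maxMessages, latestLeaderOffsets(maxRetries))
      val batch = getRdd(untilOffsets)
      currentOffsets = untilOffsets.map { case (tp, lo) => tp -> lo.offset }
      batch
    }
  }
}

In the actual diff, getRdd builds a KafkaRDD and compute additionally reports StreamInputInfo to the input-info tracker; the sketch keeps only the control flow that moved into the base class.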