apache · asfgit · Nov 19, 2016 · Sep 11, 2016 · srdo · Sep 19, 2016
diff --git a/external/storm-kafka-client/pom.xml b/external/storm-kafka-client/pom.xml
@@ -53,6 +53,20 @@
             <groupId>org.apache.kafka</groupId>
             <artifactId>kafka-clients</artifactId>
             <version>${storm.kafka.client.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.zookeeper</groupId>
+                    <artifactId>zookeeper</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>log4j</groupId>
+                    <artifactId>log4j</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
         <!--test dependencies -->
         <dependency>
@@ -66,6 +80,18 @@
             <version>4.11</version>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>info.batey.kafka</groupId>
+            <artifactId>kafka-unit</artifactId>
+            <version>0.6</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>log4j-over-slf4j</artifactId>
+            <version>${log4j-over-slf4j.version}</version>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
 
     <build>

diff --git a/external/storm-kafka-client/src/main/java/org/apache/storm/kafka/spout/KafkaSpout.java b/external/storm-kafka-client/src/main/java/org/apache/storm/kafka/spout/KafkaSpout.java
@@ -77,7 +77,7 @@ public class KafkaSpout<K, V> extends BaseRichSpout {
     private KafkaSpoutStreams kafkaSpoutStreams;                        // Object that wraps all the logic to declare output fields and emit tuples
     private transient KafkaSpoutTuplesBuilder<K, V> tuplesBuilder;      // Object that contains the logic to build tuples for each ConsumerRecord
 
-    private transient Map<TopicPartition, OffsetEntry> acked;           // Tuples that were successfully acked. These tuples will be committed periodically when the commit timer expires, after consumer rebalance, or on close/deactivate
+    transient Map<TopicPartition, OffsetEntry> acked;           // Tuples that were successfully acked. These tuples will be committed periodically when the commit timer expires, after consumer rebalance, or on close/deactivate
     private transient Set<KafkaSpoutMessageId> emitted;                 // Tuples that have been emitted but that are "on the wire", i.e. pending being acked or failed
     private transient Iterator<ConsumerRecord<K, V>> waitingToEmit;         // Records that have been polled and are queued to be emitted in the nextTuple() call. One record is emitted per nextTuple()
     private transient long numUncommittedOffsets;                       // Number of offsets that have been polled and emitted but not yet been committed
@@ -266,19 +266,22 @@ private void doSeekRetriableTopicPartitions() {
             if (offsetAndMeta != null) {
                 kafkaConsumer.seek(rtp, offsetAndMeta.offset() + 1);  // seek to the next offset that is ready to commit in next commit cycle
             } else {
-                kafkaConsumer.seekToEnd(toArrayList(rtp));    // Seek to last committed offset
+                kafkaConsumer.seek(rtp, acked.get(rtp).committedOffset + 1);    // Seek to last committed offset
             }
         }
     }
 
     // ======== emit  =========
     private void emit() {
-        emitTupleIfNotEmitted(waitingToEmit.next());
-        waitingToEmit.remove();
+        while(!emitTupleIfNotEmitted(waitingToEmit.next()) && waitingToEmit.hasNext()) {
+            waitingToEmit.remove();
+        }
     }
 
-    // emits one tuple per record
-    private void emitTupleIfNotEmitted(ConsumerRecord<K, V> record) {
+
+    //Emits one tuple per record
+    //@return true if tuple was emitted
+    private boolean emitTupleIfNotEmitted(ConsumerRecord<K, V> record) {
         final TopicPartition tp = new TopicPartition(record.topic(), record.partition());
         final KafkaSpoutMessageId msgId = new KafkaSpoutMessageId(record);
 
@@ -295,7 +298,9 @@ private void emitTupleIfNotEmitted(ConsumerRecord<K, V> record) {
                 retryService.remove(msgId);  // re-emitted hence remove from failed
             }
             LOG.trace("Emitted tuple [{}] for record [{}]", tuple, record);
+            return true;
         }
+        return false;
     }
 
     private void commitOffsetsForAckedTuples() {
@@ -451,7 +456,7 @@ public int compare(KafkaSpoutMessageId m1, KafkaSpoutMessageId m2) {
     /**
      * This class is not thread safe
      */
-    private class OffsetEntry {
+    class OffsetEntry {
         private final TopicPartition tp;
         private final long initialFetchOffset;  /* First offset to be fetched. It is either set to the beginning, end, or to the first uncommitted offset.
                                                  * Initial value depends on offset strategy. See KafkaSpoutConsumerRebalanceListener */
@@ -479,16 +484,17 @@ public OffsetAndMetadata findNextCommitOffset() {
             KafkaSpoutMessageId nextCommitMsg = null;     // this is a convenience variable to make it faster to create OffsetAndMetadata
 
             for (KafkaSpoutMessageId currAckedMsg : ackedMsgs) {  // complexity is that of a linear scan on a TreeMap
-                if ((currOffset = currAckedMsg.offset()) == initialFetchOffset || currOffset == nextCommitOffset + 1) {            // found the next offset to commit
+                if ((currOffset = currAckedMsg.offset()) == nextCommitOffset + 1) {            // found the next offset to commit
                     found = true;
                     nextCommitMsg = currAckedMsg;
                     nextCommitOffset = currOffset;
                 } else if (currAckedMsg.offset() > nextCommitOffset + 1) {    // offset found is not continuous to the offsets listed to go in the next commit, so stop search
                     LOG.debug("topic-partition [{}] has non-continuous offset [{}]. It will be processed in a subsequent batch.", tp, currOffset);
                     break;
                 } else {
-                    LOG.debug("topic-partition [{}] has unexpected offset [{}].", tp, currOffset);
-                    break;
+                    //Received a redundant ack. Ignore and continue processing.
+                    LOG.warn("topic-partition [{}] has unexpected offset [{}]. Current committed Offset [{}]",
+                            tp, currOffset,  committedOffset);
                 }
             }
 
@@ -532,6 +538,10 @@ public void commit(OffsetAndMetadata committedOffset) {
             LOG.trace("{}", this);
         }
 
+        long getCommittedOffset() {
+            return committedOffset;
+        }
+
         public boolean isEmpty() {
             return ackedMsgs.isEmpty();
         }

diff --git a/...rm-kafka-client/src/test/java/org/apache/storm/kafka/spout/SingleTopicKafkaSpoutTest.java b/...rm-kafka-client/src/test/java/org/apache/storm/kafka/spout/SingleTopicKafkaSpoutTest.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ *   or more contributor license agreements.  See the NOTICE file
+ *   distributed with this work for additional information
+ *   regarding copyright ownership.  The ASF licenses this file
+ *   to you under the Apache License, Version 2.0 (the
+ *   "License"); you may not use this file except in compliance
+ *   with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *   Unless required by applicable law or agreed to in writing, software
+ *   distributed under the License is distributed on an "AS IS" BASIS,
+ *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *   See the License for the specific language governing permissions and
+ *   limitations under the License.
+ */
+package org.apache.storm.kafka.spout;
+
+import info.batey.kafka.unit.KafkaUnitRule;
+import kafka.producer.KeyedMessage;
+import org.apache.kafka.clients.consumer.OffsetAndMetadata;
+import org.apache.storm.kafka.spout.builders.SingleTopicKafkaSpoutConfiguration;
+import org.apache.storm.spout.SpoutOutputCollector;
+import org.apache.storm.task.TopologyContext;
+import org.apache.storm.tuple.Values;
+import org.junit.Rule;
+import org.junit.Test;
+import org.mockito.ArgumentCaptor;
+
+import static org.junit.Assert.*;
+
+import java.util.Map;
+import java.util.stream.IntStream;
+import static org.mockito.Mockito.*;
+import static org.apache.storm.kafka.spout.builders.SingleTopicKafkaSpoutConfiguration.*;
+
+public class SingleTopicKafkaSpoutTest {
+
+    private class SpoutContext {
+        public KafkaSpout<String, String> spout;
+        public SpoutOutputCollector collector;
+
+        public SpoutContext(KafkaSpout<String, String> spout,
+                            SpoutOutputCollector collector) {
+            this.spout = spout;
+            this.collector = collector;
+        }
+    }
+
+    @Rule
+    public KafkaUnitRule kafkaUnitRule = new KafkaUnitRule();
+
+    void populateTopicData(String topicName, int msgCount) {
+        kafkaUnitRule.getKafkaUnit().createTopic(topicName);
+
+        IntStream.range(0, msgCount).forEach(value -> {
+            KeyedMessage<String, String> keyedMessage = new KeyedMessage<>(
+                    topicName, Integer.toString(value),
+                    Integer.toString(value));
+
+            kafkaUnitRule.getKafkaUnit().sendMessages(keyedMessage);
+        });
+    }
+
+    SpoutContext initializeSpout(int msgCount) {
+        populateTopicData(SingleTopicKafkaSpoutConfiguration.TOPIC, msgCount);
+        int kafkaPort = kafkaUnitRule.getKafkaPort();
+
+        TopologyContext topology = mock(TopologyContext.class);
+        SpoutOutputCollector collector = mock(SpoutOutputCollector.class);
+        Map conf = mock(Map.class);
+
+        KafkaSpout<String, String> spout = new KafkaSpout<>(getKafkaSpoutConfig(getKafkaSpoutStreams(), kafkaPort));
+        spout.open(conf, topology, collector);
+        spout.activate();
+        return new SpoutContext(spout, collector);
+    }
+    /*
+     * Asserts that the next possible offset to commit or the committed offset is the provided offset.
+     * An offset that is ready to be committed is not guaranteed to be already committed.
+     */
+    private void assertOffsetCommitted(int offset, KafkaSpout.OffsetEntry entry) {
+
+        boolean currentOffsetMatch = entry.getCommittedOffset() == offset;
+        OffsetAndMetadata nextOffset = entry.findNextCommitOffset();
+        boolean nextOffsetMatch =  nextOffset != null && nextOffset.offset() == offset;
+        assertTrue("Next offset: " +
+                        entry.findNextCommitOffset() +
+                        " OR current offset: " +
+                        entry.getCommittedOffset() +
+                        " must equal desired offset: " +
+                        offset,
+                currentOffsetMatch | nextOffsetMatch);
+    }
+
+    @Test
+    public void shouldContinueWithSlowDoubleAcks() throws Exception {
+        int messageCount = 20;
+        SpoutContext context = initializeSpout(messageCount);
+
+        //play 1st tuple
+        ArgumentCaptor<Object> messageIdToDoubleAck = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdToDoubleAck.capture());
+        context.spout.ack(messageIdToDoubleAck.getValue());
+
+        IntStream.range(0, messageCount/2).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        context.spout.ack(messageIdToDoubleAck.getValue());
+
+        IntStream.range(0, messageCount).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        ArgumentCaptor<Object> remainingIds = ArgumentCaptor.forClass(Object.class);
+
+        verify(context.collector, times(messageCount)).emit(
+                eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                anyObject(),
+                remainingIds.capture());
+        remainingIds.getAllValues().iterator().forEachRemaining(context.spout::ack);
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+
+    @Test
+    public void shouldEmitAllMessages() throws Exception {
+        int messageCount = 10;
+        SpoutContext context = initializeSpout(messageCount);
+
+
+        IntStream.range(0, messageCount).forEach(value -> {
+            context.spout.nextTuple();
+            ArgumentCaptor<Object> messageId = ArgumentCaptor.forClass(Object.class);
+            verify(context.collector).emit(
+                    eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                    eq(new Values(SingleTopicKafkaSpoutConfiguration.TOPIC,
+                            Integer.toString(value),
+                            Integer.toString(value))),
+            messageId.capture());
+            context.spout.ack(messageId.getValue());
+            reset(context.collector);
+        });
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+
+    @Test
+    public void shouldReplayInOrderFailedMessages() throws Exception {
+        int messageCount = 10;
+        SpoutContext context = initializeSpout(messageCount);
+
+        //play and ack 1 tuple
+        ArgumentCaptor<Object> messageIdAcked = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdAcked.capture());
+        context.spout.ack(messageIdAcked.getValue());
+        reset(context.collector);
+
+        //play and fail 1 tuple
+        ArgumentCaptor<Object> messageIdFailed = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdFailed.capture());
+        context.spout.fail(messageIdFailed.getValue());
+        reset(context.collector);
+
+        //pause so that failed tuples will be retried
+        Thread.sleep(200);
+
+
+        //allow for some calls to nextTuple() to fail to emit a tuple
+        IntStream.range(0, messageCount + 5).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        ArgumentCaptor<Object> remainingMessageIds = ArgumentCaptor.forClass(Object.class);
+
+        //1 message replayed, messageCount - 2 messages emitted for the first time
+        verify(context.collector, times(messageCount - 1)).emit(
+                eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                anyObject(),
+                remainingMessageIds.capture());
+        remainingMessageIds.getAllValues().iterator().forEachRemaining(context.spout::ack);
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+
+    @Test
+    public void shouldReplayFirstTupleFailedOutOfOrder() throws Exception {
+        int messageCount = 10;
+        SpoutContext context = initializeSpout(messageCount);
+
+
+        //play 1st tuple
+        ArgumentCaptor<Object> messageIdToFail = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdToFail.capture());
+        reset(context.collector);
+
+        //play 2nd tuple
+        ArgumentCaptor<Object> messageIdToAck = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdToAck.capture());
+        reset(context.collector);
+
+        //ack 2nd tuple
+        context.spout.ack(messageIdToAck.getValue());
+        //fail 1st tuple
+        context.spout.fail(messageIdToFail.getValue());
+
+        //pause so that failed tuples will be retried
+        Thread.sleep(200);
+
+        //allow for some calls to nextTuple() to fail to emit a tuple
+        IntStream.range(0, messageCount + 5).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        ArgumentCaptor<Object> remainingIds = ArgumentCaptor.forClass(Object.class);
+        //1 message replayed, messageCount - 2 messages emitted for the first time
+        verify(context.collector, times(messageCount - 1)).emit(
+                eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                anyObject(),
+                remainingIds.capture());
+        remainingIds.getAllValues().iterator().forEachRemaining(context.spout::ack);
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+}