diff --git a/external/storm-kafka-client/pom.xml b/external/storm-kafka-client/pom.xml
index f7a387cb6e0..2d9a8445ab9 100644
--- a/external/storm-kafka-client/pom.xml
+++ b/external/storm-kafka-client/pom.xml
@@ -53,6 +53,20 @@
             <groupId>org.apache.kafka</groupId>
             <artifactId>kafka-clients</artifactId>
             <version>${storm.kafka.client.version}</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.zookeeper</groupId>
+                    <artifactId>zookeeper</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.slf4j</groupId>
+                    <artifactId>slf4j-log4j12</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>log4j</groupId>
+                    <artifactId>log4j</artifactId>
+                </exclusion>
+            </exclusions>
         </dependency>
         <dependency>
             <groupId>junit</groupId>
@@ -66,6 +80,18 @@
             <version>4.11</version>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>info.batey.kafka</groupId>
+            <artifactId>kafka-unit</artifactId>
+            <version>0.6</version>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>log4j-over-slf4j</artifactId>
+            <version>${log4j-over-slf4j.version}</version>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
diff --git a/external/storm-kafka-client/src/main/java/org/apache/storm/kafka/spout/KafkaSpout.java b/external/storm-kafka-client/src/main/java/org/apache/storm/kafka/spout/KafkaSpout.java
index 4389acb50e5..4f0780afa01 100644
--- a/external/storm-kafka-client/src/main/java/org/apache/storm/kafka/spout/KafkaSpout.java
+++ b/external/storm-kafka-client/src/main/java/org/apache/storm/kafka/spout/KafkaSpout.java
@@ -77,7 +77,7 @@ public class KafkaSpout<K, V> extends BaseRichSpout {
     private KafkaSpoutStreams kafkaSpoutStreams;                    // Object that wraps all the logic to declare output fields and emit tuples
     private transient KafkaSpoutTuplesBuilder<K, V> tuplesBuilder;  // Object that contains the logic to build tuples for each ConsumerRecord
-    private transient Map<TopicPartition, OffsetEntry> acked;       // Tuples that were successfully acked. These tuples will be committed periodically when the commit timer expires, after consumer rebalance, or on close/deactivate
+    transient Map<TopicPartition, OffsetEntry> acked;               // Tuples that were successfully acked. These tuples will be committed periodically when the commit timer expires, after consumer rebalance, or on close/deactivate
     private transient Set<KafkaSpoutMessageId> emitted;             // Tuples that have been emitted but that are "on the wire", i.e. pending being acked or failed
     private transient Iterator<ConsumerRecord<K, V>> waitingToEmit; // Records that have been polled and are queued to be emitted in the nextTuple() call. One record is emitted per nextTuple()
     private transient long numUncommittedOffsets;                   // Number of offsets that have been polled and emitted but not yet been committed
@@ -266,19 +266,22 @@ private void doSeekRetriableTopicPartitions() {
             final OffsetAndMetadata offsetAndMeta = acked.get(rtp).findNextCommitOffset();
             if (offsetAndMeta != null) {
                 kafkaConsumer.seek(rtp, offsetAndMeta.offset() + 1);    // seek to the next offset that is ready to commit in next commit cycle
             } else {
-                kafkaConsumer.seekToEnd(toArrayList(rtp));    // Seek to last committed offset
+                kafkaConsumer.seek(rtp, acked.get(rtp).committedOffset + 1);    // Seek to last committed offset
             }
         }
     }

     // ======== emit =========
     private void emit() {
-        emitTupleIfNotEmitted(waitingToEmit.next());
-        waitingToEmit.remove();
+        while(!emitTupleIfNotEmitted(waitingToEmit.next()) && waitingToEmit.hasNext()) {
+            waitingToEmit.remove();
+        }
     }

-    // emits one tuple per record
-    private void emitTupleIfNotEmitted(ConsumerRecord<K, V> record) {
+
+    //Emits one tuple per record
+    //@return true if tuple was emitted
+    private boolean emitTupleIfNotEmitted(ConsumerRecord<K, V> record) {
         final TopicPartition tp = new TopicPartition(record.topic(), record.partition());
         final KafkaSpoutMessageId msgId = new KafkaSpoutMessageId(record);
@@ -295,7 +298,9 @@ private void emitTupleIfNotEmitted(ConsumerRecord<K, V> record) {
                 retryService.remove(msgId);  // re-emitted hence remove from failed
             }
             LOG.trace("Emitted tuple [{}] for record [{}]", tuple, record);
+            return true;
         }
+        return false;
     }

     private void commitOffsetsForAckedTuples() {
@@ -451,7 +456,7 @@ public int compare(KafkaSpoutMessageId m1, KafkaSpoutMessageId m2) {
     /**
      * This class is not thread safe
      */
-    private class OffsetEntry {
+    class OffsetEntry {
         private final TopicPartition tp;
         private final long initialFetchOffset;  /* First offset to be fetched. It is either set to the beginning, end, or to the first uncommitted offset.
                                                  * Initial value depends on offset strategy. See KafkaSpoutConsumerRebalanceListener */
@@ -479,7 +484,7 @@ public OffsetAndMetadata findNextCommitOffset() {
             KafkaSpoutMessageId nextCommitMsg = null;   // this is a convenience variable to make it faster to create OffsetAndMetadata

             for (KafkaSpoutMessageId currAckedMsg : ackedMsgs) {  // complexity is that of a linear scan on a TreeMap
-                if ((currOffset = currAckedMsg.offset()) == initialFetchOffset || currOffset == nextCommitOffset + 1) {   // found the next offset to commit
+                if ((currOffset = currAckedMsg.offset()) == nextCommitOffset + 1) {   // found the next offset to commit
                     found = true;
                     nextCommitMsg = currAckedMsg;
                     nextCommitOffset = currOffset;
@@ -487,8 +492,9 @@ public OffsetAndMetadata findNextCommitOffset() {
                     LOG.debug("topic-partition [{}] has non-continuous offset [{}]. It will be processed in a subsequent batch.", tp, currOffset);
                     break;
                 } else {
-                    LOG.debug("topic-partition [{}] has unexpected offset [{}].", tp, currOffset);
-                    break;
+                    //Received a redundant ack. Ignore and continue processing.
+                    LOG.warn("topic-partition [{}] has unexpected offset [{}]. Current committed offset [{}]",
+                            tp, currOffset, committedOffset);
                 }
             }

@@ -532,6 +538,10 @@ public void commit(OffsetAndMetadata committedOffset) {
             LOG.trace("{}", this);
         }

+        long getCommittedOffset() {
+            return committedOffset;
+        }
+
         public boolean isEmpty() {
             return ackedMsgs.isEmpty();
         }
diff --git a/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/SingleTopicKafkaSpoutTest.java b/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/SingleTopicKafkaSpoutTest.java
new file mode 100644
index 00000000000..8fa7b80c05d
--- /dev/null
+++ b/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/SingleTopicKafkaSpoutTest.java
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.storm.kafka.spout;
+
+import info.batey.kafka.unit.KafkaUnitRule;
+import kafka.producer.KeyedMessage;
+import org.apache.kafka.clients.consumer.OffsetAndMetadata;
+import org.apache.storm.kafka.spout.builders.SingleTopicKafkaSpoutConfiguration;
+import org.apache.storm.spout.SpoutOutputCollector;
+import org.apache.storm.task.TopologyContext;
+import org.apache.storm.tuple.Values;
+import org.junit.Rule;
+import org.junit.Test;
+import org.mockito.ArgumentCaptor;
+
+import static org.junit.Assert.*;
+
+import java.util.Map;
+import java.util.stream.IntStream;
+
+import static org.mockito.Mockito.*;
+import static org.apache.storm.kafka.spout.builders.SingleTopicKafkaSpoutConfiguration.*;
+
+public class SingleTopicKafkaSpoutTest {
+
+    private class SpoutContext {
+        public KafkaSpout<String, String> spout;
+        public SpoutOutputCollector collector;
+
+        public SpoutContext(KafkaSpout<String, String> spout,
+                            SpoutOutputCollector collector) {
+            this.spout = spout;
+            this.collector = collector;
+        }
+    }
+
+    @Rule
+    public KafkaUnitRule kafkaUnitRule = new KafkaUnitRule();
+
+    void populateTopicData(String topicName, int msgCount) {
+        kafkaUnitRule.getKafkaUnit().createTopic(topicName);
+
+        IntStream.range(0, msgCount).forEach(value -> {
+            KeyedMessage<String, String> keyedMessage = new KeyedMessage<>(
+                    topicName, Integer.toString(value),
+                    Integer.toString(value));
+
+            kafkaUnitRule.getKafkaUnit().sendMessages(keyedMessage);
+        });
+    }
+
+    SpoutContext initializeSpout(int msgCount) {
+        populateTopicData(SingleTopicKafkaSpoutConfiguration.TOPIC, msgCount);
+        int kafkaPort = kafkaUnitRule.getKafkaPort();
+
+        TopologyContext topology = mock(TopologyContext.class);
+        SpoutOutputCollector collector = mock(SpoutOutputCollector.class);
+        Map conf = mock(Map.class);
+
+        KafkaSpout<String, String> spout = new KafkaSpout<>(getKafkaSpoutConfig(getKafkaSpoutStreams(), kafkaPort));
+        spout.open(conf, topology, collector);
+        spout.activate();
+        return new SpoutContext(spout, collector);
+    }
+
+    /*
+     * Asserts that the next possible offset to commit or the committed offset is the provided offset.
+     * An offset that is ready to be committed is not guaranteed to be already committed.
+     */
+    private void assertOffsetCommitted(int offset, KafkaSpout.OffsetEntry entry) {
+
+        boolean currentOffsetMatch = entry.getCommittedOffset() == offset;
+        OffsetAndMetadata nextOffset = entry.findNextCommitOffset();
+        boolean nextOffsetMatch = nextOffset != null && nextOffset.offset() == offset;
+        assertTrue("Next offset: " +
+                        entry.findNextCommitOffset() +
+                        " OR current offset: " +
+                        entry.getCommittedOffset() +
+                        " must equal desired offset: " +
+                        offset,
+                currentOffsetMatch || nextOffsetMatch);
+    }
+
+    @Test
+    public void shouldContinueWithSlowDoubleAcks() throws Exception {
+        int messageCount = 20;
+        SpoutContext context = initializeSpout(messageCount);
+
+        //play 1st tuple
+        ArgumentCaptor<Object> messageIdToDoubleAck = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdToDoubleAck.capture());
+        context.spout.ack(messageIdToDoubleAck.getValue());
+
+        IntStream.range(0, messageCount / 2).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        context.spout.ack(messageIdToDoubleAck.getValue());
+
+        IntStream.range(0, messageCount).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        ArgumentCaptor<Object> remainingIds = ArgumentCaptor.forClass(Object.class);
+
+        verify(context.collector, times(messageCount)).emit(
+                eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                anyObject(),
+                remainingIds.capture());
+        remainingIds.getAllValues().iterator().forEachRemaining(context.spout::ack);
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+
+    @Test
+    public void shouldEmitAllMessages() throws Exception {
+        int messageCount = 10;
+        SpoutContext context = initializeSpout(messageCount);
+
+        IntStream.range(0, messageCount).forEach(value -> {
+            context.spout.nextTuple();
+            ArgumentCaptor<Object> messageId = ArgumentCaptor.forClass(Object.class);
+            verify(context.collector).emit(
+                    eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                    eq(new Values(SingleTopicKafkaSpoutConfiguration.TOPIC,
+                            Integer.toString(value),
+                            Integer.toString(value))),
+                    messageId.capture());
+            context.spout.ack(messageId.getValue());
+            reset(context.collector);
+        });
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+
+    @Test
+    public void shouldReplayInOrderFailedMessages() throws Exception {
+        int messageCount = 10;
+        SpoutContext context = initializeSpout(messageCount);
+
+        //play and ack 1 tuple
+        ArgumentCaptor<Object> messageIdAcked = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdAcked.capture());
+        context.spout.ack(messageIdAcked.getValue());
+        reset(context.collector);
+
+        //play and fail 1 tuple
+        ArgumentCaptor<Object> messageIdFailed = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdFailed.capture());
+        context.spout.fail(messageIdFailed.getValue());
+        reset(context.collector);
+
+        //pause so that failed tuples will be retried
+        Thread.sleep(200);
+
+        //allow for some calls to nextTuple() to fail to emit a tuple
+        IntStream.range(0, messageCount + 5).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        ArgumentCaptor<Object> remainingMessageIds = ArgumentCaptor.forClass(Object.class);
+
+        //1 message replayed, messageCount - 2 messages emitted for the first time
+        verify(context.collector, times(messageCount - 1)).emit(
+                eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                anyObject(),
+                remainingMessageIds.capture());
+        remainingMessageIds.getAllValues().iterator().forEachRemaining(context.spout::ack);
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+
+    @Test
+    public void shouldReplayFirstTupleFailedOutOfOrder() throws Exception {
+        int messageCount = 10;
+        SpoutContext context = initializeSpout(messageCount);
+
+        //play 1st tuple
+        ArgumentCaptor<Object> messageIdToFail = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdToFail.capture());
+        reset(context.collector);
+
+        //play 2nd tuple
+        ArgumentCaptor<Object> messageIdToAck = ArgumentCaptor.forClass(Object.class);
+        context.spout.nextTuple();
+        verify(context.collector).emit(anyObject(), anyObject(), messageIdToAck.capture());
+        reset(context.collector);
+
+        //ack 2nd tuple
+        context.spout.ack(messageIdToAck.getValue());
+        //fail 1st tuple
+        context.spout.fail(messageIdToFail.getValue());
+
+        //pause so that failed tuples will be retried
+        Thread.sleep(200);
+
+        //allow for some calls to nextTuple() to fail to emit a tuple
+        IntStream.range(0, messageCount + 5).forEach(value -> {
+            context.spout.nextTuple();
+        });
+
+        ArgumentCaptor<Object> remainingIds = ArgumentCaptor.forClass(Object.class);
+        //1 message replayed, messageCount - 2 messages emitted for the first time
+        verify(context.collector, times(messageCount - 1)).emit(
+                eq(SingleTopicKafkaSpoutConfiguration.STREAM),
+                anyObject(),
+                remainingIds.capture());
+        remainingIds.getAllValues().iterator().forEachRemaining(context.spout::ack);
+
+        context.spout.acked.values().forEach(item -> {
+            assertOffsetCommitted(messageCount - 1, (KafkaSpout.OffsetEntry) item);
+        });
+    }
+}
\ No newline at end of file
diff --git a/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/builders/SingleTopicKafkaSpoutConfiguration.java b/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/builders/SingleTopicKafkaSpoutConfiguration.java
new file mode 100644
index 00000000000..baece9331b5
--- /dev/null
+++ b/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/builders/SingleTopicKafkaSpoutConfiguration.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.storm.kafka.spout.builders;
+
+import org.apache.storm.Config;
+import org.apache.storm.generated.StormTopology;
+import org.apache.storm.kafka.spout.*;
+import org.apache.storm.kafka.spout.test.KafkaSpoutTestBolt;
+import org.apache.storm.topology.TopologyBuilder;
+import org.apache.storm.tuple.Fields;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.apache.storm.kafka.spout.KafkaSpoutConfig.FirstPollOffsetStrategy.EARLIEST;
+
+public class SingleTopicKafkaSpoutConfiguration {
+    public static final String STREAM = "test_stream";
+    public static final String TOPIC = "test";
+
+    public static Config getConfig() {
+        Config config = new Config();
+        config.setDebug(true);
+        return config;
+    }
+
+    public static StormTopology getTopologyKafkaSpout(int port) {
+        final TopologyBuilder tp = new TopologyBuilder();
+        tp.setSpout("kafka_spout", new KafkaSpout<>(getKafkaSpoutConfig(getKafkaSpoutStreams(), port)), 1);
+        tp.setBolt("kafka_bolt", new KafkaSpoutTestBolt()).shuffleGrouping("kafka_spout", STREAM);
+        return tp.createTopology();
+    }
+
+    public static KafkaSpoutConfig<String, String> getKafkaSpoutConfig(KafkaSpoutStreams kafkaSpoutStreams, int port) {
+        return new KafkaSpoutConfig.Builder<String, String>(getKafkaConsumerProps(port), kafkaSpoutStreams, getTuplesBuilder(), getRetryService())
+                .setOffsetCommitPeriodMs(10_000)
+                .setFirstPollOffsetStrategy(EARLIEST)
+                .setMaxUncommittedOffsets(250)
+                .setPollTimeoutMs(1000)
+                .build();
+    }
+
+    protected static KafkaSpoutRetryService getRetryService() {
+        return new KafkaSpoutRetryExponentialBackoff(KafkaSpoutRetryExponentialBackoff.TimeInterval.microSeconds(0),
+                KafkaSpoutRetryExponentialBackoff.TimeInterval.milliSeconds(0), Integer.MAX_VALUE, KafkaSpoutRetryExponentialBackoff.TimeInterval.milliSeconds(0));
+    }
+
+    protected static Map<String, Object> getKafkaConsumerProps(int port) {
+        Map<String, Object> props = new HashMap<>();
+        props.put(KafkaSpoutConfig.Consumer.BOOTSTRAP_SERVERS, "127.0.0.1:" + port);
+        props.put(KafkaSpoutConfig.Consumer.GROUP_ID, "kafkaSpoutTestGroup");
+        props.put(KafkaSpoutConfig.Consumer.KEY_DESERIALIZER, "org.apache.kafka.common.serialization.StringDeserializer");
+        props.put(KafkaSpoutConfig.Consumer.VALUE_DESERIALIZER, "org.apache.kafka.common.serialization.StringDeserializer");
+        props.put("max.poll.records", "5");
+        return props;
+    }
+
+    protected static KafkaSpoutTuplesBuilder<String, String> getTuplesBuilder() {
+        return new KafkaSpoutTuplesBuilderNamedTopics.Builder<>(
+                new TopicKeyValueTupleBuilder<String, String>(TOPIC))
+                .build();
+    }
+
+    public static KafkaSpoutStreams getKafkaSpoutStreams() {
+        final Fields outputFields = new Fields("topic", "key", "value");
+        return new KafkaSpoutStreamsNamedTopics.Builder(outputFields, STREAM, new String[]{TOPIC})  // contents of topic "test" sent to stream "test_stream"
+                .build();
+    }
+}
diff --git a/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/builders/TopicKeyValueTupleBuilder.java b/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/builders/TopicKeyValueTupleBuilder.java
new file mode 100644
index 00000000000..4f20b58a2dc
--- /dev/null
+++ b/external/storm-kafka-client/src/test/java/org/apache/storm/kafka/spout/builders/TopicKeyValueTupleBuilder.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.storm.kafka.spout.builders;
+
+import org.apache.kafka.clients.consumer.ConsumerRecord;
+import org.apache.storm.kafka.spout.KafkaSpoutTupleBuilder;
+import org.apache.storm.tuple.Values;
+
+import java.util.List;
+
+public class TopicKeyValueTupleBuilder<K, V> extends KafkaSpoutTupleBuilder<K, V> {
+    /**
+     * @param topics list of topics that use this implementation to build tuples
+     */
+    public TopicKeyValueTupleBuilder(String... topics) {
+        super(topics);
+    }
+
+    @Override
+    public List<Object> buildTuple(ConsumerRecord<K, V> consumerRecord) {
+        return new Values(consumerRecord.topic(),
+                consumerRecord.key(),
+                consumerRecord.value());
+    }
+}
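
Taken together, the findNextCommitOffset and seek changes in this patch enforce one invariant: a partition's committed offset only advances along the contiguous run of acked offsets starting at committedOffset + 1. A redundant ack at or below the committed offset (the "slow double ack" case) is logged and skipped rather than aborting the scan, and a gap (a failed tuple awaiting retry) stops the scan so nothing past it is committed. Below is a minimal, self-contained sketch of that scan; the class and method names are illustrative only and do not appear in the patch.

// Illustrative sketch, not code from the patch: commit the longest contiguous
// run of acked offsets above the committed offset for a single partition.
import java.util.TreeSet;

class CommitPrefixSketch {
    // Returns the highest offset safe to commit, or committedOffset if none is ready.
    static long findNextCommitOffset(TreeSet<Long> ackedOffsets, long committedOffset) {
        long nextCommitOffset = committedOffset;
        for (long offset : ackedOffsets) {               // ascending scan, like the spout's TreeSet of acked msgs
            if (offset == nextCommitOffset + 1) {        // contiguous: extend the committable prefix
                nextCommitOffset = offset;
            } else if (offset > nextCommitOffset + 1) {  // gap: a failed tuple is still pending retry
                break;
            }
            // offset <= committedOffset: redundant ack; skip it and keep scanning
        }
        return nextCommitOffset;
    }

    public static void main(String[] args) {
        TreeSet<Long> acked = new TreeSet<>();
        acked.add(1L);
        acked.add(2L);
        acked.add(4L);                                       // offset 3 failed and has not been re-acked yet
        System.out.println(findNextCommitOffset(acked, 0L)); // prints 2: the scan stops at the gap
    }
}

In this example the spout would commit through offset 2 and, when retrying the failed partition, seek back to committedOffset + 1 (offset 3). That is the behavior the seekToEnd -> seek(rtp, acked.get(rtp).committedOffset + 1) change restores: seeking to the end of the log would have silently skipped offsets 3 and 4 instead of replaying them.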