-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-27042][SS] Close cached Kafka producer in case of task retry #23956
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,10 +20,13 @@ package org.apache.spark.sql.kafka010 | |
| import java.{util => ju} | ||
| import java.util.concurrent.ConcurrentMap | ||
|
|
||
| import scala.collection.JavaConverters._ | ||
|
|
||
| import org.apache.kafka.clients.producer.KafkaProducer | ||
| import org.apache.kafka.common.serialization.ByteArraySerializer | ||
| import org.scalatest.PrivateMethodTester | ||
|
|
||
| import org.apache.spark.{TaskContext, TaskContextImpl} | ||
| import org.apache.spark.sql.test.SharedSQLContext | ||
|
|
||
| class CachedKafkaProducerSuite extends SharedSQLContext with PrivateMethodTester with KafkaTest { | ||
|
|
@@ -35,43 +38,68 @@ class CachedKafkaProducerSuite extends SharedSQLContext with PrivateMethodTester | |
| CachedKafkaProducer.clear() | ||
| } | ||
|
|
||
| test("Should not throw exception on calling close with non-existing key.") { | ||
| val kafkaParams = getKafkaParams() | ||
| CachedKafkaProducer.close(kafkaParams) | ||
| assert(getCacheMap().size === 0) | ||
| } | ||
|
|
||
| test("Should return the cached instance on calling getOrCreate with same params.") { | ||
| val kafkaParams = new ju.HashMap[String, Object]() | ||
| kafkaParams.put("acks", "0") | ||
| // Here only host should be resolvable, it does not need a running instance of kafka server. | ||
| kafkaParams.put("bootstrap.servers", "127.0.0.1:9022") | ||
| kafkaParams.put("key.serializer", classOf[ByteArraySerializer].getName) | ||
| kafkaParams.put("value.serializer", classOf[ByteArraySerializer].getName) | ||
| val kafkaParams = getKafkaParams() | ||
| val producer = CachedKafkaProducer.getOrCreate(kafkaParams) | ||
| val producer2 = CachedKafkaProducer.getOrCreate(kafkaParams) | ||
| assert(producer == producer2) | ||
|
|
||
| val cacheMap = PrivateMethod[ConcurrentMap[Seq[(String, Object)], KP]]('getAsMap) | ||
| val map = CachedKafkaProducer.invokePrivate(cacheMap()) | ||
| assert(map.size == 1) | ||
| assert(producer === producer2) | ||
| val cacheMap = getCacheMap() | ||
| assert(cacheMap.size === 1) | ||
| } | ||
|
|
||
| test("Should close the correct kafka producer for the given kafkaPrams.") { | ||
| val kafkaParams = new ju.HashMap[String, Object]() | ||
| kafkaParams.put("acks", "0") | ||
| kafkaParams.put("bootstrap.servers", "127.0.0.1:9022") | ||
| kafkaParams.put("key.serializer", classOf[ByteArraySerializer].getName) | ||
| kafkaParams.put("value.serializer", classOf[ByteArraySerializer].getName) | ||
| val kafkaParams = getKafkaParams() | ||
| val producer: KP = CachedKafkaProducer.getOrCreate(kafkaParams) | ||
| kafkaParams.put("acks", "1") | ||
| val producer2: KP = CachedKafkaProducer.getOrCreate(kafkaParams) | ||
| // With updated conf, a new producer instance should be created. | ||
| assert(producer != producer2) | ||
|
|
||
| val cacheMap = PrivateMethod[ConcurrentMap[Seq[(String, Object)], KP]]('getAsMap) | ||
| val map = CachedKafkaProducer.invokePrivate(cacheMap()) | ||
| assert(map.size == 2) | ||
| val cacheMap = getCacheMap() | ||
| assert(cacheMap.size === 2) | ||
|
|
||
| CachedKafkaProducer.close(kafkaParams) | ||
| val map2 = CachedKafkaProducer.invokePrivate(cacheMap()) | ||
| assert(map2.size == 1) | ||
| import scala.collection.JavaConverters._ | ||
| val (seq: Seq[(String, Object)], _producer: KP) = map2.asScala.toArray.apply(0) | ||
| assert(_producer == producer) | ||
| val cacheMap2 = getCacheMap() | ||
| assert(cacheMap2.size === 1) | ||
| assert(getCacheMapItem(cacheMap2, 0) === producer) | ||
| } | ||
|
|
||
| test("Should return new instance on calling getOrCreate with same params but task retry.") { | ||
| val kafkaParams = getKafkaParams() | ||
| val taskContext = new TaskContextImpl(0, 0, 0, 0, attemptNumber = 0, null, null, null) | ||
| TaskContext.setTaskContext(taskContext) | ||
| val producer = CachedKafkaProducer.getOrCreate(kafkaParams) | ||
| val retryTaskContext = new TaskContextImpl(0, 0, 0, 0, attemptNumber = 1, null, null, null) | ||
| TaskContext.setTaskContext(retryTaskContext) | ||
| val producer2 = CachedKafkaProducer.getOrCreate(kafkaParams) | ||
| assert(producer != producer2) | ||
| val cacheMap = getCacheMap() | ||
| assert(cacheMap.size === 1) | ||
| assert(getCacheMapItem(cacheMap, 0) === producer2) | ||
| } | ||
|
|
||
| private def getKafkaParams(): ju.HashMap[String, Object] = { | ||
| val kafkaParams = new ju.HashMap[String, Object]() | ||
| kafkaParams.put("acks", "0") | ||
| // Here only host should be resolvable, it does not need a running instance of kafka server. | ||
| kafkaParams.put("bootstrap.servers", "127.0.0.1:9022") | ||
| kafkaParams.put("key.serializer", classOf[ByteArraySerializer].getName) | ||
| kafkaParams.put("value.serializer", classOf[ByteArraySerializer].getName) | ||
| kafkaParams | ||
| } | ||
|
|
||
| private def getCacheMap(): ConcurrentMap[Seq[(String, Object)], KP] = { | ||
| val getAsMap = PrivateMethod[ConcurrentMap[Seq[(String, Object)], KP]]('getAsMap) | ||
| CachedKafkaProducer.invokePrivate(getAsMap()) | ||
| } | ||
|
|
||
| private def getCacheMapItem(map: ConcurrentMap[Seq[(String, Object)], KP], offset: Int): KP = { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm... maps don't necessarily have deterministic iteration order, so this method only really makes sense if the map has a single item. Since you always call it with offset 0, the offset parameter seems unnecessary — or maybe explicitly reading the map's single entry would make the intent clearer. |
||
| val (_: Seq[(String, Object)], _producer: KP) = map.asScala.toArray.apply(0) | ||
| _producer | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is probably fine; is there a way to close this earlier, when the task fails?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since any part of the code can throw an exception, which may or may not be caught, I thought this was the safest solution. The other consideration was that the consumer part works in a similar way without problems.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gaborgsomogyi We cannot close a cached producer that can still be used by other tasks. A Kafka producer can be shared by all tasks that are using the same Kafka parameters. It is different than the consumer cache.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
But isn't the assumption that a bad producer will cause all those tasks to fail anyway? This would recover from that situation (and prevent the task retries from failing).
It may be that the task failed for other reasons and other tasks using the same producer would make progress, but that sounds both less likely and more complicated to handle.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@vanzin even if a bad producer could occur, this approach is still not correct. The newly created producer can immediately be closed by an attempt of a different task.
AFAIK, the current issue about the cached Kafka producer is https://issues.apache.org/jira/browse/SPARK-21869, which definitely can be solved in a smarter way.
By the way, I have never seen that anyone reported an issue about corrupt Kafka producers in Spark or Kafka community. @gaborgsomogyi do you have any ticket related to this one?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. Seems hard to solve without keeping more state about the producer... :-/
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agree with @zsxwing; once the Kafka producer is made thread-safe, it should have a self-healing mechanism built in to prevent one broken request-response from breaking others.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also think it's a good point. Some sort of
`if (!inUse) close()` mechanism would be correct. @zsxwing Just for the sake of my deeper understanding: in which scenario can it happen that 2 tasks in the same executor are writing to the same topic-partition?
@ScrapCodes, are you proceeding with SPARK-21869? This PR needs the
`inUse` flag that you've shown in #19096. Happy to help in any way. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@gaborgsomogyi, I wanted to revive it soon; sorry for the delay. Now I am on it. I will surely need your help to discuss possible approaches.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Cool — ping me when you're ready and I'll join the discussion.