Skip to content

Commit efd93f2

Browse files
committed
In long-running contexts, we encountered a double register without a remove in between. The cause is unknown; we assume it was a temporary network issue.
However, since the second register uses a BlockManagerId on a different port, blockManagerInfo.contains() returns false, while blockManagerIdByExecutor.get() returns Some. This inconsistency is caught by a conditional statement that calls System.exit(1), which is a huge robustness issue for us. The fix: simply remove the old id from both maps during register when this happens. This mimics the behavior of expireDeadHosts() by doing local cleanup of the maps before trying to add the new entries. Also added some logging for register and unregister.
1 parent 4322c0b commit efd93f2

File tree

1 file changed

+21
-6
lines changed

1 file changed

+21
-6
lines changed

core/src/main/scala/org/apache/spark/storage/BlockManagerMasterActor.scala

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -145,12 +145,17 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf) extends Act
145145

146146
private def removeBlockManager(blockManagerId: BlockManagerId) {
147147
val info = blockManagerInfo(blockManagerId)
148-
148+
149149
// Remove the block manager from blockManagerIdByExecutor.
150150
blockManagerIdByExecutor -= blockManagerId.executorId
151+
152+
logInfo("removed executorId %s from blockManagerIdByExecutor".format(blockManagerId.executorId))
151153

152154
// Remove it from blockManagerInfo and remove all the blocks.
153155
blockManagerInfo.remove(blockManagerId)
156+
157+
logInfo("removed blockManagerId %s from blockManagerInfo".format(blockManagerId))
158+
154159
val iterator = info.blocks.keySet.iterator
155160
while (iterator.hasNext) {
156161
val blockId = iterator.next
@@ -160,6 +165,8 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf) extends Act
160165
blockLocations.remove(locations)
161166
}
162167
}
168+
169+
logInfo("done with remove "+blockManagerId)
163170
}
164171

165172
private def expireDeadHosts() {
@@ -180,6 +187,7 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf) extends Act
180187
private def removeExecutor(execId: String) {
181188
logInfo("Trying to remove executor " + execId + " from BlockManagerMaster.")
182189
blockManagerIdByExecutor.get(execId).foreach(removeBlockManager)
190+
logInfo("removed executor " + execId + " from BlockManagerMaster.")
183191
}
184192

185193
private def heartBeat(blockManagerId: BlockManagerId): Boolean = {
@@ -223,18 +231,25 @@ class BlockManagerMasterActor(val isLocal: Boolean, conf: SparkConf) extends Act
223231
}
224232

225233
private def register(id: BlockManagerId, maxMemSize: Long, slaveActor: ActorRef) {
234+
logInfo("Registering block manager %s with %s RAM, %s".format(id.hostPort, Utils.bytesToString(maxMemSize), id))
235+
226236
if (!blockManagerInfo.contains(id)) {
227237
blockManagerIdByExecutor.get(id.executorId) match {
228238
case Some(manager) =>
229239
// A block manager of the same executor already exists.
230240
// This should never happen. Let's just quit.
231-
logError("Got two different block manager registrations on " + id.executorId)
232-
System.exit(1)
241+
logError("Got two different block manager registrations on same executor - will remove, new Id " + id+", orig id - "+manager)
242+
removeExecutor(id.executorId)
233243
case None =>
234-
blockManagerIdByExecutor(id.executorId) = id
244+
logInfo("about to register new id "+id)
235245
}
236-
blockManagerInfo(id) = new BlockManagerMasterActor.BlockManagerInfo(
237-
id, System.currentTimeMillis(), maxMemSize, slaveActor)
246+
247+
blockManagerIdByExecutor(id.executorId) = id
248+
logInfo("Added %s to blockManagerIdByExecutor".format(id.executorId))
249+
250+
val info = new BlockManagerMasterActor.BlockManagerInfo(id, System.currentTimeMillis(), maxMemSize, slaveActor)
251+
blockManagerInfo(id) = info
252+
logInfo("Added %s, %s to blockManagerInfo".format(id, info))
238253
}
239254
}
240255

0 commit comments

Comments
 (0)