
Commit 1acb0d9

[SPARK-29257][Core][Shuffle] Use task attempt number as noop reduce id to handle disk failures during shuffle
1 parent 4a3a6b6 commit 1acb0d9

File tree: 2 files changed (+83, -15 lines)

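Why a fixed reduce id pins every retry of a map task to one disk: DiskBlockManager places a block file by hashing the file name and taking the hash modulo the number of configured local directories, so a name that never changes always resolves to the same directory. The snippet below is a minimal, illustrative sketch of that idea, not the actual Spark code; pickLocalDir, the directory values, and the hashCode-based stand-in for Utils.nonNegativeHash are all hypothetical.

// Illustrative sketch only, not the actual DiskBlockManager implementation.
def pickLocalDir(localDirs: IndexedSeq[String], shuffleId: Int, mapId: Int, reduceId: Int): String = {
  val fileName = s"shuffle_${shuffleId}_${mapId}_${reduceId}.data"
  val hash = fileName.hashCode & Integer.MAX_VALUE  // stand-in for Utils.nonNegativeHash
  localDirs(hash % localDirs.length)                // same file name => same directory, every time
}

val dirs = IndexedSeq("/data1", "/data2", "/data3")
// Old behaviour: reduce id fixed at 0, so every attempt of map task 2 in shuffle 1 hits one directory.
val oldChoices = (0 to 3).map(_ => pickLocalDir(dirs, shuffleId = 1, mapId = 2, reduceId = 0)).distinct
// New behaviour: reduce id = attempt number, so a retry may hash to a different, healthy directory.
val newChoices = (0 to 3).map(attempt => pickLocalDir(dirs, shuffleId = 1, mapId = 2, reduceId = attempt)).distinct

If the directory chosen by the fixed name sits on a failed disk, every retry fails the same way; with the attempt number in the name, a later attempt can map elsewhere and succeed.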

core/src/main/scala/org/apache/spark/shuffle/IndexShuffleBlockResolver.scala

Lines changed: 17 additions & 13 deletions
@@ -21,12 +21,11 @@ import java.io._
 import java.nio.channels.Channels
 import java.nio.file.Files
 
-import org.apache.spark.{SparkConf, SparkEnv}
+import org.apache.spark.{SparkConf, SparkEnv, TaskContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.io.NioBufferedFileInputStream
 import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
 import org.apache.spark.network.netty.SparkTransportConf
-import org.apache.spark.shuffle.IndexShuffleBlockResolver.NOOP_REDUCE_ID
 import org.apache.spark.storage._
 import org.apache.spark.util.Utils
 
@@ -35,8 +34,9 @@ import org.apache.spark.util.Utils
  * Data of shuffle blocks from the same map task are stored in a single consolidated data file.
  * The offsets of the data blocks in the data file are stored in a separate index file.
  *
- * We use the name of the shuffle data's shuffleBlockId with reduce ID set to 0 and add ".data"
- * as the filename postfix for data file, and ".index" as the filename postfix for index file.
+ * We use the name of the shuffle data's shuffleBlockId with reduce ID set to task attempt number
+ * and add ".data" as the filename postfix for data file, and ".index" as the filename postfix for
+ * index file.
  *
  */
 // Note: Changes to the format in this file should be kept in sync with
@@ -51,12 +51,23 @@ private[spark] class IndexShuffleBlockResolver(
 
   private val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle")
 
+  // No-op reduce ID used in interactions with disk store.
+  // The disk store currently expects puts to relate to a (map, reduce) pair, but in the sort
+  // shuffle outputs for several reduces are glommed into a single file.
+  // SPARK-29257: the noop reduce id used to be 0, which produced a fixed index or data file
+  // name. When the nodes in a cluster have more than one disk, if one of them is broken and the
+  // fixed file name's hash code % disk number happens to point to it, all attempts of a task
+  // will inevitably access the broken disk, leading to meaningless task failures or even job
+  // failure. Here we change the noop reduce id to the task attempt number so each attempt gets a
+  // different file name, which may land on another healthy disk.
+  private def noopReduceId: Int = Option(TaskContext.get()).map(_.attemptNumber()).getOrElse(0)
+
   def getDataFile(shuffleId: Int, mapId: Int): File = {
-    blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
+    blockManager.diskBlockManager.getFile(ShuffleDataBlockId(shuffleId, mapId, noopReduceId))
   }
 
   private def getIndexFile(shuffleId: Int, mapId: Int): File = {
-    blockManager.diskBlockManager.getFile(ShuffleIndexBlockId(shuffleId, mapId, NOOP_REDUCE_ID))
+    blockManager.diskBlockManager.getFile(ShuffleIndexBlockId(shuffleId, mapId, noopReduceId))
   }
 
   /**
@@ -225,10 +236,3 @@ private[spark] class IndexShuffleBlockResolver(
 
   override def stop(): Unit = {}
 }
-
-private[spark] object IndexShuffleBlockResolver {
-  // No-op reduce ID used in interactions with disk store.
-  // The disk store currently expects puts to relate to a (map, reduce) pair, but in the sort
-  // shuffle outputs for several reduces are glommed into a single file.
-  val NOOP_REDUCE_ID = 0
-}
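Two details of the new noopReduceId are worth noting. TaskContext.get() is a thread-local set only while a task is running, so outside a task (for example on the driver, or before TaskContext.setTaskContext is called in a test) the helper falls back to 0, the old behaviour. And because the attempt number flows into the block id, each retry yields a distinct file name. A short hedged illustration using the TaskContext and ShuffleDataBlockId APIs already referenced in the diff; the concrete ids 1 and 2 are made up:

import org.apache.spark.TaskContext
import org.apache.spark.storage.ShuffleDataBlockId

// Same lookup pattern as noopReduceId: the current attempt number, or 0 when no task is running.
val reduceId = Option(TaskContext.get()).map(_.attemptNumber()).getOrElse(0)

// The reduce id ends up in the block name, so each attempt maps to a different file,
// and therefore possibly to a different local directory:
ShuffleDataBlockId(1, 2, 0).name  // "shuffle_1_2_0.data", first attempt
ShuffleDataBlockId(1, 2, 1).name  // "shuffle_1_2_1.data", first retry
ShuffleDataBlockId(1, 2, 2).name  // "shuffle_1_2_2.data", second retry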

core/src/test/scala/org/apache/spark/shuffle/sort/IndexShuffleBlockResolverSuite.scala

Lines changed: 66 additions & 2 deletions
@@ -17,7 +17,7 @@
 
 package org.apache.spark.shuffle.sort
 
-import java.io.{DataInputStream, File, FileInputStream, FileOutputStream}
+import java.io._
 
 import org.mockito.{Mock, MockitoAnnotations}
 import org.mockito.Answers.RETURNS_SMART_NULLS
@@ -26,7 +26,7 @@ import org.mockito.Mockito._
 import org.mockito.invocation.InvocationOnMock
 import org.scalatest.BeforeAndAfterEach
 
-import org.apache.spark.{SparkConf, SparkFunSuite}
+import org.apache.spark.{SparkConf, SparkFunSuite, TaskContext}
 import org.apache.spark.shuffle.IndexShuffleBlockResolver
 import org.apache.spark.storage._
 import org.apache.spark.util.Utils
@@ -36,6 +36,7 @@ class IndexShuffleBlockResolverSuite extends SparkFunSuite with BeforeAndAfterEa
 
   @Mock(answer = RETURNS_SMART_NULLS) private var blockManager: BlockManager = _
   @Mock(answer = RETURNS_SMART_NULLS) private var diskBlockManager: DiskBlockManager = _
+  @Mock(answer = RETURNS_SMART_NULLS) private var taskContext: TaskContext = _
 
   private var tempDir: File = _
   private val conf: SparkConf = new SparkConf(loadDefaults = false)
@@ -48,6 +49,8 @@ class IndexShuffleBlockResolverSuite extends SparkFunSuite with BeforeAndAfterEa
     when(blockManager.diskBlockManager).thenReturn(diskBlockManager)
     when(diskBlockManager.getFile(any[BlockId])).thenAnswer(
      (invocation: InvocationOnMock) => new File(tempDir, invocation.getArguments.head.toString))
+
+    TaskContext.setTaskContext(taskContext)
   }
 
   override def afterEach(): Unit = {
@@ -155,4 +158,65 @@ class IndexShuffleBlockResolverSuite extends SparkFunSuite with BeforeAndAfterEa
       indexIn2.close()
     }
   }
+
+  test("get data file should differ across task attempts") {
+    val resolver = new IndexShuffleBlockResolver(conf, blockManager)
+    val shuffleId = 1
+    val mapId = 2
+    when(taskContext.attemptNumber()).thenReturn(0, Seq(1, 2, 3): _*)
+    assert(resolver.getDataFile(shuffleId, mapId).getName.endsWith("0.data"))
+    assert(resolver.getDataFile(shuffleId, mapId).getName.endsWith("1.data"))
+    assert(resolver.getDataFile(shuffleId, mapId).getName.endsWith("2.data"))
+    assert(resolver.getDataFile(shuffleId, mapId).getName.endsWith("3.data"))
+  }
+
+  test("different task attempts should be able to choose different local dirs") {
+    val localDirSuffixes = 1 to 4
+    val dirs = localDirSuffixes.map(x => tempDir + "/test_local" + x).mkString(",")
+    val confClone = conf.clone.set("spark.local.dir", dirs)
+    val resolver = new IndexShuffleBlockResolver(confClone, blockManager)
+    val dbm = new DiskBlockManager(confClone, true)
+    when(blockManager.diskBlockManager).thenReturn(dbm)
+    when(taskContext.attemptNumber()).thenReturn(0, Seq(1, 2, 3): _*)
+    val dataFiles = localDirSuffixes.map(_ => resolver.getDataFile(1, 2))
+    val usedLocalDirSuffixes =
+      dataFiles.map(_.getAbsolutePath.split("test_local")(1).substring(0, 1).toInt)
+    assert(usedLocalDirSuffixes.diff(localDirSuffixes).isEmpty)
+  }
+
+  test("new task attempt should be able to succeed in another available local dir") {
+    val localDirSuffixes = 1 to 2
+    val dirs = localDirSuffixes.map { x => tempDir + "/test_local" + x }.mkString(",")
+    val confClone = conf.clone.set("spark.local.dir", dirs)
+    val resolver = new IndexShuffleBlockResolver(confClone, blockManager)
+    val dbm = new DiskBlockManager(confClone, true)
+    when(blockManager.diskBlockManager).thenReturn(dbm, dbm)
+    val shuffleId = 1
+    val mapId = 2
+    val lengths = Array[Long](10, 0, 20)
+    val dataTmp = File.createTempFile("shuffle", null, tempDir)
+    val out = new FileOutputStream(dataTmp)
+    Utils.tryWithSafeFinally {
+      out.write(new Array[Byte](30))
+    } {
+      out.close()
+    }
+    val idxName = s"shuffle_${shuffleId}_${mapId}_0.index"
+    val localDirIdx = Utils.nonNegativeHash(idxName) % localDirSuffixes.length
+
+    val badDisk = dbm.localDirs(localDirIdx)
+    badDisk.setWritable(false)  // simulate a broken disk for this local dir
+
+    // Attempt 0: writing the index file fails on the broken disk.
+    // Attempt 1: the index and data files go to the other, healthy local dir and are verified.
+    when(taskContext.attemptNumber()).thenReturn(0, Seq(1, 1, 1): _*)
+    val e =
+      intercept[IOException](resolver.writeIndexFileAndCommit(shuffleId, mapId, lengths, dataTmp))
+    assert(e.getMessage.contains(badDisk.getAbsolutePath))
+
+    resolver.writeIndexFileAndCommit(shuffleId, mapId, lengths, dataTmp)
+
+    val dataFile = resolver.getDataFile(shuffleId, mapId)
+    assert(dataFile.exists())
+  }
 }
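A note on the mocking pattern the new tests rely on: Mockito's thenReturn(first, rest: _*) is consecutive stubbing, where successive calls to the stubbed method return successive values and the last value repeats afterwards. That is how one mocked TaskContext stands in for a sequence of task attempts. A minimal self-contained sketch of the pattern, assuming a plain Mockito mock of TaskContext; the asserted values are simply the stubbed sequence:

import org.mockito.Mockito.{mock, when}
import org.apache.spark.TaskContext

// Consecutive stubbing: the first call returns 0, then 1, 2, 3, and 3 for every call after that.
val taskContext = mock(classOf[TaskContext])
when(taskContext.attemptNumber()).thenReturn(0, Seq(1, 2, 3): _*)

assert(taskContext.attemptNumber() == 0)  // simulated first attempt
assert(taskContext.attemptNumber() == 1)  // simulated first retry
assert(taskContext.attemptNumber() == 2)
assert(taskContext.attemptNumber() == 3)
assert(taskContext.attemptNumber() == 3)  // the last stubbed value repeats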
