Skip to content

Commit 545a0a1

Browse files
authored
HDFS-15386 ReplicaNotFoundException keeps happening in DN after removing multiple DN's data directories (#2052)
Contributed by Toshihiro Suzuki.
1 parent 8b146c1 commit 545a0a1

File tree

2 files changed: 95 additions and 11 deletions.
  • hadoop-hdfs-project/hadoop-hdfs/src

2 files changed: 95 additions and 11 deletions.

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -578,7 +578,8 @@ public void removeVolumes(
578578
// Unlike updating the volumeMap in addVolume(), this operation does
579579
// not scan disks.
580580
for (String bpid : volumeMap.getBlockPoolList()) {
581-
List<ReplicaInfo> blocks = new ArrayList<>();
581+
List<ReplicaInfo> blocks = blkToInvalidate
582+
.computeIfAbsent(bpid, (k) -> new ArrayList<>());
582583
for (Iterator<ReplicaInfo> it =
583584
volumeMap.replicas(bpid).iterator(); it.hasNext();) {
584585
ReplicaInfo block = it.next();
@@ -591,9 +592,7 @@ public void removeVolumes(
591592
it.remove();
592593
}
593594
}
594-
blkToInvalidate.put(bpid, blocks);
595595
}
596-
597596
storageToRemove.add(sd.getStorageUuid());
598597
storageLocationsToRemove.remove(sdLocation);
599598
}

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java

Lines changed: 93 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@
2020
import com.google.common.base.Supplier;
2121
import com.google.common.collect.Lists;
2222

23-
import java.io.FileInputStream;
2423
import java.io.OutputStream;
2524
import java.nio.file.Files;
2625
import java.nio.file.Paths;
@@ -106,6 +105,8 @@
106105
import static org.mockito.Mockito.doThrow;
107106
import static org.mockito.Mockito.mock;
108107
import static org.mockito.Mockito.spy;
108+
import static org.mockito.Mockito.times;
109+
import static org.mockito.Mockito.verify;
109110
import static org.mockito.Mockito.when;
110111

111112
import org.slf4j.Logger;
@@ -268,16 +269,24 @@ public void testAddVolumeWithSameStorageUuid() throws IOException {
268269
}
269270

270271
@Test(timeout = 30000)
271-
public void testRemoveVolumes() throws IOException {
272+
public void testRemoveOneVolume() throws IOException {
272273
// Feed FsDataset with block metadata.
273-
final int NUM_BLOCKS = 100;
274-
for (int i = 0; i < NUM_BLOCKS; i++) {
275-
String bpid = BLOCK_POOL_IDS[NUM_BLOCKS % BLOCK_POOL_IDS.length];
274+
final int numBlocks = 100;
275+
for (int i = 0; i < numBlocks; i++) {
276+
String bpid = BLOCK_POOL_IDS[numBlocks % BLOCK_POOL_IDS.length];
276277
ExtendedBlock eb = new ExtendedBlock(bpid, i);
277-
try (ReplicaHandler replica =
278-
dataset.createRbw(StorageType.DEFAULT, null, eb, false)) {
278+
ReplicaHandler replica = null;
279+
try {
280+
replica = dataset.createRbw(StorageType.DEFAULT, null, eb,
281+
false);
282+
} finally {
283+
if (replica != null) {
284+
replica.close();
285+
}
279286
}
280287
}
288+
289+
// Remove one volume
281290
final String[] dataDirs =
282291
conf.get(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY).split(",");
283292
final String volumePathToRemove = dataDirs[0];
@@ -300,6 +309,11 @@ public void testRemoveVolumes() throws IOException {
300309
assertEquals("The volume has been removed from the storageMap.",
301310
expectedNumVolumes, dataset.storageMap.size());
302311

312+
// DataNode.notifyNamenodeDeletedBlock() should be called 50 times
313+
// as we deleted one volume that has 50 blocks
314+
verify(datanode, times(50))
315+
.notifyNamenodeDeletedBlock(any(), any());
316+
303317
try {
304318
dataset.asyncDiskService.execute(volumeToRemove,
305319
new Runnable() {
@@ -317,10 +331,81 @@ public void run() {}
317331
totalNumReplicas += dataset.volumeMap.size(bpid);
318332
}
319333
assertEquals("The replica infos on this volume has been removed from the "
320-
+ "volumeMap.", NUM_BLOCKS / NUM_INIT_VOLUMES,
334+
+ "volumeMap.", numBlocks / NUM_INIT_VOLUMES,
321335
totalNumReplicas);
322336
}
323337

338+
@Test(timeout = 30000)
339+
public void testRemoveTwoVolumes() throws IOException {
340+
// Feed FsDataset with block metadata.
341+
final int numBlocks = 100;
342+
for (int i = 0; i < numBlocks; i++) {
343+
String bpid = BLOCK_POOL_IDS[numBlocks % BLOCK_POOL_IDS.length];
344+
ExtendedBlock eb = new ExtendedBlock(bpid, i);
345+
ReplicaHandler replica = null;
346+
try {
347+
replica = dataset.createRbw(StorageType.DEFAULT, null, eb,
348+
false);
349+
} finally {
350+
if (replica != null) {
351+
replica.close();
352+
}
353+
}
354+
}
355+
356+
// Remove two volumes
357+
final String[] dataDirs =
358+
conf.get(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY).split(",");
359+
Set<StorageLocation> volumesToRemove = new HashSet<>();
360+
volumesToRemove.add(StorageLocation.parse(dataDirs[0]));
361+
volumesToRemove.add(StorageLocation.parse(dataDirs[1]));
362+
363+
FsVolumeReferences volReferences = dataset.getFsVolumeReferences();
364+
Set<FsVolumeImpl> volumes = new HashSet<>();
365+
for (FsVolumeSpi vol: volReferences) {
366+
for (StorageLocation volume : volumesToRemove) {
367+
if (vol.getStorageLocation().equals(volume)) {
368+
volumes.add((FsVolumeImpl) vol);
369+
}
370+
}
371+
}
372+
assertEquals(2, volumes.size());
373+
volReferences.close();
374+
375+
dataset.removeVolumes(volumesToRemove, true);
376+
int expectedNumVolumes = dataDirs.length - 2;
377+
assertEquals("The volume has been removed from the volumeList.",
378+
expectedNumVolumes, getNumVolumes());
379+
assertEquals("The volume has been removed from the storageMap.",
380+
expectedNumVolumes, dataset.storageMap.size());
381+
382+
// DataNode.notifyNamenodeDeletedBlock() should be called 100 times
383+
// as we deleted 2 volumes that hold 100 blocks in total
384+
verify(datanode, times(100))
385+
.notifyNamenodeDeletedBlock(any(), any());
386+
387+
for (FsVolumeImpl volume : volumes) {
388+
try {
389+
dataset.asyncDiskService.execute(volume,
390+
new Runnable() {
391+
@Override
392+
public void run() {}
393+
});
394+
fail("Expect RuntimeException: the volume has been removed from the "
395+
+ "AsyncDiskService.");
396+
} catch (RuntimeException e) {
397+
GenericTestUtils.assertExceptionContains("Cannot find volume", e);
398+
}
399+
}
400+
401+
int totalNumReplicas = 0;
402+
for (String bpid : dataset.volumeMap.getBlockPoolList()) {
403+
totalNumReplicas += dataset.volumeMap.size(bpid);
404+
}
405+
assertEquals("The replica infos on this volume has been removed from the "
406+
+ "volumeMap.", 0, totalNumReplicas);
407+
}
408+
324409
@Test(timeout = 5000)
325410
public void testRemoveNewlyAddedVolume() throws IOException {
326411
final int numExistingVolumes = getNumVolumes();

0 commit comments

Comments
 (0)