Skip to content

Commit 5ad7737

Browse files
authored
HDFS-17342. Fix DataNode may invalidates normal block causing missing block (#6464). Contributed by Haiyang Hu.
Reviewed-by: ZanderXu <zanderxu@apache.org> Reviewed-by: Chengwei Wang <1139557635@qq.com> Signed-off-by: Shuyan Zhang <zhangshuyan@apache.org>
1 parent 9a7eead commit 5ad7737

File tree

3 files changed

+114
-5
lines changed

3 files changed

+114
-5
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/DataNodeFaultInjector.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,4 +167,9 @@ public void delayDeleteReplica() {}
167167
* Just delay running the diff record for a while.
168168
*/
169169
public void delayDiffRecord() {}
170+
171+
/**
172+
* Just delay getMetaDataInputStream for a while.
173+
*/
174+
public void delayGetMetaDataInputStream() {}
170175
}

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
import org.apache.hadoop.hdfs.server.common.AutoCloseDataSetLock;
6464
import org.apache.hadoop.hdfs.server.common.DataNodeLockManager;
6565
import org.apache.hadoop.hdfs.server.common.DataNodeLockManager.LockLevel;
66+
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;
6667
import org.apache.hadoop.hdfs.server.datanode.DataSetLockManager;
6768
import org.apache.hadoop.hdfs.server.datanode.FileIoProvider;
6869
import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
@@ -247,6 +248,7 @@ public LengthInputStream getMetaDataInputStream(ExtendedBlock b)
247248
if (info == null || !info.metadataExists()) {
248249
return null;
249250
}
251+
DataNodeFaultInjector.get().delayGetMetaDataInputStream();
250252
return info.getMetadataInputStream(0);
251253
}
252254

@@ -2403,8 +2405,9 @@ public void invalidate(String bpid, ReplicaInfo block) {
24032405
*
24042406
* @param bpid the block pool ID.
24052407
* @param block The block to be invalidated.
2408+
* @param checkFiles Whether to check data and meta files.
24062409
*/
2407-
public void invalidateMissingBlock(String bpid, Block block) {
2410+
public void invalidateMissingBlock(String bpid, Block block, boolean checkFiles) {
24082411

24092412
// The replica seems to be on its volume map but not on disk.
24102413
// We can't confirm here whether the block file is lost or the disk has failed.
@@ -2416,11 +2419,21 @@ public void invalidateMissingBlock(String bpid, Block block) {
24162419
// So removing it from the volume map and notifying the namenode is ok.
24172420
try (AutoCloseableLock lock = lockManager.writeLock(LockLevel.BLOCK_POOl,
24182421
bpid)) {
2419-
ReplicaInfo replica = volumeMap.remove(bpid, block);
2420-
invalidate(bpid, replica);
2422+
// Check if this block is on the volume map.
2423+
ReplicaInfo replica = volumeMap.get(bpid, block);
2424+
// Double-check block or meta file existence when checkFiles is true.
2425+
if (replica != null && (!checkFiles ||
2426+
(!replica.blockDataExists() || !replica.metadataExists()))) {
2427+
volumeMap.remove(bpid, block);
2428+
invalidate(bpid, replica);
2429+
}
24212430
}
24222431
}
24232432

2433+
public void invalidateMissingBlock(String bpid, Block block) {
2434+
invalidateMissingBlock(bpid, block, true);
2435+
}
2436+
24242437
/**
24252438
* Remove Replica from ReplicaMap.
24262439
*

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java

Lines changed: 93 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1962,7 +1962,7 @@ public void delayDeleteReplica() {
19621962
* 4. block would be recovered when disk back to normal.
19631963
*/
19641964
@Test
1965-
public void tesInvalidateMissingBlock() throws Exception {
1965+
public void testInvalidateMissingBlock() throws Exception {
19661966
long blockSize = 1024;
19671967
int heartbeatInterval = 1;
19681968
HdfsConfiguration c = new HdfsConfiguration();
@@ -1988,7 +1988,7 @@ public void tesInvalidateMissingBlock() throws Exception {
19881988
File metaFile = new File(metaPath);
19891989

19901990
// Mock local block file not found when disk with some exception.
1991-
fsdataset.invalidateMissingBlock(bpid, replicaInfo);
1991+
fsdataset.invalidateMissingBlock(bpid, replicaInfo, false);
19921992

19931993
// Assert local block file wouldn't be deleted from disk.
19941994
assertTrue(blockFile.exists());
@@ -2011,4 +2011,95 @@ public void tesInvalidateMissingBlock() throws Exception {
20112011
cluster.shutdown();
20122012
}
20132013
}
2014+
2015+
@Test
2016+
public void testCheckFilesWhenInvalidateMissingBlock() throws Exception {
2017+
long blockSize = 1024;
2018+
int heartbeatInterval = 1;
2019+
HdfsConfiguration c = new HdfsConfiguration();
2020+
c.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, heartbeatInterval);
2021+
c.setLong(DFS_BLOCK_SIZE_KEY, blockSize);
2022+
MiniDFSCluster cluster = new MiniDFSCluster.Builder(c).
2023+
numDataNodes(1).build();
2024+
DataNodeFaultInjector oldDnInjector = DataNodeFaultInjector.get();
2025+
try {
2026+
cluster.waitActive();
2027+
GenericTestUtils.LogCapturer logCapturer = GenericTestUtils.LogCapturer.
2028+
captureLogs(DataNode.LOG);
2029+
BlockReaderTestUtil util = new BlockReaderTestUtil(cluster, new
2030+
HdfsConfiguration(conf));
2031+
Path path = new Path("/testFile");
2032+
util.writeFile(path, 1);
2033+
String bpid = cluster.getNameNode().getNamesystem().getBlockPoolId();
2034+
DataNode dn = cluster.getDataNodes().get(0);
2035+
FsDatasetImpl dnFSDataset = (FsDatasetImpl) dn.getFSDataset();
2036+
List<ReplicaInfo> replicaInfos = dnFSDataset.getFinalizedBlocks(bpid);
2037+
assertEquals(1, replicaInfos.size());
2038+
DFSTestUtil.readFile(cluster.getFileSystem(), path);
2039+
LocatedBlock blk = util.getFileBlocks(path, 512).get(0);
2040+
ExtendedBlock block = blk.getBlock();
2041+
2042+
// Append a new block with an incremented generation stamp.
2043+
long newGS = block.getGenerationStamp() + 1;
2044+
dnFSDataset.append(block, newGS, 1024);
2045+
block.setGenerationStamp(newGS);
2046+
ReplicaInfo tmpReplicaInfo = dnFSDataset.getReplicaInfo(blk.getBlock());
2047+
2048+
DataNodeFaultInjector injector = new DataNodeFaultInjector() {
2049+
@Override
2050+
public void delayGetMetaDataInputStream() {
2051+
try {
2052+
Thread.sleep(8000);
2053+
} catch (InterruptedException e) {
2054+
// Ignore exception.
2055+
}
2056+
}
2057+
};
2058+
// Delay to getMetaDataInputStream.
2059+
DataNodeFaultInjector.set(injector);
2060+
2061+
ExecutorService executorService = Executors.newFixedThreadPool(2);
2062+
try {
2063+
Future<?> blockReaderFuture = executorService.submit(() -> {
2064+
try {
2065+
// Submit tasks for reading block.
2066+
BlockReader blockReader = BlockReaderTestUtil.getBlockReader(
2067+
cluster.getFileSystem(), blk, 0, 512);
2068+
blockReader.close();
2069+
} catch (IOException e) {
2070+
// Ignore exception.
2071+
}
2072+
});
2073+
2074+
Future<?> finalizeBlockFuture = executorService.submit(() -> {
2075+
try {
2076+
// Submit tasks for finalizing block.
2077+
Thread.sleep(1000);
2078+
dnFSDataset.finalizeBlock(block, false);
2079+
} catch (Exception e) {
2080+
// Ignore exception
2081+
}
2082+
});
2083+
2084+
// Wait for both tasks to complete.
2085+
blockReaderFuture.get();
2086+
finalizeBlockFuture.get();
2087+
} finally {
2088+
executorService.shutdown();
2089+
}
2090+
2091+
// Validate the replica is exits.
2092+
assertNotNull(dnFSDataset.getReplicaInfo(blk.getBlock()));
2093+
2094+
// Check DN log for FileNotFoundException.
2095+
String expectedMsg = String.format("opReadBlock %s received exception " +
2096+
"java.io.FileNotFoundException: %s (No such file or directory)",
2097+
blk.getBlock(), tmpReplicaInfo.getMetadataURI().getPath());
2098+
assertTrue("Expected log message not found in DN log.",
2099+
logCapturer.getOutput().contains(expectedMsg));
2100+
} finally {
2101+
cluster.shutdown();
2102+
DataNodeFaultInjector.set(oldDnInjector);
2103+
}
2104+
}
20142105
}

0 commit comments

Comments
 (0)