
Commit b588856

HDFS-17093. Fix block report lease issue to avoid missing some storages report. (apache#5855). Contributed by Yanlei Yu.
Reviewed-by: Shuyan Zhang <zqingchai@gmail.com>
Reviewed-by: Xing Lin <linxingnku@gmail.com>
Signed-off-by: He Xiaoqiao <hexiaoqiao@apache.org>
1 parent 28d190b commit b588856

2 files changed: +99 -1 lines changed

2 files changed

+99
-1
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

Lines changed: 18 additions & 1 deletion
@@ -2909,7 +2909,7 @@ public boolean processReport(final DatanodeID nodeID,
             + "discarded non-initial block report from {}"
             + " because namenode still in startup phase",
             strBlockReportId, fullBrLeaseId, nodeID);
-        blockReportLeaseManager.removeLease(node);
+        removeDNLeaseIfNeeded(node);
         return !node.hasStaleStorages();
       }
 
@@ -2957,6 +2957,23 @@ public boolean processReport(final DatanodeID nodeID,
     return !node.hasStaleStorages();
   }
 
+  /**
+   * Remove the DN lease only when we have received block reports
+   * for all storages of a particular DN.
+   */
+  void removeDNLeaseIfNeeded(DatanodeDescriptor node) {
+    boolean needRemoveLease = true;
+    for (DatanodeStorageInfo sInfo : node.getStorageInfos()) {
+      if (sInfo.getBlockReportCount() == 0) {
+        needRemoveLease = false;
+        break;
+      }
+    }
+    if (needRemoveLease) {
+      blockReportLeaseManager.removeLease(node);
+    }
+  }
+
   public void removeBRLeaseIfNeeded(final DatanodeID nodeID,
       final BlockReportContext context) throws IOException {
     namesystem.writeLock();

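In short, the patch replaces the unconditional blockReportLeaseManager.removeLease(node) call in the startup-phase branch with removeDNLeaseIfNeeded, which keeps the lease until every storage on the DataNode has a block report count greater than zero, so per-storage reports sent in separate RPCs are no longer rejected partway through. The following is a minimal, self-contained sketch of that check; the Storage class and method names here are stand-ins for illustration, not the real DatanodeStorageInfo or BlockReportLeaseManager classes.

import java.util.List;

public class LeaseCheckSketch {

  /** Stand-in for DatanodeStorageInfo: tracks how many block reports were received. */
  static final class Storage {
    final int blockReportCount;

    Storage(int blockReportCount) {
      this.blockReportCount = blockReportCount;
    }
  }

  /**
   * Release the lease only when every storage has reported at least once;
   * otherwise keep it so the remaining storages can still send full reports.
   */
  static boolean shouldRemoveLease(List<Storage> storages) {
    for (Storage s : storages) {
      if (s.blockReportCount == 0) {
        return false; // at least one storage has not reported yet
      }
    }
    return true;
  }

  public static void main(String[] args) {
    // Two storages, only the first has reported: the lease must be kept.
    System.out.println(shouldRemoveLease(List.of(new Storage(1), new Storage(0)))); // false
    // Both storages have reported: now it is safe to remove the lease.
    System.out.println(shouldRemoveLease(List.of(new Storage(1), new Storage(1)))); // true
  }
}
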
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockReportLease.java

Lines changed: 81 additions & 0 deletions
@@ -23,6 +23,8 @@
 import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.protocol.BlockReportContext;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
 import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
@@ -269,4 +271,83 @@ private StorageBlockReport[] createReports(DatanodeStorage[] dnStorages,
     }
     return storageBlockReports;
   }
+
+  @Test(timeout = 360000)
+  public void testFirstIncompleteBlockReport() throws Exception {
+    HdfsConfiguration conf = new HdfsConfiguration();
+    Random rand = new Random();
+
+    try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(1).build()) {
+      cluster.waitActive();
+
+      FSNamesystem fsn = cluster.getNamesystem();
+
+      NameNode nameNode = cluster.getNameNode();
+      // Pretend to be in safemode.
+      NameNodeAdapter.enterSafeMode(nameNode, false);
+
+      BlockManager blockManager = fsn.getBlockManager();
+      BlockManager spyBlockManager = spy(blockManager);
+      fsn.setBlockManagerForTesting(spyBlockManager);
+      String poolId = cluster.getNamesystem().getBlockPoolId();
+
+      NamenodeProtocols rpcServer = cluster.getNameNodeRpc();
+
+      // Test based on one DataNode reporting to the NameNode.
+      DataNode dn = cluster.getDataNodes().get(0);
+      DatanodeDescriptor datanodeDescriptor = spyBlockManager
+          .getDatanodeManager().getDatanode(dn.getDatanodeId());
+
+      DatanodeRegistration dnRegistration = dn.getDNRegistrationForBP(poolId);
+      StorageReport[] storages = dn.getFSDataset().getStorageReports(poolId);
+
+      // Send heartbeat and request full block report lease.
+      HeartbeatResponse hbResponse = rpcServer.sendHeartbeat(
+          dnRegistration, storages, 0, 0, 0, 0, 0, null, true,
+          SlowPeerReports.EMPTY_REPORT, SlowDiskReports.EMPTY_REPORT);
+
+      DelayAnswer delayer = new DelayAnswer(BlockManager.LOG);
+      doAnswer(delayer).when(spyBlockManager).processReport(
+          any(DatanodeStorageInfo.class),
+          any(BlockListAsLongs.class));
+
+      // Trigger sendBlockReport.
+      BlockReportContext brContext = new BlockReportContext(1, 0,
+          rand.nextLong(), hbResponse.getFullBlockReportLeaseId());
+      // Build every storage with 100 blocks for sending the report.
+      DatanodeStorage[] datanodeStorages
+          = new DatanodeStorage[storages.length];
+      for (int i = 0; i < storages.length; i++) {
+        datanodeStorages[i] = storages[i].getStorage();
+        StorageBlockReport[] reports = createReports(datanodeStorages, 100);
+
+        // Send the first storage's report an extra time, simulating a first
+        // block report attempt that only succeeded for this storage.
+        if (i == 0) {
+          rpcServer.blockReport(dnRegistration, poolId, reports, brContext);
+        }
+
+        // Send blockReport.
+        DatanodeCommand datanodeCommand = rpcServer.blockReport(dnRegistration, poolId, reports,
+            brContext);
+
+        // Wait until BlockManager calls processReport.
+        delayer.waitForCall();
+
+        // Allow the block report to proceed.
+        delayer.proceed();
+
+        // The returned command is non-null if the report was processed successfully.
+        assertTrue(datanodeCommand instanceof FinalizeCommand);
+        assertEquals(poolId, ((FinalizeCommand) datanodeCommand)
+            .getBlockPoolId());
+        if (i == 0) {
+          assertEquals(2, datanodeDescriptor.getStorageInfos()[i].getBlockReportCount());
+        } else {
+          assertEquals(1, datanodeDescriptor.getStorageInfos()[i].getBlockReportCount());
+        }
+      }
+    }
+  }
 }
