Skip to content

Commit 279e2bd

Browse files
committed
centralized storage not chosen reason
1 parent f48300c commit 279e2bd

File tree

3 files changed

+152
-5
lines changed

3 files changed

+152
-5
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
2121
import static org.apache.hadoop.hdfs.protocol.BlockType.CONTIGUOUS;
2222
import static org.apache.hadoop.hdfs.protocol.BlockType.STRIPED;
23+
import static org.apache.hadoop.hdfs.server.blockmanagement.StorageNotChosenReason.*;
2324
import static org.apache.hadoop.util.ExitUtil.terminate;
2425
import static org.apache.hadoop.util.Time.now;
2526

@@ -2244,6 +2245,9 @@ BlockReconstructionWork scheduleReconstruction(BlockInfo block,
22442245
final DatanodeDescriptor[] srcNodes = chooseSourceDatanodes(block,
22452246
containingNodes, liveReplicaNodes, numReplicas,
22462247
liveBlockIndices, liveBusyBlockIndices, excludeReconstructed, priority);
2248+
if(LOG.isDebugEnabled()){
2249+
LOG.debug(getStorageNotChosenReason(block));
2250+
}
22472251
short requiredRedundancy = getExpectedLiveRedundancyNum(block,
22482252
numReplicas);
22492253
if (srcNodes == null || srcNodes.length == 0) {
@@ -2569,6 +2573,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
25692573
final boolean isStriped = block.isStriped();
25702574
DatanodeDescriptor decommissionedSrc = null;
25712575

2576+
StorageNotChosenReason.start();
25722577
BitSet liveBitSet = null;
25732578
BitSet decommissioningBitSet = null;
25742579
if (isStriped) {
@@ -2593,13 +2598,15 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
25932598
// do not select the replica if it is corrupt or excess
25942599
if (state == StoredReplicaState.CORRUPT ||
25952600
state == StoredReplicaState.EXCESS) {
2601+
logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_CORRUPT_OR_EXCESS);
25962602
continue;
25972603
}
25982604

25992605
// Never use maintenance node not suitable for read
26002606
// or unknown state replicas.
26012607
if (state == null
26022608
|| state == StoredReplicaState.MAINTENANCE_NOT_FOR_READ) {
2609+
logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_MAINTENANCE_NOT_FOR_READ);
26032610
continue;
26042611
}
26052612

@@ -2611,6 +2618,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
26112618
ThreadLocalRandom.current().nextBoolean()) {
26122619
decommissionedSrc = node;
26132620
}
2621+
logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_DECOMMISSIONED);
26142622
continue;
26152623
}
26162624

@@ -2635,6 +2643,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
26352643
//HDFS-16566 ExcludeReconstructed won't be reconstructed.
26362644
excludeReconstructed.add(blockIndex);
26372645
}
2646+
logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_ALREADY_REACH_REPLICATION_LIMIT);
26382647
continue; // already reached replication limit
26392648
}
26402649

@@ -2646,6 +2655,7 @@ DatanodeDescriptor[] chooseSourceDatanodes(BlockInfo block,
26462655
//HDFS-16566 ExcludeReconstructed won't be reconstructed.
26472656
excludeReconstructed.add(blockIndex);
26482657
}
2658+
logStorageIsNotChooseForReplication(storage, StorageNotChosenReason.REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT);
26492659
continue;
26502660
}
26512661

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
package org.apache.hadoop.hdfs.server.blockmanagement;
2+
3+
4+
import org.apache.hadoop.classification.VisibleForTesting;
5+
import org.slf4j.Logger;
6+
import org.slf4j.LoggerFactory;
7+
8+
import java.util.HashMap;
9+
10+
11+
public enum StorageNotChosenReason {
12+
REPLICA_CORRUPT_OR_EXCESS("stored replica state is corrupt or excess"),
13+
REPLICA_MAINTENANCE_NOT_FOR_READ("stored replica is maintenance not for read"),
14+
REPLICA_DECOMMISSIONED("replica is already decommissioned"),
15+
REPLICA_ALREADY_REACH_REPLICATION_LIMIT("replica already reached replication soft limit"),
16+
REPLICA_ALREADY_REACH_REPLICATION_HARD_LIMIT("replica already reached replication hard limit");
17+
18+
public static final Logger LOG = LoggerFactory.getLogger(
19+
BlockManager.class);
20+
21+
private static final ThreadLocal<HashMap<StorageNotChosenReason, Integer>>
22+
REASONS_SUMMARY = ThreadLocal
23+
.withInitial(() -> new HashMap<StorageNotChosenReason, Integer>());
24+
25+
26+
private static final ThreadLocal<StringBuilder> debugLoggingBuilder
27+
= new ThreadLocal<StringBuilder>() {
28+
@Override
29+
protected StringBuilder initialValue() {
30+
return new StringBuilder();
31+
}
32+
};
33+
34+
private final String text;
35+
36+
StorageNotChosenReason(final String logText) {
37+
text = logText;
38+
}
39+
40+
private String getText() {
41+
return text;
42+
}
43+
44+
public static void start(){
45+
REASONS_SUMMARY.get().clear();
46+
debugLoggingBuilder.get().setLength(0);
47+
}
48+
49+
public static void logStorageIsNotChooseForReplication(DatanodeStorageInfo storage,
50+
StorageNotChosenReason reason, String reasonDetails) {
51+
if(LOG.isDebugEnabled()){
52+
genStorageIsNotChooseForReplication(storage, reason, reasonDetails);
53+
}
54+
}
55+
56+
57+
@VisibleForTesting
58+
static void genStorageIsNotChooseForReplication(DatanodeStorageInfo storage,
59+
StorageNotChosenReason reason, String reasonDetails){
60+
// build the error message for later use.
61+
debugLoggingBuilder.get()
62+
.append("\n Storage ").append((storage==null)?"None":storage)
63+
.append(" is not chosen since ").append(reason.getText());
64+
if (reasonDetails != null) {
65+
debugLoggingBuilder.get().append(" ").append(reasonDetails);
66+
}
67+
debugLoggingBuilder.get().append(".");
68+
final HashMap<StorageNotChosenReason, Integer> reasonMap =
69+
REASONS_SUMMARY.get();
70+
Integer base = reasonMap.get(reason);
71+
if (base == null) {
72+
base = 0;
73+
}
74+
reasonMap.put(reason, base + 1);
75+
}
76+
77+
@VisibleForTesting
78+
static String getStorageNotChosenReason(BlockInfo block){
79+
StringBuilder blockInfoPrefix = new StringBuilder("Block ").append(block);
80+
final HashMap<StorageNotChosenReason, Integer> reasonMap =
81+
REASONS_SUMMARY.get();
82+
if(reasonMap.isEmpty()){
83+
return blockInfoPrefix.append(" successfully chosen storage.").toString();
84+
}else{
85+
blockInfoPrefix.append(" has no chosen storage. Reason: [\n") ;
86+
debugLoggingBuilder.get().append("\n]");
87+
StringBuilder reasonMapResult = new StringBuilder();
88+
reasonMapResult.append("Reason statistics: ").append(reasonMap);
89+
return blockInfoPrefix.append(debugLoggingBuilder.get()).append("\n")
90+
.append(reasonMapResult).toString();
91+
}
92+
}
93+
}

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/blockmanagement/TestBlockManager.java

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,11 +121,7 @@
121121
import static org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState.UNDER_CONSTRUCTION;
122122
import static org.apache.hadoop.test.MetricsAsserts.getLongCounter;
123123
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
124-
import static org.junit.Assert.assertEquals;
125-
import static org.junit.Assert.assertFalse;
126-
import static org.junit.Assert.assertNotNull;
127-
import static org.junit.Assert.assertNull;
128-
import static org.junit.Assert.assertTrue;
124+
import static org.junit.Assert.*;
129125
import static org.mockito.ArgumentMatchers.any;
130126
import static org.mockito.Mockito.doReturn;
131127
import static org.mockito.Mockito.mock;
@@ -2329,4 +2325,52 @@ public void delayDeleteReplica() {
23292325
DataNodeFaultInjector.set(oldInjector);
23302326
}
23312327
}
2328+
2329+
/**
2330+
* Test the log mechasim is working as expected when storage is not chosen
2331+
* @throws IOException
2332+
* @throws InterruptedException
2333+
* @throws TimeoutException
2334+
*/
2335+
@Test(timeout = 6000)
2336+
public void testStorageNotChosenReason() throws InterruptedException {
2337+
String storageID = "storageID";
2338+
DatanodeStorageInfo targetDN = BlockManagerTestUtil
2339+
.newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(),
2340+
new DatanodeStorage("storage_test_0"));
2341+
BlockInfo blk = new BlockInfoContiguous(new Block(0), (short) 0);
2342+
StorageNotChosenReason.start();
2343+
String reason = StorageNotChosenReason.getStorageNotChosenReason(blk);
2344+
assertTrue(reason.contains(storageID) );
2345+
assertFalse(reason.contains(targetDN.toString()));
2346+
assertTrue(reason.contains("successfully chosen storage"));
2347+
assertFalse(reason.contains("is not chosen since"));
2348+
assertFalse(reason.contains("Reason statistics"));
2349+
2350+
int threadNum = 10;
2351+
Thread[] threads = new Thread[threadNum];
2352+
for(int i = 0; i<threadNum;i++){
2353+
final int index = i;
2354+
threads[i] = new Thread(() -> {
2355+
String newStorageID = "storageID"+index;
2356+
StorageNotChosenReason.start();
2357+
DatanodeStorageInfo newTargetStorage = BlockManagerTestUtil
2358+
.newDatanodeStorageInfo(DFSTestUtil.getLocalDatanodeDescriptor(),
2359+
new DatanodeStorage(newStorageID));
2360+
BlockInfo newBlk = new BlockInfoContiguous(new Block(index), (short) index);
2361+
StorageNotChosenReason.genStorageIsNotChooseForReplication(newTargetStorage,
2362+
StorageNotChosenReason.REPLICA_DECOMMISSIONED, null);
2363+
String reason1 = StorageNotChosenReason.getStorageNotChosenReason(newBlk);
2364+
assertTrue(reason1.contains(newBlk.toString()));
2365+
assertTrue(reason1.contains(newStorageID));
2366+
assertTrue(reason1.contains(newTargetStorage.toString()));
2367+
assertTrue(reason1.contains("is not chosen since"));
2368+
assertTrue(reason1.contains("Reason statistics"));
2369+
});
2370+
}
2371+
for(int i = 0;i<threadNum;i++){
2372+
threads[i].start();
2373+
threads[i].join(0);
2374+
}
2375+
}
23322376
}

0 commit comments

Comments
 (0)