Commit f79f1d2

HDFS-17218. NameNode should process time out excess redundancy blocks

1 parent 40e8300 commit f79f1d2
File tree

6 files changed: +345 -8 lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

Lines changed: 7 additions & 0 deletions
@@ -315,6 +315,13 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final int
       DFS_NAMENODE_RECONSTRUCTION_PENDING_TIMEOUT_SEC_DEFAULT = 300;
 
+  public static final String DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_KEY =
+      "dfs.namenode.excess.redundancy.timeout-sec";
+  public static final long DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC = 3600;
+  public static final String DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT
+      = "dfs.namenode.excess.redundancy.timeout.check.limit";
+  public static final long DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT_DEFAULT = 1000;
+
   public static final String DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY =
       "dfs.namenode.maintenance.replication.min";
   public static final int DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_DEFAULT
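These two keys are read once by the BlockManager at startup. A minimal sketch of setting them programmatically before the NameNode starts (for example, in a test); the class name and values below are illustrative, not recommendations:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;

public class ExcessTimeoutConfig {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Invalidate an excess replica if it is still reported 30 minutes after
    // it was recorded in the excess redundancy map.
    conf.setLong(DFSConfigKeys.DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_KEY, 1800L);
    // Check at most 500 excess blocks per redundancy monitor round.
    conf.setLong(DFSConfigKeys.DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT, 500L);
  }
}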

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

Lines changed: 111 additions & 0 deletions
@@ -86,6 +86,7 @@
 import org.apache.hadoop.hdfs.server.blockmanagement.NumberReplicas.StoredReplicaState;
 import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
 import org.apache.hadoop.hdfs.server.blockmanagement.PendingReconstructionBlocks.PendingBlockInfo;
+import org.apache.hadoop.hdfs.server.blockmanagement.ExcessRedundancyMap.ExcessBlockInfo;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
 import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
@@ -116,6 +117,7 @@
 
 import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getInternalBlockLength;
 
+import org.apache.hadoop.hdfs.util.LightWeightHashSet;
 import org.apache.hadoop.metrics2.util.MBeans;
 import org.apache.hadoop.net.Node;
 import org.apache.hadoop.security.UserGroupInformation;
@@ -482,6 +484,16 @@ public int getPendingSPSPaths() {
   /** Storages accessible from multiple DNs. */
   private final ProvidedStorageMap providedStorageMap;
 
+  /**
+   * Timeout for excess redundancy blocks, in milliseconds.
+   */
+  private long excessRedundancyTimeout;
+
+  /**
+   * Limits the number of blocks checked per round for excess redundancy timeout.
+   */
+  private long excessRedundancyTimeoutCheckLimit;
+
   public BlockManager(final Namesystem namesystem, boolean haEnabled,
       final Configuration conf) throws IOException {
     this.namesystem = namesystem;
@@ -589,6 +601,12 @@ public BlockManager(final Namesystem namesystem, boolean haEnabled,
         conf.getBoolean(DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED,
             DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED_DEFAULT);
 
+    setExcessRedundancyTimeout(conf.getLong(DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_KEY,
+        DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC));
+    setExcessRedundancyTimeoutCheckLimit(conf.getLong(
+        DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT,
+        DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT_DEFAULT));
+
     printInitialConfigs();
   }
 
@@ -3040,6 +3058,98 @@ void rescanPostponedMisreplicatedBlocks() {
         (Time.monotonicNow() - startTime), endSize, (startSize - endSize));
     }
   }
+
+  /**
+   * Sets the timeout (in seconds) for excess redundancy blocks. If the provided timeout is
+   * less than or equal to 0, the default value is used (converted to milliseconds).
+   * @param timeOut the time (in seconds) to set as the excess redundancy block timeout.
+   */
+  public void setExcessRedundancyTimeout(long timeOut) {
+    if (timeOut <= 0) {
+      this.excessRedundancyTimeout = DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC * 1000L;
+    } else {
+      this.excessRedundancyTimeout = timeOut * 1000L;
+    }
+  }
+
+  /**
+   * Sets the limit on the number of blocks checked for excess redundancy timeout.
+   * If the provided limit is less than or equal to 0, the default limit is used.
+   *
+   * @param limit the maximum number of blocks to check for excess redundancy timeout.
+   */
+  public void setExcessRedundancyTimeoutCheckLimit(long limit) {
+    if (limit <= 0) {
+      this.excessRedundancyTimeoutCheckLimit =
+          DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT_DEFAULT;
+    } else {
+      this.excessRedundancyTimeoutCheckLimit = limit;
+    }
+  }
+
+  /**
+   * Process timed-out blocks in the excess redundancy map.
+   */
+  void processTimedOutExcessBlocks() {
+    if (excessRedundancyMap.size() == 0) {
+      return;
+    }
+    namesystem.writeLock();
+    long now = Time.monotonicNow();
+    int processed = 0;
+    try {
+      Iterator<Map.Entry<String, LightWeightHashSet<ExcessBlockInfo>>> iter =
+          excessRedundancyMap.getExcessRedundancyMap().entrySet().iterator();
+      while (iter.hasNext() && processed < excessRedundancyTimeoutCheckLimit) {
+        Map.Entry<String, LightWeightHashSet<ExcessBlockInfo>> entry = iter.next();
+        String datanodeUuid = entry.getKey();
+        LightWeightHashSet<ExcessBlockInfo> blocks = entry.getValue();
+        List<ExcessBlockInfo> sortedBlocks = new ArrayList<>(blocks);
+        // Sort blocks by timestamp in descending order.
+        Collections.sort(sortedBlocks);
+
+        for (ExcessBlockInfo excessBlockInfo : sortedBlocks) {
+          if (processed >= excessRedundancyTimeoutCheckLimit) {
+            break;
+          }
+          BlockInfo blockInfo = excessBlockInfo.getBlockInfo();
+          BlockInfo bi = blocksMap.getStoredBlock(blockInfo);
+          if (bi == null || bi.isDeleted()) {
+            continue;
+          }
+
+          // If the datanode doesn't have any excess block that has exceeded
+          // the timeout, we can exit this loop.
+          if (now <= excessBlockInfo.getTimeStamp() + excessRedundancyTimeout) {
+            break;
+          }
+
+          Iterator<DatanodeStorageInfo> iterator = blockInfo.getStorageInfos();
+          while (iterator.hasNext()) {
+            DatanodeStorageInfo datanodeStorageInfo = iterator.next();
+            DatanodeDescriptor datanodeDescriptor = datanodeStorageInfo.getDatanodeDescriptor();
+            if (datanodeDescriptor.getDatanodeUuid().equals(datanodeUuid)) {
+              if (datanodeStorageInfo.getState().equals(State.NORMAL)) {
+                final Block block = getBlockOnStorage(blockInfo,
+                    datanodeStorageInfo);
+                if (!containsInvalidateBlock(datanodeDescriptor, block)) {
+                  addToInvalidates(block, datanodeDescriptor);
+                  LOG.debug("Timed-out excess block ({}, {}) added to invalidates.",
+                      block, datanodeDescriptor);
+                }
+                excessBlockInfo.setTimeStamp();
+                processed++;
+                break;
+              }
+            }
+          }
+        }
+      }
+    } finally {
+      namesystem.writeUnlock("processTimedOutExcessBlocks");
+      LOG.info("processTimedOutExcessBlocks took {} msecs.", (Time.monotonicNow() - now));
+    }
+  }
 
   Collection<Block> processReport(
       final DatanodeStorageInfo storageInfo,
@@ -5231,6 +5341,7 @@ public void run() {
         computeDatanodeWork();
         processPendingReconstructions();
         rescanPostponedMisreplicatedBlocks();
+        processTimedOutExcessBlocks();
         lastRedundancyCycleTS.set(Time.monotonicNow());
       }
       TimeUnit.MILLISECONDS.sleep(redundancyRecheckIntervalMs);
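processTimedOutExcessBlocks caps its work per RedundancyMonitor round and stops scanning a datanode's list at the first entry that has not yet timed out. That scan pattern can be shown in isolation; a minimal sketch under the assumption that entries are visited oldest-first, so the first non-expired entry ends the scan (names and types are illustrative, not the BlockManager API, which additionally revalidates each block and holds the namesystem write lock):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class ExcessScanSketch {
  private static final long TIMEOUT_MS = 3600 * 1000L; // mirrors the 3600s default
  private static final int CHECK_LIMIT = 1000;         // mirrors the per-round limit

  static final class Entry {
    final String blockId;    // stand-in for BlockInfo
    final long timeStampMs;  // when the block entered the excess map

    Entry(String blockId, long timeStampMs) {
      this.blockId = blockId;
      this.timeStampMs = timeStampMs;
    }
  }

  /**
   * Collects entries older than TIMEOUT_MS, visiting at most CHECK_LIMIT entries.
   * With candidates ordered oldest-first, the scan stops at the first entry that
   * has not timed out: every later entry is newer, so it cannot have timed out.
   */
  static List<Entry> findTimedOut(List<Entry> excess, long nowMs) {
    List<Entry> sorted = new ArrayList<>(excess);
    sorted.sort(Comparator.comparingLong((Entry e) -> e.timeStampMs));
    List<Entry> timedOut = new ArrayList<>();
    for (Entry e : sorted) {
      if (timedOut.size() >= CHECK_LIMIT || nowMs <= e.timeStampMs + TIMEOUT_MS) {
        break;
      }
      timedOut.add(e);
    }
    return timedOut;
  }

  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    List<Entry> excess = Arrays.asList(
        new Entry("blk_1", now - 2 * TIMEOUT_MS), // timed out
        new Entry("blk_2", now - 1000L));         // still fresh
    System.out.println(findTimedOut(excess, now).size()); // prints 1
  }
}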

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ExcessRedundancyMap.java

Lines changed: 62 additions & 8 deletions
@@ -27,6 +27,8 @@
 
 import org.apache.hadoop.classification.VisibleForTesting;
 
+import static org.apache.hadoop.util.Time.monotonicNow;
+
 /**
  * Maps a datanode to the set of excess redundancy details.
  *
@@ -35,7 +37,7 @@
 class ExcessRedundancyMap {
   public static final Logger blockLog = NameNode.blockStateChangeLog;
 
-  private final Map<String, LightWeightHashSet<BlockInfo>> map =new HashMap<>();
+  private final Map<String, LightWeightHashSet<ExcessBlockInfo>> map = new HashMap<>();
   private final AtomicLong size = new AtomicLong(0L);
 
   /**
@@ -50,7 +52,7 @@ long size() {
    */
   @VisibleForTesting
   synchronized int getSize4Testing(String dnUuid) {
-    final LightWeightHashSet<BlockInfo> set = map.get(dnUuid);
+    final LightWeightHashSet<ExcessBlockInfo> set = map.get(dnUuid);
     return set == null? 0: set.size();
   }
 
@@ -64,8 +66,8 @@ synchronized void clear() {
    * datanode and the given block?
    */
   synchronized boolean contains(DatanodeDescriptor dn, BlockInfo blk) {
-    final LightWeightHashSet<BlockInfo> set = map.get(dn.getDatanodeUuid());
-    return set != null && set.contains(blk);
+    final LightWeightHashSet<ExcessBlockInfo> set = map.get(dn.getDatanodeUuid());
+    return set != null && set.contains(new ExcessBlockInfo(blk));
   }
 
   /**
@@ -75,12 +77,12 @@ synchronized boolean contains(DatanodeDescriptor dn, BlockInfo blk) {
    * @return true if the block is added.
    */
   synchronized boolean add(DatanodeDescriptor dn, BlockInfo blk) {
-    LightWeightHashSet<BlockInfo> set = map.get(dn.getDatanodeUuid());
+    LightWeightHashSet<ExcessBlockInfo> set = map.get(dn.getDatanodeUuid());
     if (set == null) {
       set = new LightWeightHashSet<>();
       map.put(dn.getDatanodeUuid(), set);
     }
-    final boolean added = set.add(blk);
+    final boolean added = set.add(new ExcessBlockInfo(blk));
     if (added) {
       size.incrementAndGet();
       blockLog.debug("BLOCK* ExcessRedundancyMap.add({}, {})", dn, blk);
@@ -95,12 +97,12 @@ synchronized boolean add(DatanodeDescriptor dn, BlockInfo blk) {
    * @return true if the block is removed.
    */
   synchronized boolean remove(DatanodeDescriptor dn, BlockInfo blk) {
-    final LightWeightHashSet<BlockInfo> set = map.get(dn.getDatanodeUuid());
+    final LightWeightHashSet<ExcessBlockInfo> set = map.get(dn.getDatanodeUuid());
     if (set == null) {
       return false;
     }
 
-    final boolean removed = set.remove(blk);
+    final boolean removed = set.remove(new ExcessBlockInfo(blk));
     if (removed) {
       size.decrementAndGet();
       blockLog.debug("BLOCK* ExcessRedundancyMap.remove({}, {})", dn, blk);
@@ -111,4 +113,56 @@ synchronized boolean remove(DatanodeDescriptor dn, BlockInfo blk) {
     }
     return removed;
   }
+
+  synchronized Map<String, LightWeightHashSet<ExcessBlockInfo>> getExcessRedundancyMap() {
+    return map;
+  }
+
+  /**
+   * Holds information about a block that has excess redundancy. It records the
+   * timestamp at which the block was added to the excess redundancy map.
+   */
+  static class ExcessBlockInfo implements Comparable<ExcessBlockInfo> {
+    private long timeStamp;
+    private BlockInfo blockInfo;
+
+    ExcessBlockInfo(BlockInfo blockInfo) {
+      this.timeStamp = monotonicNow();
+      this.blockInfo = blockInfo;
+    }
+
+    public BlockInfo getBlockInfo() {
+      return blockInfo;
+    }
+
+    long getTimeStamp() {
+      return timeStamp;
+    }
+
+    void setTimeStamp() {
+      timeStamp = monotonicNow();
+    }
+
+    @Override
+    public int hashCode() {
+      return blockInfo.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      if (this == obj) {
+        return true;
+      }
+      if (!(obj instanceof ExcessBlockInfo)) {
+        return false;
+      }
+      ExcessBlockInfo other = (ExcessBlockInfo) obj;
+      return this.blockInfo.equals(other.blockInfo);
+    }
+
+    @Override
+    public int compareTo(ExcessBlockInfo o) {
+      return Long.compare(o.timeStamp, this.timeStamp);
+    }
+  }
 }
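ExcessBlockInfo deliberately delegates equals and hashCode to the wrapped BlockInfo, so contains() and remove() can probe the set with a freshly constructed wrapper even though its timestamp differs from the stored entry's. A stand-alone sketch of that wrapper-equality pattern (a plain HashSet and a stand-in key type, not the HDFS classes):

import java.util.HashSet;
import java.util.Set;

public class WrapperEqualitySketch {
  // Stand-in for ExcessBlockInfo: equality ignores the timestamp.
  static final class TimestampedKey {
    final long blockId;   // stand-in for BlockInfo
    final long timeStamp;

    TimestampedKey(long blockId) {
      this.blockId = blockId;
      this.timeStamp = System.nanoTime();
    }

    @Override
    public boolean equals(Object o) {
      return o instanceof TimestampedKey && ((TimestampedKey) o).blockId == blockId;
    }

    @Override
    public int hashCode() {
      return Long.hashCode(blockId);
    }
  }

  public static void main(String[] args) throws InterruptedException {
    Set<TimestampedKey> set = new HashSet<>();
    set.add(new TimestampedKey(42L));
    Thread.sleep(5); // a later wrapper gets a different timestamp...
    // ...but lookups still match, as in ExcessRedundancyMap.contains/remove.
    System.out.println(set.contains(new TimestampedKey(42L))); // true
    System.out.println(set.remove(new TimestampedKey(42L)));   // true
  }
}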

hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

Lines changed: 18 additions & 0 deletions
@@ -5409,6 +5409,24 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.namenode.excess.redundancy.timeout-sec</name>
+  <value>3600</value>
+  <description>
+    Timeout in seconds for excess redundancy blocks. If this value is 0 or less,
+    it will default to 3600 seconds.
+  </description>
+</property>
+
+<property>
+  <name>dfs.namenode.excess.redundancy.timeout.check.limit</name>
+  <value>1000</value>
+  <description>
+    Limits the number of blocks used to check for excess redundancy timeout.
+    If this value is 0 or less, it will default to 1000.
+  </description>
+</property>
+
 <property>
   <name>dfs.namenode.stale.datanode.minimum.interval</name>
   <value>3</value>
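Operators override these defaults in hdfs-site.xml using the same keys; the values below are illustrative only:

<!-- hdfs-site.xml (illustrative values) -->
<property>
  <name>dfs.namenode.excess.redundancy.timeout-sec</name>
  <value>1800</value>
</property>

<property>
  <name>dfs.namenode.excess.redundancy.timeout.check.limit</name>
  <value>500</value>
</property>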

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java

Lines changed: 19 additions & 0 deletions
@@ -2092,6 +2092,25 @@ public FsDatasetTestUtils getFsDatasetTestUtils(DataNode dn) {
         .newInstance(dn);
   }
 
+  /**
+   * Wait for the datanodes in the cluster to process any block
+   * deletions that have already been asynchronously queued.
+   */
+  public void waitForDNDeletions()
+      throws TimeoutException, InterruptedException {
+    GenericTestUtils.waitFor(new Supplier<Boolean>() {
+      @Override
+      public Boolean get() {
+        for (DataNode dn : getDataNodes()) {
+          if (getFsDatasetTestUtils(dn).getPendingAsyncDeletions() > 0) {
+            return false;
+          }
+        }
+        return true;
+      }
+    }, 1000, 10000);
+  }
+
   /**
    * Gets the rpc port used by the NameNode, because the caller
    * supplied port is not necessarily the actual port used.
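A sketch of how a test might use the new helper: lower a file's replication factor so the NameNode marks replicas as excess, then wait for the datanodes to drain the queued deletions. The scaffolding below is assumed for illustration and is not part of this commit; note that a real test would also need to wait for the NameNode to schedule the invalidations, since deletion commands only reach datanodes via heartbeats:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.MiniDFSCluster;

public class WaitForDNDeletionsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    try {
      cluster.waitActive();
      DistributedFileSystem fs = cluster.getFileSystem();
      Path file = new Path("/excess/file");
      DFSTestUtil.createFile(fs, file, 1024L, (short) 3, 0L);
      // Dropping the replication factor marks two replicas as excess; the
      // NameNode schedules them for deletion on the datanodes.
      fs.setReplication(file, (short) 1);
      // Block until every datanode has drained its async deletion queue.
      cluster.waitForDNDeletions();
    } finally {
      cluster.shutdown();
    }
  }
}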
