Skip to content

Commit e8d8af3

Browse files
haiyang1987jiajunmao
authored and committed
HDFS-17218. NameNode should process time out excess redundancy blocks (apache#6176). Contributed by Haiyang Hu.
Signed-off-by: He Xiaoqiao <hexiaoqiao@apache.org>
1 parent 1c719e4 commit e8d8af3

File tree

6 files changed

+337
-7
lines changed

6 files changed

+337
-7
lines changed

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,13 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
315315
public static final int
316316
DFS_NAMENODE_RECONSTRUCTION_PENDING_TIMEOUT_SEC_DEFAULT = 300;
317317

318+
public static final String DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_KEY =
319+
"dfs.namenode.excess.redundancy.timeout-sec";
320+
public static final long DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_DEAFULT = 3600;
321+
public static final String DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT
322+
= "dfs.namenode.excess.redundancy.timeout.check.limit";
323+
public static final long DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT_DEFAULT = 1000;
324+
318325
public static final String DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_KEY =
319326
"dfs.namenode.maintenance.replication.min";
320327
public static final int DFS_NAMENODE_MAINTENANCE_REPLICATION_MIN_DEFAULT

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/BlockManager.java

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import java.util.BitSet;
3131
import java.util.Collection;
3232
import java.util.Collections;
33+
import java.util.Comparator;
3334
import java.util.EnumSet;
3435
import java.util.HashMap;
3536
import java.util.HashSet;
@@ -50,6 +51,7 @@
5051
import java.util.concurrent.ConcurrentLinkedQueue;
5152

5253
import java.util.concurrent.atomic.AtomicLong;
54+
import java.util.stream.Collectors;
5355
import javax.management.ObjectName;
5456

5557
import org.apache.hadoop.HadoopIllegalArgumentException;
@@ -86,6 +88,7 @@
8688
import org.apache.hadoop.hdfs.server.blockmanagement.NumberReplicas.StoredReplicaState;
8789
import org.apache.hadoop.hdfs.server.blockmanagement.PendingDataNodeMessages.ReportedBlockInfo;
8890
import org.apache.hadoop.hdfs.server.blockmanagement.PendingReconstructionBlocks.PendingBlockInfo;
91+
import org.apache.hadoop.hdfs.server.blockmanagement.ExcessRedundancyMap.ExcessBlockInfo;
8992
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
9093
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
9194
import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
@@ -116,6 +119,7 @@
116119

117120
import static org.apache.hadoop.hdfs.util.StripedBlockUtil.getInternalBlockLength;
118121

122+
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
119123
import org.apache.hadoop.metrics2.util.MBeans;
120124
import org.apache.hadoop.net.Node;
121125
import org.apache.hadoop.security.UserGroupInformation;
@@ -482,6 +486,16 @@ public int getPendingSPSPaths() {
482486
/** Storages accessible from multiple DNs. */
483487
private final ProvidedStorageMap providedStorageMap;
484488

489+
/**
490+
* Timeout for excess redundancy block.
491+
*/
492+
private long excessRedundancyTimeout;
493+
494+
/**
495+
* Limits number of blocks used to check for excess redundancy timeout.
496+
*/
497+
private long excessRedundancyTimeoutCheckLimit;
498+
485499
public BlockManager(final Namesystem namesystem, boolean haEnabled,
486500
final Configuration conf) throws IOException {
487501
this.namesystem = namesystem;
@@ -589,6 +603,12 @@ public BlockManager(final Namesystem namesystem, boolean haEnabled,
589603
conf.getBoolean(DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED,
590604
DFS_NAMENODE_CORRUPT_BLOCK_DELETE_IMMEDIATELY_ENABLED_DEFAULT);
591605

606+
setExcessRedundancyTimeout(conf.getLong(DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_KEY,
607+
DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_DEAFULT));
608+
setExcessRedundancyTimeoutCheckLimit(conf.getLong(
609+
DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT,
610+
DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT_DEFAULT));
611+
592612
printInitialConfigs();
593613
}
594614

@@ -3041,6 +3061,100 @@ void rescanPostponedMisreplicatedBlocks() {
30413061
(Time.monotonicNow() - startTime), endSize, (startSize - endSize));
30423062
}
30433063
}
3064+
3065+
/**
3066+
* Sets the timeout (in seconds) for excess redundancy blocks, if the provided timeout is
3067+
* less than or equal to 0, the default value is used (converted to milliseconds).
3068+
* @param timeout The time (in seconds) to set as the excess redundancy block timeout.
3069+
*/
3070+
public void setExcessRedundancyTimeout(long timeout) {
3071+
if (timeout <= 0) {
3072+
this.excessRedundancyTimeout = DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_SEC_DEAFULT * 1000L;
3073+
} else {
3074+
this.excessRedundancyTimeout = timeout * 1000L;
3075+
}
3076+
}
3077+
3078+
/**
3079+
* Sets the limit number of blocks for checking excess redundancy timeout.
3080+
* If the provided limit is less than or equal to 0, the default limit is used.
3081+
*
3082+
* @param limit The limit number of blocks used to check for excess redundancy timeout.
3083+
*/
3084+
public void setExcessRedundancyTimeoutCheckLimit(long limit) {
3085+
if (excessRedundancyTimeoutCheckLimit <= 0) {
3086+
this.excessRedundancyTimeoutCheckLimit =
3087+
DFS_NAMENODE_EXCESS_REDUNDANCY_TIMEOUT_CHECK_LIMIT_DEFAULT;
3088+
} else {
3089+
this.excessRedundancyTimeoutCheckLimit = limit;
3090+
}
3091+
}
3092+
3093+
/**
 * Process timed-out blocks in the excess redundancy map.
 *
 * Scans up to excessRedundancyTimeoutCheckLimit excess replicas across all
 * datanodes and re-queues those older than excessRedundancyTimeout for
 * invalidation. Runs under the namesystem write lock; invoked periodically
 * from the redundancy monitor thread.
 */
void processTimedOutExcessBlocks() {
  if (excessRedundancyMap.size() == 0) {
    return;
  }
  namesystem.writeLock();
  long now = Time.monotonicNow();
  int processed = 0;
  try {
    // NOTE(review): iterates the map returned by getExcessRedundancyMap()
    // outside ExcessRedundancyMap's own monitor — relies on all mutations
    // also happening under the namesystem write lock; confirm.
    Iterator<Map.Entry<String, LightWeightHashSet<Block>>> iter =
        excessRedundancyMap.getExcessRedundancyMap().entrySet().iterator();
    while (iter.hasNext() && processed < excessRedundancyTimeoutCheckLimit) {
      Map.Entry<String, LightWeightHashSet<Block>> entry = iter.next();
      String datanodeUuid = entry.getKey();
      LightWeightHashSet<Block> blocks = entry.getValue();
      // Sort blocks by timestamp in ascending order (oldest first), so the
      // inner loop can stop at the first entry that has not yet timed out.
      List<ExcessBlockInfo> sortedBlocks = blocks.stream()
          .filter(block -> block instanceof ExcessBlockInfo)
          .map(block -> (ExcessBlockInfo) block)
          .sorted(Comparator.comparingLong(ExcessBlockInfo::getTimeStamp))
          .collect(Collectors.toList());

      for (ExcessBlockInfo excessBlockInfo : sortedBlocks) {
        // Bound the total work per scan across all datanodes.
        if (processed >= excessRedundancyTimeoutCheckLimit) {
          break;
        }

        processed++;
        // If the datanode doesn't have any excess block that has exceeded the timeout,
        // can exit this loop (entries are ordered oldest-first, so no later
        // entry can have timed out either).
        if (now <= excessBlockInfo.getTimeStamp() + excessRedundancyTimeout) {
          break;
        }

        // Skip blocks that are no longer stored (e.g. file deleted since the
        // replica was recorded as excess).
        BlockInfo blockInfo = excessBlockInfo.getBlockInfo();
        BlockInfo bi = blocksMap.getStoredBlock(blockInfo);
        if (bi == null || bi.isDeleted()) {
          continue;
        }

        // Find this datanode's NORMAL storage holding the block and re-queue
        // the replica for invalidation if it is not already pending.
        // NOTE(review): iterates blockInfo (the reference captured at
        // add-time) rather than the freshly looked-up bi — confirm both
        // always refer to the same stored block object.
        Iterator<DatanodeStorageInfo> iterator = blockInfo.getStorageInfos();
        while (iterator.hasNext()) {
          DatanodeStorageInfo datanodeStorageInfo = iterator.next();
          DatanodeDescriptor datanodeDescriptor = datanodeStorageInfo.getDatanodeDescriptor();
          if (datanodeDescriptor.getDatanodeUuid().equals(datanodeUuid) &&
              datanodeStorageInfo.getState().equals(State.NORMAL)) {
            final Block b = getBlockOnStorage(blockInfo, datanodeStorageInfo);
            if (!containsInvalidateBlock(datanodeDescriptor, b)) {
              addToInvalidates(b, datanodeDescriptor);
              LOG.debug("Excess block timeout ({}, {}) is added to invalidated.",
                  b, datanodeDescriptor);
            }
            // Reset the clock so the replica is not re-queued on every scan
            // while the datanode works through its deletion backlog.
            excessBlockInfo.setTimeStamp();
            break;
          }
        }
      }
    }
  } finally {
    namesystem.writeUnlock("processTimedOutExcessBlocks");
    LOG.info("processTimedOutExcessBlocks {} msecs.", (Time.monotonicNow() - now));
  }
}
30443158

30453159
Collection<Block> processReport(
30463160
final DatanodeStorageInfo storageInfo,
@@ -5232,6 +5346,7 @@ public void run() {
52325346
computeDatanodeWork();
52335347
processPendingReconstructions();
52345348
rescanPostponedMisreplicatedBlocks();
5349+
processTimedOutExcessBlocks();
52355350
lastRedundancyCycleTS.set(Time.monotonicNow());
52365351
}
52375352
TimeUnit.MILLISECONDS.sleep(redundancyRecheckIntervalMs);

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/ExcessRedundancyMap.java

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,15 @@
2121
import java.util.Map;
2222
import java.util.concurrent.atomic.AtomicLong;
2323

24+
import org.apache.hadoop.hdfs.protocol.Block;
2425
import org.apache.hadoop.hdfs.server.namenode.NameNode;
2526
import org.apache.hadoop.hdfs.util.LightWeightHashSet;
2627
import org.slf4j.Logger;
2728

2829
import org.apache.hadoop.classification.VisibleForTesting;
2930

31+
import static org.apache.hadoop.util.Time.monotonicNow;
32+
3033
/**
3134
 * Maps a datanode to the set of excess redundancy details.
3235
*
@@ -35,7 +38,7 @@
3538
class ExcessRedundancyMap {
3639
public static final Logger blockLog = NameNode.blockStateChangeLog;
3740

38-
private final Map<String, LightWeightHashSet<BlockInfo>> map =new HashMap<>();
41+
private final Map<String, LightWeightHashSet<Block>> map = new HashMap<>();
3942
private final AtomicLong size = new AtomicLong(0L);
4043

4144
/**
@@ -50,7 +53,7 @@ long size() {
5053
*/
5154
@VisibleForTesting
5255
synchronized int getSize4Testing(String dnUuid) {
53-
final LightWeightHashSet<BlockInfo> set = map.get(dnUuid);
56+
final LightWeightHashSet<Block> set = map.get(dnUuid);
5457
return set == null? 0: set.size();
5558
}
5659

@@ -64,7 +67,7 @@ synchronized void clear() {
6467
* datanode and the given block?
6568
*/
6669
synchronized boolean contains(DatanodeDescriptor dn, BlockInfo blk) {
67-
final LightWeightHashSet<BlockInfo> set = map.get(dn.getDatanodeUuid());
70+
final LightWeightHashSet<Block> set = map.get(dn.getDatanodeUuid());
6871
return set != null && set.contains(blk);
6972
}
7073

@@ -75,12 +78,12 @@ synchronized boolean contains(DatanodeDescriptor dn, BlockInfo blk) {
7578
* @return true if the block is added.
7679
*/
7780
synchronized boolean add(DatanodeDescriptor dn, BlockInfo blk) {
78-
LightWeightHashSet<BlockInfo> set = map.get(dn.getDatanodeUuid());
81+
LightWeightHashSet<Block> set = map.get(dn.getDatanodeUuid());
7982
if (set == null) {
8083
set = new LightWeightHashSet<>();
8184
map.put(dn.getDatanodeUuid(), set);
8285
}
83-
final boolean added = set.add(blk);
86+
final boolean added = set.add(new ExcessBlockInfo(blk));
8487
if (added) {
8588
size.incrementAndGet();
8689
blockLog.debug("BLOCK* ExcessRedundancyMap.add({}, {})", dn, blk);
@@ -95,11 +98,10 @@ synchronized boolean add(DatanodeDescriptor dn, BlockInfo blk) {
9598
* @return true if the block is removed.
9699
*/
97100
synchronized boolean remove(DatanodeDescriptor dn, BlockInfo blk) {
98-
final LightWeightHashSet<BlockInfo> set = map.get(dn.getDatanodeUuid());
101+
final LightWeightHashSet<Block> set = map.get(dn.getDatanodeUuid());
99102
if (set == null) {
100103
return false;
101104
}
102-
103105
final boolean removed = set.remove(blk);
104106
if (removed) {
105107
size.decrementAndGet();
@@ -111,4 +113,45 @@ synchronized boolean remove(DatanodeDescriptor dn, BlockInfo blk) {
111113
}
112114
return removed;
113115
}
116+
117+
/**
 * Returns the underlying datanode-uuid to excess-replica-set map.
 *
 * NOTE(review): exposes the internal mutable map. Callers (e.g. the
 * BlockManager timeout scan) iterate it after this synchronized getter
 * returns, i.e. outside this object's monitor — confirm all mutations
 * happen under a common external lock to avoid concurrent modification.
 */
synchronized Map<String, LightWeightHashSet<Block>> getExcessRedundancyMap() {
  return map;
}
120+
121+
/**
 * An object that contains information about a block that is being excess redundancy.
 * It records the timestamp when added excess redundancy map of this block.
 */
static class ExcessBlockInfo extends Block {
  // Monotonic time (ms) when this replica was last recorded/re-armed as
  // excess; compared against the timeout by the periodic scan.
  private long timeStamp;
  // The stored BlockInfo this excess entry refers to.
  private final BlockInfo blockInfo;

  ExcessBlockInfo(BlockInfo blockInfo) {
    super(blockInfo.getBlockId(), blockInfo.getNumBytes(), blockInfo.getGenerationStamp());
    this.timeStamp = monotonicNow();
    this.blockInfo = blockInfo;
  }

  public BlockInfo getBlockInfo() {
    return blockInfo;
  }

  long getTimeStamp() {
    return timeStamp;
  }

  // Re-arms the timeout clock (no-argument by design: always "now").
  void setTimeStamp() {
    timeStamp = monotonicNow();
  }

  // hashCode/equals deliberately delegate to Block so that set lookups and
  // removals keyed by a plain Block/BlockInfo still match this wrapper
  // (timeStamp and blockInfo are intentionally excluded from equality).
  @Override
  public int hashCode() {
    return super.hashCode();
  }

  @Override
  public boolean equals(Object obj) {
    return super.equals(obj);
  }
}
114157
}

hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5425,6 +5425,24 @@
54255425
</description>
54265426
</property>
54275427

5428+
<property>
5429+
<name>dfs.namenode.excess.redundancy.timeout-sec</name>
5430+
<value>3600</value>
5431+
<description>
5432+
Timeout in seconds for excess redundancy block. If this value is 0 or less,
5433+
    then it will default to 3600 seconds.
5434+
</description>
5435+
</property>
5436+
5437+
<property>
5438+
<name>dfs.namenode.excess.redundancy.timeout.check.limit</name>
5439+
<value>1000</value>
5440+
<description>
5441+
Limits number of blocks used to check for excess redundancy timeout.
5442+
If this value is 0 or less, then it will default to 1000.
5443+
</description>
5444+
</property>
5445+
54285446
<property>
54295447
<name>dfs.namenode.stale.datanode.minimum.interval</name>
54305448
<value>3</value>

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,6 +2092,25 @@ public FsDatasetTestUtils getFsDatasetTestUtils(DataNode dn) {
20922092
.newInstance(dn);
20932093
}
20942094

2095+
/**
2096+
* Wait for the datanodes in the cluster to process any block
2097+
* deletions that have already been asynchronously queued.
2098+
*/
2099+
public void waitForDNDeletions()
2100+
throws TimeoutException, InterruptedException {
2101+
GenericTestUtils.waitFor(new Supplier<Boolean>() {
2102+
@Override
2103+
public Boolean get() {
2104+
for (DataNode dn : getDataNodes()) {
2105+
if (getFsDatasetTestUtils(dn).getPendingAsyncDeletions() > 0) {
2106+
return false;
2107+
}
2108+
}
2109+
return true;
2110+
}
2111+
}, 1000, 10000);
2112+
}
2113+
20952114
/**
20962115
* Gets the rpc port used by the NameNode, because the caller
20972116
* supplied port is not necessarily the actual port used.

0 commit comments

Comments
 (0)