Commit 9b98db2

Ritesh Garg (ritegarg) authored and committed
HDFS-17299. Adding rack failure tolerance when creating a new file (apache#6566)
1 parent 3b4fe79 commit 9b98db2

File tree

7 files changed: +236 -22 lines changed

hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java

Lines changed: 41 additions & 13 deletions
@@ -94,6 +94,7 @@
 import com.google.common.cache.RemovalListener;
 import com.google.common.cache.RemovalNotification;
 
+import com.google.common.collect.Iterables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -607,6 +608,10 @@ private void setPipeline(DatanodeInfo[] nodes, StorageType[] storageTypes,
     this.storageIDs = storageIDs;
   }
 
+  void setAccessToken(Token<BlockTokenIdentifier> t) {
+    this.accessToken = t;
+  }
+
   /**
    * Initialize for data streaming
    */
@@ -706,8 +711,8 @@ public void run() {
 
         // get new block from namenode.
         if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
-          LOG.debug("Allocating new block");
-          setPipeline(nextBlockOutputStream());
+          LOG.debug("Allocating new block: {}", this);
+          setupPipelineForCreate();
           initDataStreaming();
         } else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) {
           LOG.debug("Append to block {}", block);
@@ -1449,9 +1454,11 @@ private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
    * it can be written to.
    * This happens when a file is appended or data streaming fails
    * It keeps on trying until a pipeline is setup
+   *
+   * Returns boolean whether pipeline was setup successfully or not.
+   * This boolean is used upstream on whether to continue creating pipeline or throw exception
    */
   private boolean setupPipelineForAppendOrRecovery() throws IOException {
-    // check number of datanodes
     if (nodes == null || nodes.length == 0) {
       String msg = "Could not get block locations. " + "Source file \""
           + src + "\" - Aborting...";
@@ -1463,23 +1470,35 @@ private boolean setupPipelineForAppendOrRecovery() throws IOException {
 
     boolean success = false;
     long newGS = 0L;
+    boolean isCreateStage = BlockConstructionStage.PIPELINE_SETUP_CREATE == stage;
     while (!success && !streamerClosed && dfsClient.clientRunning) {
       if (!handleRestartingDatanode()) {
         return false;
       }
 
-      final boolean isRecovery = errorState.hasError();
+      final boolean isRecovery = errorState.hasError() && !isCreateStage;
+
       if (!handleBadDatanode()) {
         return false;
       }
 
       handleDatanodeReplacement();
 
+      // During create stage, min replication should still be satisfied.
+      if (isCreateStage && !(dfsClient.dtpReplaceDatanodeOnFailureReplication > 0 &&
+          nodes.length >= dfsClient.dtpReplaceDatanodeOnFailureReplication)) {
+        return false;
+      }
+
       // get a new generation stamp and an access token
       final LocatedBlock lb = updateBlockForPipeline();
       newGS = lb.getBlock().getGenerationStamp();
       accessToken = lb.getBlockToken();
 
+      if (isCreateStage) {
+        block.setCurrentBlock(lb.getBlock());
+      }
+
       // set up the pipeline again with the remaining nodes
       success = createBlockOutputStream(nodes, storageTypes, newGS, isRecovery);
 
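The create-stage guard added above carries the rack-failure tolerance: during PIPELINE_SETUP_CREATE, pipeline recovery only continues while the surviving datanodes still satisfy the client's minimum-replication setting. A minimal sketch of the predicate, assuming survivingNodes stands in for nodes.length and minReplication for dfsClient.dtpReplaceDatanodeOnFailureReplication:

    // Illustrative sketch only, not DataStreamer code; the real values come
    // from the DFSClient configuration and the current pipeline.
    final class CreateStageGuard {
      // Returns true when create-stage pipeline recovery may continue.
      static boolean mayContinue(int survivingNodes, int minReplication) {
        // A non-positive minReplication disables create-stage recovery;
        // otherwise the shrunken pipeline must still meet the minimum.
        return minReplication > 0 && survivingNodes >= minReplication;
      }
    }

When the guard fails, setupPipelineForAppendOrRecovery() returns false and the caller abandons the block and asks the namenode for a fresh one, instead of writing to a pipeline that is already below the minimum.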
@@ -1491,7 +1510,7 @@ private boolean setupPipelineForAppendOrRecovery() throws IOException {
     if (success) {
       updatePipeline(newGS);
     }
-    return false; // do not sleep, continue processing
+    return success;
   }
 
   /**
@@ -1629,17 +1648,18 @@ DatanodeInfo[] getExcludedNodes() {
    * Must get block ID and the IDs of the destinations from the namenode.
    * Returns the list of target datanodes.
    */
-  protected LocatedBlock nextBlockOutputStream() throws IOException {
+  protected void setupPipelineForCreate() throws IOException {
     LocatedBlock lb;
     DatanodeInfo[] nodes;
-    StorageType[] storageTypes;
+    StorageType[] nextStorageTypes;
     int count = dfsClient.getConf().getNumBlockWriteRetry();
     boolean success;
     final ExtendedBlock oldBlock = block.getCurrentBlock();
     do {
       errorState.reset();
       lastException.clear();
       success = false;
+      streamerClosed = false;
 
       DatanodeInfo[] excluded = getExcludedNodes();
       lb = locateFollowingBlock(
@@ -1649,26 +1669,34 @@ protected LocatedBlock nextBlockOutputStream() throws IOException {
       bytesSent = 0;
       accessToken = lb.getBlockToken();
       nodes = lb.getLocations();
-      storageTypes = lb.getStorageTypes();
-
-      // Connect to first DataNode in the list.
-      success = createBlockOutputStream(nodes, storageTypes, 0L, false);
+      nextStorageTypes = lb.getStorageTypes();
+      setPipeline(lb);
+      try {
+        // Connect to first DataNode in the list.
+        success = createBlockOutputStream(nodes, nextStorageTypes, 0L, false)
+            || setupPipelineForAppendOrRecovery();
 
+      } catch(IOException ie) {
+        LOG.warn("Exception in setupPipelineForCreate " + this, ie);
+        success = false;
+      }
       if (!success) {
         LOG.warn("Abandoning " + block);
         dfsClient.namenode.abandonBlock(block.getCurrentBlock(),
             stat.getFileId(), src, dfsClient.clientName);
         block.setCurrentBlock(null);
-        final DatanodeInfo badNode = nodes[errorState.getBadNodeIndex()];
+        final DatanodeInfo badNode = errorState.getBadNodeIndex() == -1
+            ? Iterables.getLast(failed)
+            : nodes[errorState.getBadNodeIndex()];
         LOG.warn("Excluding datanode " + badNode);
         excludedNodes.put(badNode, badNode);
+        setPipeline(null, null, null);
       }
     } while (!success && --count >= 0);
 
     if (!success) {
       throw new IOException("Unable to create new block.");
     }
-    return lb;
   }
 
   // connects to the first datanode in the pipeline
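Taken together, the renamed setupPipelineForCreate() no longer fails the write on the first bad connection: it tries the namenode-provided pipeline, falls back to create-stage recovery on the surviving nodes, and only abandons the block and excludes the offending datanode when both steps fail. A condensed, self-contained sketch of that loop; tryConnectPipeline, tryCreateStageRecovery, and abandonBlockAndExcludeBadNode are hypothetical stand-ins for createBlockOutputStream, setupPipelineForAppendOrRecovery, and the abandon/exclude block above:

    import java.io.IOException;

    // Condensed sketch of the retry shape, not the DataStreamer code itself.
    class PipelineCreateSketch {
      private int retriesLeft = 3; // stands in for getNumBlockWriteRetry()

      void setupPipelineForCreate() throws IOException {
        boolean success;
        do {
          try {
            // Try the pipeline as handed out by the namenode; if a node
            // fails, attempt create-stage recovery on the survivors.
            success = tryConnectPipeline() || tryCreateStageRecovery();
          } catch (IOException ie) {
            success = false;
          }
          if (!success) {
            // Abandon the block and exclude the bad datanode so the next
            // allocation from the namenode avoids it.
            abandonBlockAndExcludeBadNode();
          }
        } while (!success && --retriesLeft >= 0);
        if (!success) {
          throw new IOException("Unable to create new block.");
        }
      }

      private boolean tryConnectPipeline() throws IOException { return false; }
      private boolean tryCreateStageRecovery() throws IOException { return true; }
      private void abandonBlockAndExcludeBadNode() { }
    }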

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java

Lines changed: 4 additions & 1 deletion
@@ -212,7 +212,10 @@ class BlockReceiver implements Closeable {
     } else {
       switch (stage) {
       case PIPELINE_SETUP_CREATE:
-        replicaHandler = datanode.data.createRbw(storageType, block, allowLazyPersist);
+        replicaHandler = datanode.data.createRbw(storageType, block, allowLazyPersist, newGs);
+        if (newGs != 0L) {
+          block.setGenerationStamp(newGs);
+        }
         datanode.notifyNamenodeReceivingBlock(
             block, replicaHandler.getReplica().getStorageUuid());
         break;
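On the receiving datanode, newGs acts as a sentinel: 0L means an ordinary first attempt, while any other value marks a client retry of the same block with a bumped generation stamp, which the receiver records so the on-disk replica matches what the client is about to write. A tiny sketch of the convention, with SketchBlock as a hypothetical stand-in for ExtendedBlock:

    // Sentinel-value sketch; SketchBlock is hypothetical, not ExtendedBlock.
    final class SketchBlock {
      private long generationStamp;

      void setGenerationStamp(long gs) {
        this.generationStamp = gs;
      }

      // Apply a retried create's bumped stamp; 0L means "no update".
      static void applyNewGs(SketchBlock block, long newGs) {
        if (newGs != 0L) {
          block.setGenerationStamp(newGs);
        }
      }
    }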

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java

Lines changed: 11 additions & 1 deletion
@@ -333,6 +333,16 @@ ReplicaHandler createTemporary(StorageType storageType,
   ReplicaHandler createRbw(StorageType storageType,
       ExtendedBlock b, boolean allowLazyPersist) throws IOException;
 
+  /**
+   * Creates a RBW replica and returns the meta info of the replica
+   *
+   * @param b block
+   * @return the meta info of the replica which is being written to
+   * @throws IOException if an error occurs
+   */
+  ReplicaHandler createRbw(StorageType storageType,
+      ExtendedBlock b, boolean allowLazyPersist, long newGS) throws IOException;
+
   /**
    * Recovers a RBW replica and returns the meta info of the replica.
    *
@@ -466,7 +476,7 @@ void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
   boolean isValidRbw(ExtendedBlock b);
 
   /**
-   * Invalidates the specified blocks
+   * Invalidates the specified blocks.
    * @param bpid Block pool Id
    * @param invalidBlks - the blocks to be invalidated
    * @throws IOException
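The interface keeps the original three-argument createRbw for existing callers and adds a four-argument form that threads the new generation stamp through to the dataset; in the patch the delegation lives in FsDatasetImpl, where the old method forwards with 0L (see the next file). A sketch of the overload pattern under simplified, hypothetical types, with a default method standing in for that delegation:

    import java.io.IOException;

    // Simplified sketch of the overload pattern; Handler and the String/long
    // parameters are hypothetical stand-ins for the FsDatasetSpi types.
    interface DatasetSketch {
      final class Handler { }

      // Pre-existing entry point: callers with no GS hint keep working.
      default Handler createRbw(String bpid, long blockId,
          boolean allowLazyPersist) throws IOException {
        return createRbw(bpid, blockId, allowLazyPersist, 0L); // 0L == no new GS
      }

      // New entry point: a non-zero newGS marks a retried create.
      Handler createRbw(String bpid, long blockId,
          boolean allowLazyPersist, long newGS) throws IOException;
    }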

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java

Lines changed: 29 additions & 6 deletions
@@ -1472,13 +1472,29 @@ private void bumpReplicaGS(ReplicaInfo replicaInfo,
   public ReplicaHandler createRbw(
       StorageType storageType, ExtendedBlock b, boolean allowLazyPersist)
       throws IOException {
+    return createRbw(storageType, b, allowLazyPersist, 0L);
+  }
+
+  @Override // FsDatasetSpi
+  public ReplicaHandler createRbw(
+      StorageType storageType, ExtendedBlock b,
+      boolean allowLazyPersist, long newGS) throws IOException {
     try(AutoCloseableLock lock = datasetWriteLock.acquire()) {
       ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(),
           b.getBlockId());
       if (replicaInfo != null) {
-        throw new ReplicaAlreadyExistsException("Block " + b +
-            " already exists in state " + replicaInfo.getState() +
-            " and thus cannot be created.");
+        // In case of retries with same blockPoolId + blockId as before
+        // with updated GS, cleanup the old replica to avoid
+        // any multiple copies with same blockPoolId + blockId
+        if (newGS != 0L) {
+          cleanupReplica(replicaInfo, replicaInfo.getBlockFile(), replicaInfo.getMetaFile(),
+              replicaInfo.getBlockFile().length(), replicaInfo.getMetaFile().length(),
+              b.getBlockPoolId());
+        } else {
+          throw new ReplicaAlreadyExistsException("Block " + b +
+              " already exists in state " + replicaInfo.getState() +
+              " and thus cannot be created.");
+        }
       }
       // create a new block
       FsVolumeReference ref = null;
@@ -3198,16 +3214,21 @@ private void removeOldReplica(ReplicaInfo replicaInfo,
         newReplicaInfo.isOnTransientStorage());
 
     // Remove the old replicas
+    cleanupReplica(replicaInfo, blockFile, metaFile, blockFileUsed, metaFileUsed, bpid);
+
+    // If deletion failed then the directory scanner will cleanup the blocks
+    // eventually.
+  }
+
+  private void cleanupReplica(ReplicaInfo replicaInfo, File blockFile, File metaFile,
+      long blockFileUsed, long metaFileUsed, final String bpid) {
     if (blockFile.delete() || !blockFile.exists()) {
       FsVolumeImpl volume = (FsVolumeImpl) replicaInfo.getVolume();
       volume.onBlockFileDeletion(bpid, blockFileUsed);
       if (metaFile.delete() || !metaFile.exists()) {
         volume.onMetaFileDeletion(bpid, metaFileUsed);
       }
     }
-
-    // If deletion failed then the directory scanner will cleanup the blocks
-    // eventually.
   }
 
   class LazyWriter implements Runnable {
@@ -3526,3 +3547,5 @@ void stopAllDataxceiverThreads(FsVolumeImpl volume) {
     }
   }
 }
+
+
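The factored-out cleanupReplica() is what makes a retried create idempotent on disk: when the same block pool ID and block ID reappear with a new GS, the stale block and meta files are deleted, and volume accounting is decremented only for files actually removed. A self-contained sketch of that shape; VolumeStats is a hypothetical stand-in for the FsVolumeImpl callbacks:

    import java.io.File;

    // Sketch of the delete-then-account shape of cleanupReplica(); not the
    // FsDatasetImpl code itself.
    final class ReplicaCleanupSketch {
      interface VolumeStats {
        void onBlockFileDeletion(String bpid, long bytes);
        void onMetaFileDeletion(String bpid, long bytes);
      }

      static void cleanupReplica(File blockFile, File metaFile,
          long blockFileUsed, long metaFileUsed, String bpid, VolumeStats volume) {
        // Treat "already gone" the same as a successful delete, so a retry
        // after a partial cleanup stays safe.
        if (blockFile.delete() || !blockFile.exists()) {
          volume.onBlockFileDeletion(bpid, blockFileUsed);
          if (metaFile.delete() || !metaFile.exists()) {
            volume.onMetaFileDeletion(bpid, metaFileUsed);
          }
        }
        // If deletion failed, the directory scanner cleans up eventually.
      }
    }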