Commit f62f116

ritegarg authored and shahrs87 committed
HDFS-17299. Adding rack failure tolerance when creating a new file (#6566)
(cherry picked from commit 58afe43)
1 parent bd8b77f commit f62f116

File tree

8 files changed (+225, -33 lines)

hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DataStreamer.java

Lines changed: 47 additions & 21 deletions

@@ -87,6 +87,7 @@
 import org.apache.hadoop.thirdparty.com.google.common.cache.LoadingCache;
 import org.apache.hadoop.thirdparty.com.google.common.cache.RemovalListener;
 import org.apache.hadoop.thirdparty.com.google.common.cache.RemovalNotification;
+import org.apache.hadoop.thirdparty.com.google.common.collect.Iterables;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -643,17 +644,17 @@ void setAccessToken(Token<BlockTokenIdentifier> t) {
     this.accessToken = t;
   }
 
-  private void setPipeline(LocatedBlock lb) {
+  protected void setPipeline(LocatedBlock lb) {
     setPipeline(lb.getLocations(), lb.getStorageTypes(), lb.getStorageIDs());
   }
 
-  private void setPipeline(DatanodeInfo[] nodes, StorageType[] storageTypes,
-      String[] storageIDs) {
+  protected void setPipeline(DatanodeInfo[] newNodes, StorageType[] newStorageTypes,
+      String[] newStorageIDs) {
     synchronized (nodesLock) {
-      this.nodes = nodes;
+      this.nodes = newNodes;
     }
-    this.storageTypes = storageTypes;
-    this.storageIDs = storageIDs;
+    this.storageTypes = newStorageTypes;
+    this.storageIDs = newStorageIDs;
   }
 
   /**
@@ -748,7 +749,7 @@ public void run() {
 
         if (stage == BlockConstructionStage.PIPELINE_SETUP_CREATE) {
           LOG.debug("Allocating new block: {}", this);
-          setPipeline(nextBlockOutputStream());
+          setupPipelineForCreate();
           initDataStreaming();
         } else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) {
           LOG.debug("Append to block {}", block);
@@ -1607,8 +1608,11 @@ private void transfer(final DatanodeInfo src, final DatanodeInfo[] targets,
    * it can be written to.
    * This happens when a file is appended or data streaming fails
    * It keeps on trying until a pipeline is setup
+   *
+   * Returns boolean whether pipeline was setup successfully or not.
+   * This boolean is used upstream on whether to continue creating pipeline or throw exception
    */
-  private void setupPipelineForAppendOrRecovery() throws IOException {
+  private boolean setupPipelineForAppendOrRecovery() throws IOException {
     // Check number of datanodes. Note that if there is no healthy datanode,
     // this must be internal error because we mark external error in striped
     // outputstream only when all the streamers are in the DATA_STREAMING stage
@@ -1618,33 +1622,46 @@ private void setupPipelineForAppendOrRecovery() throws IOException {
       LOG.warn(msg);
       lastException.set(new IOException(msg));
       streamerClosed = true;
-      return;
+      return false;
     }
-    setupPipelineInternal(nodes, storageTypes, storageIDs);
+    return setupPipelineInternal(nodes, storageTypes, storageIDs);
   }
 
-  protected void setupPipelineInternal(DatanodeInfo[] datanodes,
+  protected boolean setupPipelineInternal(DatanodeInfo[] datanodes,
       StorageType[] nodeStorageTypes, String[] nodeStorageIDs)
       throws IOException {
     boolean success = false;
     long newGS = 0L;
+    boolean isCreateStage = BlockConstructionStage.PIPELINE_SETUP_CREATE == stage;
     while (!success && !streamerClosed && dfsClient.clientRunning) {
       if (!handleRestartingDatanode()) {
-        return;
+        return false;
       }
 
-      final boolean isRecovery = errorState.hasInternalError();
+      final boolean isRecovery = errorState.hasInternalError() && !isCreateStage;
+
       if (!handleBadDatanode()) {
-        return;
+        return false;
       }
 
       handleDatanodeReplacement();
 
+      // During create stage, min replication should still be satisfied.
+      if (isCreateStage && !(dfsClient.dtpReplaceDatanodeOnFailureReplication > 0 &&
+          nodes.length >= dfsClient.dtpReplaceDatanodeOnFailureReplication)) {
+        return false;
+      }
+
       // get a new generation stamp and an access token
       final LocatedBlock lb = updateBlockForPipeline();
       newGS = lb.getBlock().getGenerationStamp();
       accessToken = lb.getBlockToken();
 
+      if (isCreateStage) {
+        block.setCurrentBlock(lb.getBlock());
+      }
+
       // set up the pipeline again with the remaining nodes
       success = createBlockOutputStream(nodes, storageTypes, storageIDs, newGS,
           isRecovery);
@@ -1657,6 +1674,7 @@ protected void setupPipelineInternal(DatanodeInfo[] datanodes,
     if (success) {
       updatePipeline(newGS);
     }
+    return success;
   }
 
   /**
@@ -1795,7 +1813,7 @@ DatanodeInfo[] getExcludedNodes() {
    * Must get block ID and the IDs of the destinations from the namenode.
    * Returns the list of target datanodes.
    */
-  protected LocatedBlock nextBlockOutputStream() throws IOException {
+  protected void setupPipelineForCreate() throws IOException {
     LocatedBlock lb;
     DatanodeInfo[] nodes;
     StorageType[] nextStorageTypes;
@@ -1806,6 +1824,7 @@ protected LocatedBlock nextBlockOutputStream() throws IOException {
     do {
       errorState.resetInternalError();
       lastException.clear();
+      streamerClosed = false;
 
       DatanodeInfo[] excluded = getExcludedNodes();
       lb = locateFollowingBlock(
@@ -1817,26 +1836,33 @@
       nodes = lb.getLocations();
       nextStorageTypes = lb.getStorageTypes();
       nextStorageIDs = lb.getStorageIDs();
+      setPipeline(lb);
+      try {
+        // Connect to first DataNode in the list.
+        success = createBlockOutputStream(nodes, nextStorageTypes, nextStorageIDs, 0L, false)
+            || setupPipelineForAppendOrRecovery();
 
-      // Connect to first DataNode in the list.
-      success = createBlockOutputStream(nodes, nextStorageTypes, nextStorageIDs,
-          0L, false);
-
+      } catch(IOException ie) {
+        LOG.warn("Exception in setupPipelineForCreate " + this, ie);
+        success = false;
+      }
       if (!success) {
         LOG.warn("Abandoning " + block);
         dfsClient.namenode.abandonBlock(block.getCurrentBlock(),
             stat.getFileId(), src, dfsClient.clientName);
         block.setCurrentBlock(null);
-        final DatanodeInfo badNode = nodes[errorState.getBadNodeIndex()];
+        final DatanodeInfo badNode = errorState.getBadNodeIndex() == -1
+            ? Iterables.getLast(failed)
+            : nodes[errorState.getBadNodeIndex()];
         LOG.warn("Excluding datanode " + badNode);
         excludedNodes.put(badNode, badNode);
+        setPipeline(null, null, null);
       }
     } while (!success && --count >= 0);
 
     if (!success) {
       throw new IOException("Unable to create new block.");
     }
-    return lb;
   }
 
   // connects to the first datanode in the pipeline
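
Reviewer note: this DataStreamer change is the heart of the fix. Pipeline setup for a brand-new block (setupPipelineForCreate, formerly nextBlockOutputStream) no longer gives up when the first connection attempt fails; it falls back to the same setupPipelineForAppendOrRecovery path used for appends, which can drop bad datanodes and continue as long as the client's minimum-replication setting is still met (dtpReplaceDatanodeOnFailureReplication in the diff; the governing property is, to my understanding, dfs.client.block.write.replace-datanode-on-failure.min-replication, so treat the exact key as an assumption). Below is a minimal, self-contained sketch of that retry flow; all names (tryConnect, tryRecover, MIN_REPLICATION) are invented stand-ins for the HDFS client internals, not the real API.

import java.util.ArrayList;
import java.util.List;

public class CreatePipelineSketch {
  // Stands in for dfsClient.dtpReplaceDatanodeOnFailureReplication.
  static final int MIN_REPLICATION = 2;
  // Stands in for the client's block-write retry budget.
  static final int RETRIES = 3;

  // Models createBlockOutputStream(): fails if any target is unreachable.
  static boolean tryConnect(List<String> pipeline, List<String> down) {
    return pipeline.stream().noneMatch(down::contains);
  }

  // Models setupPipelineForAppendOrRecovery() during create: drop dead
  // nodes and succeed only if min replication is still satisfied.
  static boolean tryRecover(List<String> pipeline, List<String> down) {
    pipeline.removeAll(down);
    return pipeline.size() >= MIN_REPLICATION;
  }

  public static void main(String[] args) {
    List<String> down = List.of("dn-rack2-a");  // simulate a rack failure
    boolean success = false;
    int count = RETRIES;
    while (!success && count-- >= 0) {
      // Models locateFollowingBlock(): fresh targets from the namenode.
      List<String> pipeline = new ArrayList<>(
          List.of("dn-rack1-a", "dn-rack2-a", "dn-rack3-a"));
      // The new code path: initial connect, then recovery fallback.
      success = tryConnect(pipeline, down) || tryRecover(pipeline, down);
    }
    if (!success) {
      throw new RuntimeException("Unable to create new block.");
    }
    System.out.println("pipeline established with surviving nodes");
  }
}

In the sketch, the first connect fails because a whole rack is down, recovery trims the pipeline to the two surviving nodes, and the write proceeds; that is the rack failure tolerance the commit title describes.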

hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StripedDataStreamer.java

Lines changed: 6 additions & 6 deletions

@@ -90,7 +90,7 @@ private LocatedBlock getFollowingBlock() throws IOException {
   }
 
   @Override
-  protected LocatedBlock nextBlockOutputStream() throws IOException {
+  protected void setupPipelineForCreate() throws IOException {
     boolean success;
     LocatedBlock lb = getFollowingBlock();
     block.setCurrentBlock(lb.getBlock());
@@ -101,7 +101,6 @@ protected LocatedBlock nextBlockOutputStream() throws IOException {
     DatanodeInfo[] nodes = lb.getLocations();
     StorageType[] storageTypes = lb.getStorageTypes();
     String[] storageIDs = lb.getStorageIDs();
-
     // Connect to the DataNode. If fail the internal error state will be set.
     success = createBlockOutputStream(nodes, storageTypes, storageIDs, 0L,
         false);
@@ -113,7 +112,7 @@ protected LocatedBlock nextBlockOutputStream() throws IOException {
       excludedNodes.put(badNode, badNode);
       throw new IOException("Unable to create new block." + this);
     }
-    return lb;
+    setPipeline(lb);
   }
 
   @VisibleForTesting
@@ -122,18 +121,18 @@ LocatedBlock peekFollowingBlock() {
   }
 
   @Override
-  protected void setupPipelineInternal(DatanodeInfo[] nodes,
+  protected boolean setupPipelineInternal(DatanodeInfo[] nodes,
       StorageType[] nodeStorageTypes, String[] nodeStorageIDs)
       throws IOException {
     boolean success = false;
     while (!success && !streamerClosed() && dfsClient.clientRunning) {
       if (!handleRestartingDatanode()) {
-        return;
+        return false;
       }
       if (!handleBadDatanode()) {
         // for striped streamer if it is datanode error then close the stream
         // and return. no need to replace datanode
-        return;
+        return false;
       }
 
       // get a new generation stamp and an access token
@@ -179,6 +178,7 @@ assert getErrorState().hasExternalError()
         setStreamerAsClosed();
       }
     } // while
+    return success;
   }
 
   void setExternalError() {

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java

Lines changed: 4 additions & 1 deletion

@@ -218,7 +218,10 @@ class BlockReceiver implements Closeable {
       switch (stage) {
       case PIPELINE_SETUP_CREATE:
         replicaHandler = datanode.data.createRbw(storageType, storageId,
-            block, allowLazyPersist);
+            block, allowLazyPersist, newGs);
+        if (newGs != 0L) {
+          block.setGenerationStamp(newGs);
+        }
         datanode.notifyNamenodeReceivingBlock(
             block, replicaHandler.getReplica().getStorageUuid());
         break;
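
Reviewer note: on the datanode side, a retried create now arrives carrying a bumped generation stamp (newGs), with 0L serving as the "first attempt, no retry" sentinel. A tiny self-contained model of that decision follows; Block here is a stand-in type, not the HDFS ExtendedBlock.

final class GsSketch {
  static final class Block {
    long generationStamp;
    Block(long gs) { generationStamp = gs; }
  }

  static void receive(Block block, long newGs) {
    if (newGs != 0L) {               // retried create: adopt the bumped stamp
      block.generationStamp = newGs;
    }                                // else first attempt: stamp already current
  }

  public static void main(String[] args) {
    Block b = new Block(1001L);
    receive(b, 0L);                  // first attempt: stays 1001
    receive(b, 1002L);               // retry: bumped to 1002
    System.out.println(b.generationStamp);  // prints 1002
  }
}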

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/FsDatasetSpi.java

Lines changed: 11 additions & 1 deletion

@@ -335,6 +335,16 @@ ReplicaHandler createTemporary(StorageType storageType, String storageId,
   ReplicaHandler createRbw(StorageType storageType, String storageId,
       ExtendedBlock b, boolean allowLazyPersist) throws IOException;
 
+  /**
+   * Creates a RBW replica and returns the meta info of the replica
+   *
+   * @param b block
+   * @return the meta info of the replica which is being written to
+   * @throws IOException if an error occurs
+   */
+  ReplicaHandler createRbw(StorageType storageType, String storageId,
+      ExtendedBlock b, boolean allowLazyPersist, long newGS) throws IOException;
+
   /**
    * Recovers a RBW replica and returns the meta info of the replica.
    *
@@ -468,7 +478,7 @@ void checkBlock(ExtendedBlock b, long minLength, ReplicaState state)
   boolean isValidRbw(ExtendedBlock b);
 
   /**
-   * Invalidates the specified blocks
+   * Invalidates the specified blocks.
    * @param bpid Block pool Id
    * @param invalidBlks - the blocks to be invalidated
    * @throws IOException
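
Reviewer note: the interface change keeps the old four-argument createRbw intact and adds an overload carrying newGS; as the FsDatasetImpl hunk below shows, the legacy method simply delegates with 0L. A hedged sketch of that delegating-overload pattern, with stand-in types rather than the real FsDatasetSpi:

import java.io.IOException;

interface DatasetSketch {
  String createRbw(String block) throws IOException;             // legacy
  String createRbw(String block, long newGS) throws IOException; // new
}

class DatasetImplSketch implements DatasetSketch {
  @Override
  public String createRbw(String block) throws IOException {
    return createRbw(block, 0L);    // old behavior preserved exactly
  }

  @Override
  public String createRbw(String block, long newGS) throws IOException {
    return block + "@gs=" + newGS;  // placeholder for the real creation logic
  }

  public static void main(String[] args) throws IOException {
    DatasetSketch ds = new DatasetImplSketch();
    System.out.println(ds.createRbw("blk_42"));         // blk_42@gs=0
    System.out.println(ds.createRbw("blk_42", 1002L));  // blk_42@gs=1002
  }
}

Delegating with a 0L sentinel keeps every existing FsDatasetSpi implementation and caller source-compatible while letting retried creates pass the updated stamp through.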

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsDatasetImpl.java

Lines changed: 17 additions & 3 deletions

@@ -1585,15 +1585,29 @@ public Replica recoverClose(ExtendedBlock b, long newGS,
   public ReplicaHandler createRbw(
       StorageType storageType, String storageId, ExtendedBlock b,
       boolean allowLazyPersist) throws IOException {
+    return createRbw(storageType, storageId, b, allowLazyPersist, 0L);
+  }
+
+  @Override // FsDatasetSpi
+  public ReplicaHandler createRbw(
+      StorageType storageType, String storageId, ExtendedBlock b,
+      boolean allowLazyPersist, long newGS) throws IOException {
     long startTimeMs = Time.monotonicNow();
     try (AutoCloseableLock lock = lockManager.readLock(LockLevel.BLOCK_POOl,
         b.getBlockPoolId())) {
       ReplicaInfo replicaInfo = volumeMap.get(b.getBlockPoolId(),
           b.getBlockId());
       if (replicaInfo != null) {
-        throw new ReplicaAlreadyExistsException("Block " + b +
-            " already exists in state " + replicaInfo.getState() +
-            " and thus cannot be created.");
+        // In case of retries with same blockPoolId + blockId as before
+        // with updated GS, cleanup the old replica to avoid
+        // any multiple copies with same blockPoolId + blockId
+        if (newGS != 0L) {
+          cleanupReplica(b.getBlockPoolId(), replicaInfo);
+        } else {
+          throw new ReplicaAlreadyExistsException("Block " + b +
+              " already exists in state " + replicaInfo.getState() +
+              " and thus cannot be created.");
+        }
       }
       // create a new block
       FsVolumeReference ref = null;
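
Reviewer note: previously a second createRbw for the same block id always threw ReplicaAlreadyExistsException; with this change a non-zero newGS marks the call as a client retry with an updated generation stamp, and the stale replica is cleaned up so only one copy per blockPoolId + blockId survives. A simplified model of that replace-or-throw logic follows, with a plain map standing in for the volume map and an unchecked exception in place of ReplicaAlreadyExistsException.

import java.util.HashMap;
import java.util.Map;

final class RbwSketch {
  static final Map<Long, String> volumeMap = new HashMap<>(); // blockId -> replica

  static String createRbw(long blockId, long newGS) {
    String existing = volumeMap.get(blockId);
    if (existing != null) {
      if (newGS != 0L) {
        // Retried create: drop the stale replica so a single copy remains.
        volumeMap.remove(blockId);
      } else {
        throw new IllegalStateException(
            "Block " + blockId + " already exists and thus cannot be created.");
      }
    }
    String replica = "rbw-" + blockId + "@gs=" + newGS;
    volumeMap.put(blockId, replica);
    return replica;
  }

  public static void main(String[] args) {
    createRbw(42L, 0L);            // first attempt succeeds
    createRbw(42L, 1002L);         // retry with bumped GS replaces the stale copy
    System.out.println(volumeMap); // {42=rbw-42@gs=1002}
  }
}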
