Commit f52be54

HDFS-17801. EC: Reading support retryCurrentNode to avoid transient errors cause application level failures.
1 parent 6eae158 commit f52be54

File tree: 3 files changed, +156 −14 lines

hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/DFSStripedInputStream.java

Lines changed: 22 additions & 4 deletions

@@ -93,6 +93,13 @@ public class DFSStripedInputStream extends DFSInputStream {
    */
   private final Set<String> warnedNodes =
       Collections.newSetFromMap(new ConcurrentHashMap<>());
+  /**
+   * We use this field to indicate whether we should retry the corresponding reader before
+   * we mark it skipped. possibly retry the same node so that transient errors don't
+   * result in application level failures (e.g. Datanode could have closed the connection
+   * because the client is idle for too long).
+   */
+  private boolean[] retryCurrentReaderFlags;
 
   DFSStripedInputStream(DFSClient dfsClient, String src,
       boolean verifyChecksum, ErasureCodingPolicy ecPolicy,
@@ -112,6 +119,8 @@ public class DFSStripedInputStream extends DFSInputStream {
         dataBlkNum, parityBlkNum);
     decoder = CodecUtil.createRawDecoder(dfsClient.getConfiguration(),
         ecPolicy.getCodecName(), coderOptions);
+    retryCurrentReaderFlags = new boolean[groupSize];
+    Arrays.fill(retryCurrentReaderFlags, true);
     DFSClient.LOG.debug("Creating an striped input stream for file {}", src);
   }
 
@@ -206,21 +215,24 @@ protected void closeCurrentBlockReaders() {
       return;
     }
     for (int i = 0; i < groupSize; i++) {
-      closeReader(blockReaders[i]);
+      retryCurrentReaderFlags[i] = false;
+      closeReader(blockReaders[i], i);
       blockReaders[i] = null;
     }
     blockEnd = -1;
   }
 
-  protected void closeReader(BlockReaderInfo readerInfo) {
+  protected void closeReader(BlockReaderInfo readerInfo, int readerIndex) {
     if (readerInfo != null) {
       if (readerInfo.reader != null) {
         try {
           readerInfo.reader.close();
         } catch (Throwable ignored) {
         }
       }
-      readerInfo.skip();
+      if (!retryCurrentReaderFlags[readerIndex]) {
+        readerInfo.skip();
+      }
     }
   }
 
@@ -516,8 +528,11 @@ protected void fetchBlockByteRange(LocatedBlock block, long start,
       }
       buf.position(buf.position() + (int)(end - start + 1));
     } finally {
+      int index = 0;
       for (BlockReaderInfo preaderInfo : preaderInfos) {
-        closeReader(preaderInfo);
+        retryCurrentReaderFlags[index] = false;
+        closeReader(preaderInfo, index);
+        index++;
       }
     }
   }
@@ -573,4 +588,7 @@ public synchronized void unbuffer() {
     }
   }
 
+  public boolean[] getRetryCurrentReaderFlags() {
+    return retryCurrentReaderFlags;
+  }
 }
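
For orientation, the sketch below models the retry-before-skip behavior that retryCurrentReaderFlags introduces. It is plain standalone Java, not code from the patch, and the class and method names are invented: a reader whose flag is still set gets one more attempt against the same datanode after a failure, and only a second consecutive failure marks that node as skipped.

import java.util.Arrays;

/** Standalone sketch (hypothetical names): retry a reader once before excluding its node. */
public class RetryBeforeSkipSketch {
  private final boolean[] retryFlags;
  private final boolean[] skipped;

  RetryBeforeSkipSketch(int groupSize) {
    retryFlags = new boolean[groupSize];
    skipped = new boolean[groupSize];
    Arrays.fill(retryFlags, true);   // every reader slot starts with one retry available
  }

  /** A read from reader slot i failed (e.g. the idle connection was closed). */
  void onReadFailure(int i) {
    if (retryFlags[i]) {
      retryFlags[i] = false;   // first failure: close the reader but do NOT skip the node
    } else {
      skipped[i] = true;       // failed again after the retry: exclude the node
    }
  }

  /** A read from reader slot i succeeded: restore its retry budget. */
  void onReadSuccess(int i) {
    retryFlags[i] = true;
  }

  boolean shouldSkip(int i) {
    return skipped[i];
  }

  public static void main(String[] args) {
    RetryBeforeSkipSketch s = new RetryBeforeSkipSketch(9);   // RS-6-3: 9 reader slots
    s.onReadFailure(2);
    System.out.println(s.shouldSkip(2));   // false -- node 2 gets another attempt
    s.onReadFailure(2);
    System.out.println(s.shouldSkip(2));   // true  -- repeated failure, node excluded
  }
}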

hadoop-hdfs-project/hadoop-hdfs-client/src/main/java/org/apache/hadoop/hdfs/StripeReader.java

Lines changed: 50 additions & 10 deletions

@@ -38,7 +38,9 @@
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.CompletionService;
 import java.util.concurrent.ExecutorCompletionService;
@@ -174,11 +176,26 @@ void updateState4SuccessRead(StripingChunkReadResult result) {
 
   private void checkMissingBlocks() throws IOException {
     if (alignedStripe.missingChunksNum > parityBlkNum) {
-      clearFutures();
-      throw new IOException(alignedStripe.missingChunksNum
-          + " missing blocks, the stripe is: " + alignedStripe
-          + "; locatedBlocks is: " + dfsStripedInputStream.getLocatedBlocks());
+      if (countOfNullReaderInfos(readerInfos) < parityBlkNum) {
+        clearFutures();
+        throw new IOException(alignedStripe.missingChunksNum
+            + " missing blocks, the stripe is: " + alignedStripe
+            + "; locatedBlocks is: " + dfsStripedInputStream.getLocatedBlocks());
+      }
+    }
+  }
+
+  private int countOfNullReaderInfos(BlockReaderInfo[] readerInfos) {
+    if (readerInfos == null) {
+      return 0;
+    }
+    int count = 0;
+    for (int i = 0; i < readerInfos.length; i++) {
+      if (readerInfos[i] == null) {
+        count++;
+      }
     }
+    return count;
   }
 
   /**
@@ -187,6 +204,23 @@ private void checkMissingBlocks() throws IOException {
    */
   private void readDataForDecoding() throws IOException {
     prepareDecodeInputs();
+
+    if (alignedStripe.missingChunksNum > parityBlkNum) {
+      Set<Integer> recoveredIndexes = new HashSet<>();
+      if (countOfNullReaderInfos(readerInfos) >= parityBlkNum) {
+        for (int index = 0; index < dataBlkNum + parityBlkNum; index++) {
+          if (readerInfos[index] == null) {
+            alignedStripe.chunks[index].state = StripingChunk.REQUESTED;
+            recoveredIndexes.add(index);
+          }
+        }
+      }
+
+      for (int recoveredIndex : recoveredIndexes) {
+        alignedStripe.missingChunksNum--;
+      }
+    }
+
     for (int i = 0; i < dataBlkNum; i++) {
       Preconditions.checkNotNull(alignedStripe.chunks[i]);
       if (alignedStripe.chunks[i].state == StripingChunk.REQUESTED) {
@@ -332,7 +366,7 @@ boolean readChunk(final LocatedBlock block, int chunkIndex)
   }
 
   /**
-   * read the whole stripe. do decoding if necessary
+   * Read the whole stripe. do decoding if necessary.
    */
   void readStripe() throws IOException {
     try {
@@ -349,7 +383,7 @@ void readStripe() throws IOException {
       if (alignedStripe.missingChunksNum > 0) {
         checkMissingBlocks();
         readDataForDecoding();
-        // read parity chunks
+        // Read parity chunks.
         readParityChunks(alignedStripe.missingChunksNum);
       }
     } catch (IOException e) {
@@ -359,7 +393,7 @@ void readStripe() throws IOException {
     // TODO: for a full stripe we can start reading (dataBlkNum + 1) chunks
 
     // Input buffers for potential decode operation, which remains null until
-    // first read failure
+    // first read failure.
     while (!futures.isEmpty()) {
       try {
         long beginReadMS = Time.monotonicNow();
@@ -378,14 +412,20 @@ void readStripe() throws IOException {
           returnedChunk.state = StripingChunk.FETCHED;
           alignedStripe.fetchedChunksNum++;
           updateState4SuccessRead(r);
+          dfsStripedInputStream.getRetryCurrentReaderFlags()[r.index] = true;
           if (alignedStripe.fetchedChunksNum == dataBlkNum) {
            clearFutures();
            break;
          }
        } else {
          returnedChunk.state = StripingChunk.MISSING;
-          // close the corresponding reader
-          dfsStripedInputStream.closeReader(readerInfos[r.index]);
+          // Close the corresponding reader.
+          dfsStripedInputStream.closeReader(readerInfos[r.index], r.index);
+          boolean originalRetryFlag = dfsStripedInputStream.getRetryCurrentReaderFlags()[r.index];
+          if (originalRetryFlag) {
+            dfsStripedInputStream.getRetryCurrentReaderFlags()[r.index] = false;
+            readerInfos[r.index] = null;
+          }
 
           final int missing = alignedStripe.missingChunksNum;
           alignedStripe.missingChunksNum++;
@@ -399,7 +439,7 @@ void readStripe() throws IOException {
        DFSClient.LOG.error(err, ie);
        dfsStripedInputStream.close();
        clearFutures();
-        // Don't decode if read interrupted
+        // Don't decode if read interrupted.
        throw new InterruptedIOException(err);
      }
    }
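
In the diff above, a reader slot that was closed for retry shows up as a null entry in readerInfos, and readDataForDecoding re-requests those chunks instead of treating them as permanently missing. The standalone sketch below is a rough illustration of that recovery step only; the class, enum, and method names are invented and the state handling is simplified relative to the patch.

import java.util.ArrayList;
import java.util.List;

/**
 * Standalone sketch (hypothetical names): if enough reader slots are merely
 * unopened (null) rather than failed, put their chunks back into the
 * REQUESTED state instead of aborting the stripe read.
 */
public class NullReaderRecoverySketch {
  enum ChunkState { REQUESTED, FETCHED, MISSING }

  static int countNullSlots(Object[] readers) {
    int count = 0;
    for (Object r : readers) {
      if (r == null) {
        count++;
      }
    }
    return count;
  }

  /** Re-requests chunks whose reader slot is null; returns how many chunks were recovered. */
  static int recoverUnopenedSlots(Object[] readers, ChunkState[] chunks, int parityBlkNum) {
    List<Integer> recovered = new ArrayList<>();
    if (countNullSlots(readers) >= parityBlkNum) {
      for (int i = 0; i < readers.length; i++) {
        if (readers[i] == null) {
          chunks[i] = ChunkState.REQUESTED;  // give this slot another read attempt
          recovered.add(i);
        }
      }
    }
    return recovered.size();
  }

  public static void main(String[] args) {
    // RS-6-3 layout: 9 slots; slots 0-3 were closed for retry and are null.
    Object[] readers = new Object[9];
    for (int i = 4; i < 9; i++) {
      readers[i] = new Object();
    }
    ChunkState[] chunks = new ChunkState[9];
    for (int i = 0; i < 9; i++) {
      chunks[i] = (readers[i] == null) ? ChunkState.MISSING : ChunkState.FETCHED;
    }
    int missing = 4;                                   // more than parityBlkNum = 3
    missing -= recoverUnopenedSlots(readers, chunks, 3);
    System.out.println("missing after recovery: " + missing);  // prints 0
  }
}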

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSStripedInputStream.java

Lines changed: 84 additions & 0 deletions

@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs;
 
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.HadoopIllegalArgumentException;
@@ -50,6 +51,7 @@
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
+import java.util.Random;
 
 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT;
 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY;
@@ -735,4 +737,86 @@ public void onCreateBlockReader(LocatedBlock block, int chunkIndex,
     assertEquals(rangesExpected, ranges);
   }
 
+  @Test
+  public void testStatefulReadAfterLongTimeIdle() throws Exception {
+    HdfsConfiguration hdfsConf = new HdfsConfiguration();
+    hdfsConf.setInt("dfs.datanode.socket.write.timeout", 5000);
+    hdfsConf.setInt("dfs.client.socket-timeout", 5000);
+    String testBaseDir = "/testECRead";
+    String testfileName = "testfile";
+    try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(hdfsConf)
+        .numDataNodes(9).build()) {
+      cluster.waitActive();
+      final DistributedFileSystem dfs = cluster.getFileSystem();
+      Path dir = new Path(testBaseDir);
+      assertTrue(dfs.mkdirs(dir));
+      dfs.enableErasureCodingPolicy("RS-6-3-1024k");
+      dfs.setErasureCodingPolicy(dir, "RS-6-3-1024k");
+      assertEquals("RS-6-3-1024k", dfs.getErasureCodingPolicy(dir).getName());
+
+      int writeBufSize = 30 * 1024 * 1024 + 1;
+      byte[] writeBuf = new byte[writeBufSize];
+      try (FSDataOutputStream fsdos = dfs.create(
+          new Path(testBaseDir + Path.SEPARATOR + testfileName))) {
+        Random random = new Random();
+        random.nextBytes(writeBuf);
+        fsdos.write(writeBuf, 0, writeBuf.length);
+        Thread.sleep(2000);
+      }
+
+      byte[] readBuf = new byte[6 * 1024 * 1024];
+      try (FSDataInputStream fsdis = dfs.open(
+          new Path(testBaseDir + Path.SEPARATOR + testfileName))) {
+        fsdis.read(readBuf);
+        Thread.sleep(6 * 1000);
+        while ((fsdis.read(readBuf)) > 0) {
+          Thread.sleep(6 * 1000);
+        }
+      }
+      assertTrue(dfs.delete(new Path(testBaseDir + Path.SEPARATOR + testfileName), true));
+    }
+  }
+
+  @Test
+  public void testPReadAfterLongTimeIdle() throws Exception {
+    HdfsConfiguration hdfsConf = new HdfsConfiguration();
+    hdfsConf.setInt("dfs.datanode.socket.write.timeout", 5000);
+    hdfsConf.setInt("dfs.client.socket-timeout", 5000);
+    String testBaseDir = "/testECRead";
+    String testfileName = "testfile";
+    try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(hdfsConf)
+        .numDataNodes(9).build()) {
+      cluster.waitActive();
+      final DistributedFileSystem dfs = cluster.getFileSystem();
+      Path dir = new Path(testBaseDir);
+      assertTrue(dfs.mkdirs(dir));
+      dfs.enableErasureCodingPolicy("RS-6-3-1024k");
+      dfs.setErasureCodingPolicy(dir, "RS-6-3-1024k");
+      assertEquals("RS-6-3-1024k", dfs.getErasureCodingPolicy(dir).getName());
+
+      int writeBufSize = 30 * 1024 * 1024 + 1;
+      byte[] writeBuf = new byte[writeBufSize];
+      try (FSDataOutputStream fsdos = dfs.create(
+          new Path(testBaseDir + Path.SEPARATOR + testfileName))) {
+        Random random = new Random();
+        random.nextBytes(writeBuf);
+        fsdos.write(writeBuf, 0, writeBuf.length);
+        Thread.sleep(2000);
+      }
+
+      byte[] readBuf = new byte[6 * 1024 * 1024];
+      try (FSDataInputStream fsdis = dfs.open(
+          new Path(testBaseDir + Path.SEPARATOR + testfileName))) {
+        int curPos = 0;
+        int readLen = fsdis.read(curPos, readBuf, 0, readBuf.length);
+        curPos += readLen;
+        Thread.sleep(6 * 1000);
+        while ((readLen = fsdis.read(curPos, readBuf, 0, readBuf.length)) > 0) {
+          curPos += readLen;
+          Thread.sleep(6 * 1000);
+        }
+      }
+      assertTrue(dfs.delete(new Path(testBaseDir + Path.SEPARATOR + testfileName), true));
+    }
+  }
 }
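
A note on the test timing: both tests set dfs.datanode.socket.write.timeout and dfs.client.socket-timeout to 5 seconds and then pause 6 seconds between reads, so the idle block-reader connections are liable to be closed by the DataNode. The tests then simply assert that stateful and positional reads keep succeeding; before this change, presumably, each stale reader would be skipped outright, which on an EC stripe could push the unreadable-chunk count past the parity limit and surface as an application-level read failure.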
