From a93234426cd3984a264130605e5f2e069c4d2dc7 Mon Sep 17 00:00:00 2001 From: Wei-Chiu Chuang Date: Tue, 4 Jun 2024 12:34:23 -0700 Subject: [PATCH 1/3] HBASE-28637 asyncwal should attempt to recover lease if close fails Change-Id: I08dc88c8d6ba4e23754ef2acd1e14b32b225ae17 --- .../hadoop/hbase/regionserver/wal/AbstractFSWAL.java | 6 +++++- .../hadoop/hbase/regionserver/wal/AsyncFSWAL.java | 11 +++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index 5f06b04cdf92..3254b338addb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -2022,13 +2022,17 @@ protected final void waitForSafePoint() { } } + protected void recoverLease(FileSystem fs, Path p, Configuration conf) { + } + protected final void closeWriter(W writer, Path path) { inflightWALClosures.put(path.getName(), writer); closeExecutor.execute(() -> { try { writer.close(); } catch (IOException e) { - LOG.warn("close old writer failed", e); + LOG.warn("close old writer failed.", e); + recoverLease(this.fs, path, conf); } finally { // call this even if the above close fails, as there is no other chance we can set closed to // true, it will not cause big problems. 
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java index 8d4afb322d5a..2e082f6d0753 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java @@ -29,6 +29,7 @@ import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.io.asyncfs.AsyncFSOutput; import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor; +import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils; import org.apache.hadoop.hbase.wal.AsyncFSWALProvider; import org.apache.hadoop.hbase.wal.WALProvider.AsyncWriter; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; @@ -208,4 +209,14 @@ protected AsyncWriter createCombinedWriter(AsyncWriter localWriter, AsyncWriter // put remote writer first as usually it will cost more time to finish, so we write to it first return CombinedAsyncWriter.create(remoteWriter, localWriter); } + + @Override + protected void recoverLease(FileSystem fs, Path p, Configuration conf) { + try { + RecoverLeaseFSUtils.recoverFileLease(fs, p, conf, null); + } catch (IOException ex) { + LOG.warn("Unable to recover lease after several attempts. Give up.", ex); + throw new RuntimeException(ex); + } + } } From 6f92d7e7aabad40a87c9ba7033ce1ed36e524559 Mon Sep 17 00:00:00 2001 From: Wei-Chiu Chuang Date: Wed, 5 Jun 2024 11:19:37 -0700 Subject: [PATCH 2/3] move the code into AbstractFSWAL. 
Change-Id: Id2906597d4709d66db370886598f339cd77ee754 --- .../hadoop/hbase/regionserver/wal/AbstractFSWAL.java | 9 ++++++++- .../hadoop/hbase/regionserver/wal/AsyncFSWAL.java | 11 ----------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index 3254b338addb..29f3e297d76b 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -96,6 +96,7 @@ import org.apache.hadoop.hbase.util.CommonFSUtils; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils; import org.apache.hadoop.hbase.wal.AbstractFSWALProvider; import org.apache.hadoop.hbase.wal.WAL; import org.apache.hadoop.hbase.wal.WALEdit; @@ -2022,7 +2023,13 @@ protected final void waitForSafePoint() { } } - protected void recoverLease(FileSystem fs, Path p, Configuration conf) { + private void recoverLease(FileSystem fs, Path p, Configuration conf) { + try { + RecoverLeaseFSUtils.recoverFileLease(fs, p, conf, null); + } catch (IOException ex) { + LOG.error("Unable to recover lease after several attempts. 
Give up.", ex); + throw new RuntimeException(ex); + } } protected final void closeWriter(W writer, Path path) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java index 2e082f6d0753..8d4afb322d5a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AsyncFSWAL.java @@ -29,7 +29,6 @@ import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.io.asyncfs.AsyncFSOutput; import org.apache.hadoop.hbase.io.asyncfs.monitor.StreamSlowMonitor; -import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils; import org.apache.hadoop.hbase.wal.AsyncFSWALProvider; import org.apache.hadoop.hbase.wal.WALProvider.AsyncWriter; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; @@ -209,14 +208,4 @@ protected AsyncWriter createCombinedWriter(AsyncWriter localWriter, AsyncWriter // put remote writer first as usually it will cost more time to finish, so we write to it first return CombinedAsyncWriter.create(remoteWriter, localWriter); } - - @Override - protected void recoverLease(FileSystem fs, Path p, Configuration conf) { - try { - RecoverLeaseFSUtils.recoverFileLease(fs, p, conf, null); - } catch (IOException ex) { - LOG.warn("Unable to recover lease after several attempts. Give up.", ex); - throw new RuntimeException(ex); - } - } } From 44cf751533df1c781ece9c5e497f8e78f17678ec Mon Sep 17 00:00:00 2001 From: Wei-Chiu Chuang Date: Mon, 10 Jun 2024 10:21:06 -0700 Subject: [PATCH 3/3] Do not throw RuntimeException in the close thread pool. 
Change-Id: I68d3b6e15c141433321c58b66b8de9efb10b171d --- .../org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java | 1 - 1 file changed, 1 deletion(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java index 29f3e297d76b..bba9bd534e9a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/AbstractFSWAL.java @@ -2028,7 +2028,6 @@ private void recoverLease(FileSystem fs, Path p, Configuration conf) { RecoverLeaseFSUtils.recoverFileLease(fs, p, conf, null); } catch (IOException ex) { LOG.error("Unable to recover lease after several attempts. Give up.", ex); - throw new RuntimeException(ex); } }