From 4604b937c8118a373fd687526587adf1294ce923 Mon Sep 17 00:00:00 2001 From: David Manning Date: Fri, 14 Jun 2024 09:11:28 -0700 Subject: [PATCH 1/3] HBASE-28663 Graceful shutdown of CanaryTool timeouts --- .../apache/hadoop/hbase/tool/CanaryTool.java | 40 +++++++++++++-- .../hadoop/hbase/tool/TestCanaryTool.java | 49 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java index a54bed8935ed..14637699e3b1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java @@ -198,6 +198,10 @@ public interface Sink { long getWriteSuccessCount(); long incWriteSuccessCount(); + + void stop(); + + boolean isStopped(); } /** @@ -208,6 +212,7 @@ public static class StdOutSink implements Sink { readSuccessCount = new AtomicLong(0), writeSuccessCount = new AtomicLong(0); private Map readFailures = new ConcurrentHashMap<>(); private Map writeFailures = new ConcurrentHashMap<>(); + private volatile boolean stopped = false; @Override public long getReadFailureCount() { @@ -268,6 +273,14 @@ public long getWriteSuccessCount() { public long incWriteSuccessCount() { return writeSuccessCount.incrementAndGet(); } + + public void stop() { + stopped = true; + } + + @Override public boolean isStopped() { + return stopped; + } } /** @@ -444,6 +457,9 @@ public ZookeeperTask(Connection connection, String host, String znode, int timeo @Override public Void call() throws Exception { + if (this.sink.isStopped()) { + return null; + } ZooKeeper zooKeeper = null; try { zooKeeper = new ZooKeeper(host, timeout, EmptyWatcher.instance); @@ -498,6 +514,9 @@ public enum TaskType { @Override public Void call() { + if (this.sink.isStopped()) { + return null; + } switch (taskType) { case READ: return read(); @@ -685,6 +704,9 @@ static class RegionServerTask implements Callable { @Override public Void call() { + if (this.sink.isStopped()) { + return null; + } TableName tableName = null; Table table = null; Get get = null; @@ -1075,6 +1097,7 @@ private int runMonitor(String[] monitorTargets) throws Exception { if (currentTimeLength > timeout) { LOG.error("The monitor is running too long (" + currentTimeLength + ") after timeout limit:" + timeout + " will be killed itself !!"); + monitorThread.interrupt(); if (monitor.initialized) { return TIMEOUT_ERROR_EXIT_CODE; } else { @@ -1113,6 +1136,15 @@ public Map getWriteFailures() { return sink.getWriteFailures(); } + /** + * Return a CanaryTool.Sink object containing the detailed results of the canary run. + * The Sink may not have been created if a Monitor thread is not yet running. + * @return the active Sink if one exists, null otherwise. + */ + public Sink getActiveSink() { + return sink; + } + private void printUsageAndExit() { System.err.println( "Usage: canary [OPTIONS] [ [ [ regions = testingUtility.getAdmin().getRegions(tableName); + assertTrue("verify table has multiple regions", regions.size() > 1); + HRegionServer regionserver = testingUtility.getMiniHBaseCluster().getRegionServer(0); + for (RegionInfo region : regions) { + closeRegion(testingUtility, regionserver, region); + } + + // Run CanaryTool with 1 thread. This thread will attempt to scan the first region. + // It will use default rpc retries and receive NotServingRegionExceptions for many seconds + // according to HConstants.RETRY_BACKOFF. The CanaryTool timeout is set to 4 seconds, so it + // will time out before the first region scan is complete. + ExecutorService executor = new ScheduledThreadPoolExecutor(1); + CanaryTool canary = new CanaryTool(executor); + String[] args = { "-t", "4000", tableName.getNameAsString() }; + int retCode = ToolRunner.run(testingUtility.getConfiguration(), canary, args); + executor.shutdown(); + try { + if (!executor.awaitTermination(3, TimeUnit.SECONDS)) { + executor.shutdownNow(); + } + } catch (InterruptedException e) { + executor.shutdownNow(); + } + + CanaryTool.Sink sink = canary.getActiveSink(); + assertEquals("verify canary timed out with TIMEOUT_ERROR_EXIT_CODE", 3, retCode); + assertEquals("verify only the first region failed", 1, sink.getReadFailureCount()); + assertEquals("verify no successful reads", 0, sink.getReadSuccessCount()); + assertEquals("verify we were attempting to scan all regions", + regions.size(), + ((CanaryTool.RegionStdOutSink) sink).getTotalExpectedRegions()); + } + @Test public void testCanaryRegionTaskReadAllCF() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName()); From 5641c20c141fd326d8e72e89441aeaa235cacb81 Mon Sep 17 00:00:00 2001 From: David Manning Date: Fri, 14 Jun 2024 09:45:51 -0700 Subject: [PATCH 2/3] branch-2 uses HRegionInfo still --- .../test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java index 9b8eda60aed2..d08bfac41e0a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; @@ -150,7 +151,7 @@ public void testCanaryStopsScanningAfterTimeout() throws Exception { assertTrue("verify table has multiple regions", regions.size() > 1); HRegionServer regionserver = testingUtility.getMiniHBaseCluster().getRegionServer(0); for (RegionInfo region : regions) { - closeRegion(testingUtility, regionserver, region); + closeRegion(testingUtility, regionserver, new HRegionInfo(region)); } // Run CanaryTool with 1 thread. This thread will attempt to scan the first region. From c05607cce60e197ea8e04277fa8374c6135176a4 Mon Sep 17 00:00:00 2001 From: David Manning Date: Fri, 14 Jun 2024 10:04:23 -0700 Subject: [PATCH 3/3] mvn spotless:apply --- .../java/org/apache/hadoop/hbase/tool/CanaryTool.java | 7 ++++--- .../org/apache/hadoop/hbase/tool/TestCanaryTool.java | 9 ++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java index 14637699e3b1..979fdeb0bb88 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java @@ -278,7 +278,8 @@ public void stop() { stopped = true; } - @Override public boolean isStopped() { + @Override + public boolean isStopped() { return stopped; } } @@ -1137,8 +1138,8 @@ public Map getWriteFailures() { } /** - * Return a CanaryTool.Sink object containing the detailed results of the canary run. - * The Sink may not have been created if a Monitor thread is not yet running. + * Return a CanaryTool.Sink object containing the detailed results of the canary run. The Sink may + * not have been created if a Monitor thread is not yet running. * @return the active Sink if one exists, null otherwise. */ public Sink getActiveSink() { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java index d08bfac41e0a..b0a091d88f3a 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/tool/TestCanaryTool.java @@ -135,8 +135,8 @@ public void testBasicCanaryWorks() throws Exception { } /** - * When CanaryTool times out, it should stop scanning and shutdown quickly and gracefully. - * This test helps to confirm that threadpools do not continue executing work after the canary + * When CanaryTool times out, it should stop scanning and shutdown quickly and gracefully. This + * test helps to confirm that threadpools do not continue executing work after the canary * finishes. It also verifies sink behavior and measures correct failure counts in the sink. * @throws Exception if it can't create a table, communicate with minicluster, or run the canary. */ @@ -146,7 +146,7 @@ public void testCanaryStopsScanningAfterTimeout() throws Exception { // Do not notify HMaster or META. CanaryTool will scan and receive NotServingRegionExceptions. final TableName tableName = TableName.valueOf(name.getMethodName()); // Close the unused Table reference returned by createMultiRegionTable. - testingUtility.createMultiRegionTable(tableName, new byte[][] {FAMILY}).close(); + testingUtility.createMultiRegionTable(tableName, new byte[][] { FAMILY }).close(); List regions = testingUtility.getAdmin().getRegions(tableName); assertTrue("verify table has multiple regions", regions.size() > 1); HRegionServer regionserver = testingUtility.getMiniHBaseCluster().getRegionServer(0); @@ -175,8 +175,7 @@ public void testCanaryStopsScanningAfterTimeout() throws Exception { assertEquals("verify canary timed out with TIMEOUT_ERROR_EXIT_CODE", 3, retCode); assertEquals("verify only the first region failed", 1, sink.getReadFailureCount()); assertEquals("verify no successful reads", 0, sink.getReadSuccessCount()); - assertEquals("verify we were attempting to scan all regions", - regions.size(), + assertEquals("verify we were attempting to scan all regions", regions.size(), ((CanaryTool.RegionStdOutSink) sink).getTotalExpectedRegions()); }