From 67cc82029c1b3b06f224b825641163d2db2188d4 Mon Sep 17 00:00:00 2001 From: d-c-manning <67607031+d-c-manning@users.noreply.github.com> Date: Wed, 19 Jun 2024 12:21:18 -0700 Subject: [PATCH] HBASE-28663 Graceful shutdown of CanaryTool timeouts (#5991) Signed-off-by: Viraj Jasani Signed-off-by: Mihir Monani --- .../apache/hadoop/hbase/tool/CanaryTool.java | 41 ++++++++++++++-- .../hadoop/hbase/tool/TestCanaryTool.java | 48 +++++++++++++++++++ 2 files changed, 86 insertions(+), 3 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java index 92dca7c24c92..21e9edfe0688 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/tool/CanaryTool.java @@ -198,6 +198,10 @@ public interface Sink { long getWriteSuccessCount(); long incWriteSuccessCount(); + + void stop(); + + boolean isStopped(); } /** @@ -208,6 +212,7 @@ public static class StdOutSink implements Sink { readSuccessCount = new AtomicLong(0), writeSuccessCount = new AtomicLong(0); private Map readFailures = new ConcurrentHashMap<>(); private Map writeFailures = new ConcurrentHashMap<>(); + private volatile boolean stopped = false; @Override public long getReadFailureCount() { @@ -268,6 +273,15 @@ public long getWriteSuccessCount() { public long incWriteSuccessCount() { return writeSuccessCount.incrementAndGet(); } + + public void stop() { + stopped = true; + } + + @Override + public boolean isStopped() { + return stopped; + } } /** @@ -444,6 +458,9 @@ public ZookeeperTask(Connection connection, String host, String znode, int timeo @Override public Void call() throws Exception { + if (this.sink.isStopped()) { + return null; + } ZooKeeper zooKeeper = null; try { zooKeeper = new ZooKeeper(host, timeout, EmptyWatcher.instance); @@ -498,6 +515,9 @@ public enum TaskType { @Override public Void call() { + if (this.sink.isStopped()) { + return null; + } switch (taskType) { case READ: return read(); @@ -685,6 +705,9 @@ static class RegionServerTask implements Callable { @Override public Void call() { + if (this.sink.isStopped()) { + return null; + } TableName tableName = null; Table table = null; Get get = null; @@ -1075,6 +1098,7 @@ private int runMonitor(String[] monitorTargets) throws Exception { if (currentTimeLength > timeout) { LOG.error("The monitor is running too long (" + currentTimeLength + ") after timeout limit:" + timeout + " will be killed itself !!"); + monitorThread.interrupt(); if (monitor.initialized) { return TIMEOUT_ERROR_EXIT_CODE; } else { @@ -1113,6 +1137,15 @@ public Map getWriteFailures() { return sink.getWriteFailures(); } + /** + * Return a CanaryTool.Sink object containing the detailed results of the canary run. The Sink may + * not have been created if a Monitor thread is not yet running. + * @return the active Sink if one exists, null otherwise. + */ + public Sink getActiveSink() { + return sink; + } + private void printUsageAndExit() { System.err.println( "Usage: canary [OPTIONS] [ [ [ regions = testingUtility.getAdmin().getRegions(tableName); + assertTrue("verify table has multiple regions", regions.size() > 1); + HRegionServer regionserver = testingUtility.getMiniHBaseCluster().getRegionServer(0); + for (RegionInfo region : regions) { + closeRegion(testingUtility, regionserver, region); + } + + // Run CanaryTool with 1 thread. This thread will attempt to scan the first region. + // It will use default rpc retries and receive NotServingRegionExceptions for many seconds + // according to HConstants.RETRY_BACKOFF. The CanaryTool timeout is set to 4 seconds, so it + // will time out before the first region scan is complete. + ExecutorService executor = new ScheduledThreadPoolExecutor(1); + CanaryTool canary = new CanaryTool(executor); + String[] args = { "-t", "4000", tableName.getNameAsString() }; + int retCode = ToolRunner.run(testingUtility.getConfiguration(), canary, args); + executor.shutdown(); + try { + if (!executor.awaitTermination(3, TimeUnit.SECONDS)) { + executor.shutdownNow(); + } + } catch (InterruptedException e) { + executor.shutdownNow(); + } + + CanaryTool.Sink sink = canary.getActiveSink(); + assertEquals("verify canary timed out with TIMEOUT_ERROR_EXIT_CODE", 3, retCode); + assertEquals("verify only the first region failed", 1, sink.getReadFailureCount()); + assertEquals("verify no successful reads", 0, sink.getReadSuccessCount()); + assertEquals("verify we were attempting to scan all regions", regions.size(), + ((CanaryTool.RegionStdOutSink) sink).getTotalExpectedRegions()); + } + @Test public void testCanaryRegionTaskReadAllCF() throws Exception { final TableName tableName = TableName.valueOf(name.getMethodName());