From 62250e26a8607d8d05c483dd205650ac15bb2e7d Mon Sep 17 00:00:00 2001 From: Caroline Zhou Date: Fri, 11 Dec 2020 10:49:09 -0800 Subject: [PATCH] HBASE-25032 Wait for region server to become online before adding it to online servers in Master --- .../hadoop/hbase/master/ServerManager.java | 8 ++--- .../hbase/regionserver/HRegionServer.java | 34 ++++++++++--------- .../apache/hadoop/hbase/MiniHBaseCluster.java | 3 +- .../hbase/master/TestGetReplicationLoad.java | 3 +- .../hbase/master/TestMasterMetrics.java | 11 +++--- .../TestCompactionInDeadRegionServer.java | 3 +- .../TestShutdownWhileWALBroken.java | 3 +- 7 files changed, 32 insertions(+), 33 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 6b9a0a566ed0..064c2e6be67a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -179,7 +179,8 @@ public boolean unregisterListener(final ServerListener listener) { } /** - * Let the server manager know a new regionserver has come online + * Let the server manager know a regionserver is requesting configurations. + * Regionserver will not be added here, but in its first report. * @param request the startup request * @param versionNumber the version number of the new regionserver * @param version the version of the new regionserver, could contain strings like "SNAPSHOT" @@ -202,10 +203,6 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode()); checkClockSkew(sn, request.getServerCurrentTime()); checkIsDead(sn, "STARTUP"); - if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) { - LOG.warn( - "THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn); - } return sn; } @@ -256,7 +253,6 @@ public void regionServerReport(ServerName sn, if (null == this.onlineServers.replace(sn, sl)) { // Already have this host+port combo and its just different start code? // Just let the server in. Presume master joining a running cluster. - // recordNewServer is what happens at the end of reportServerStartup. // The only thing we are skipping is passing back to the regionserver // the ServerName to use. Here we presume a master has already done // that so we'll press on with whatever it gave us for ServerName. diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index a8931de49696..d6981e24dccb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -1026,10 +1026,9 @@ public void run() { // node was created, in case any coprocessors want to use ZooKeeper this.rsHost = new RegionServerCoprocessorHost(this, this.conf); - // Try and register with the Master; tell it we are here. Break if server is stopped or - // the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and - // start up all Services. Use RetryCounter to get backoff in case Master is struggling to - // come up. + // Get configurations from the Master. Break if server is stopped or + // the clusterup flag is down or hdfs went wacky. Then start up all Services. + // Use RetryCounter to get backoff in case Master is struggling to come up. LOG.debug("About to register with Master."); RetryCounterFactory rcf = new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5); @@ -1062,7 +1061,7 @@ public void run() { } } - // We registered with the Master. Go into run mode. + // Run mode. long lastMsg = System.currentTimeMillis(); long oldRequestCount = -1; // The main run loop. @@ -1096,7 +1095,14 @@ public void run() { } long now = System.currentTimeMillis(); if ((now - lastMsg) >= msgInterval) { - tryRegionServerReport(lastMsg, now); + // Register with the Master now that our setup is complete. + if (tryRegionServerReport(lastMsg, now) && !online.get()) { + // Wake up anyone waiting for this server to online + synchronized (online) { + online.set(true); + online.notifyAll(); + } + } lastMsg = System.currentTimeMillis(); } if (!isStopped() && !isAborted()) { @@ -1266,12 +1272,12 @@ private long getWriteRequestCount() { } @InterfaceAudience.Private - protected void tryRegionServerReport(long reportStartTime, long reportEndTime) + protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) throws IOException { RegionServerStatusService.BlockingInterface rss = rssStub; if (rss == null) { // the current server could be stopping. - return; + return false; } ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime); try { @@ -1291,7 +1297,9 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime) // Couldn't connect to the master, get location from zk and reconnect // Method blocks until new master is found or we are stopped createRegionServerStatusStub(true); + return false; } + return true; } /** @@ -1654,11 +1662,6 @@ protected void handleReportForDutyResponse(final RegionServerStartupResponse c) ", sessionid=0x" + Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId())); - // Wake up anyone waiting for this server to online - synchronized (online) { - online.set(true); - online.notifyAll(); - } } catch (Throwable e) { stop("Failed initialization"); throw convertThrowableToIOE(cleanup(e, "Failed init"), @@ -2822,10 +2825,9 @@ private boolean keepLooping() { } /* - * Let the master know we're here Run initialization using parameters passed - * us by the master. + * Run initialization using parameters passed us by the master. * @return A Map of key/value configurations we got from the Master else - * null if we failed to register. + * null if we failed during report. * @throws IOException */ private RegionServerStartupResponse reportForDuty() throws IOException { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java index f795eef86c05..89d19685fb79 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java @@ -449,8 +449,9 @@ public JVMClusterUtil.RegionServerThread startRegionServerAndWait(long timeout) ServerName rsServerName = t.getRegionServer().getServerName(); long start = System.currentTimeMillis(); - ClusterStatus clusterStatus = getClusterStatus(); + ClusterStatus clusterStatus; while ((System.currentTimeMillis() - start) < timeout) { + clusterStatus = getClusterStatus(); if (clusterStatus != null && clusterStatus.getServers().contains(rsServerName)) { return t; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java index ac9ae397267d..1fcd6464be77 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestGetReplicationLoad.java @@ -58,8 +58,9 @@ public MyMaster(Configuration conf) throws IOException, KeeperException, Interru } @Override - protected void tryRegionServerReport(long reportStartTime, long reportEndTime) { + protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) { // do nothing + return true; } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java index 511693338722..3f879846528c 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterMetrics.java @@ -60,12 +60,6 @@ public static class MyMaster extends HMaster { public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException { super(conf); } - @Override - protected void tryRegionServerReport( - long reportStartTime, long reportEndTime) { - // do nothing - } - } @BeforeClass @@ -100,12 +94,15 @@ public void testClusterRequests() throws Exception { request.setServer(ProtobufUtil.toServerName(serverName)); long expectedRequestNumber = 10000; - MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource(); ClusterStatusProtos.ServerLoad sl = ClusterStatusProtos.ServerLoad.newBuilder() .setTotalNumberOfRequests(expectedRequestNumber) .build(); request.setLoad(sl); + MetricsMasterSource masterSource = master.getMasterMetrics().getMetricsSource(); + // Init master source again to reset cluster requests counter + masterSource.init(); + master.getMasterRpcServices().regionServerReport(null, request.build()); boolean tablesOnMaster = LoadBalancer.isTablesOnMaster(TEST_UTIL.getConfiguration()); if (tablesOnMaster) { diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java index 78042cc976fd..9b5c523ee4da 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactionInDeadRegionServer.java @@ -84,13 +84,14 @@ public IgnoreYouAreDeadRS(Configuration conf) throws IOException, InterruptedExc } @Override - protected void tryRegionServerReport(long reportStartTime, long reportEndTime) + protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) throws IOException { try { super.tryRegionServerReport(reportStartTime, reportEndTime); } catch (YouAreDeadException e) { // ignore, do not abort } + return true; } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java index 30c954be5c3f..ffc3a199e477 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestShutdownWhileWALBroken.java @@ -85,13 +85,14 @@ public MyRegionServer(Configuration conf) throws IOException { } @Override - protected void tryRegionServerReport(long reportStartTime, long reportEndTime) + protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) throws IOException { try { super.tryRegionServerReport(reportStartTime, reportEndTime); } catch (YouAreDeadException e) { LOG.info("Caught YouAreDeadException, ignore", e); } + return true; } @Override