Skip to content

Commit

Permalink
HBASE-25032 Wait for region server to become online before adding it …
Browse files Browse the repository at this point in the history
…to online servers in Master (apache#2771)

Signed-off-by: Bharath Vissapragada <bharathv@apache.org>
Signed-off-by: Michael Stack <stack@apache.org>
(cherry picked from commit 3bb9788)
  • Loading branch information
caroliney14 authored and bharathv committed Mar 26, 2021
1 parent c64850e commit 9bef8de
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 32 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ public boolean unregisterListener(final ServerListener listener) {
}

/**
* Let the server manager know a new regionserver has come online
* Let the server manager know a regionserver is requesting configurations.
* Regionserver will not be added here, but in its first report.
* @param request the startup request
* @param versionNumber the version number of the new regionserver
* @param version the version of the new regionserver, could contain strings like "SNAPSHOT"
Expand All @@ -201,10 +202,6 @@ ServerName regionServerStartup(RegionServerStartupRequest request, int versionNu
ServerName sn = ServerName.valueOf(hostname, request.getPort(), request.getServerStartCode());
checkClockSkew(sn, request.getServerCurrentTime());
checkIsDead(sn, "STARTUP");
if (!checkAndRecordNewServer(sn, ServerMetricsBuilder.of(sn, versionNumber, version))) {
LOG.warn(
"THIS SHOULD NOT HAPPEN, RegionServerStartup" + " could not record the server: " + sn);
}
return sn;
}

Expand Down Expand Up @@ -256,7 +253,6 @@ public void regionServerReport(ServerName sn,
if (null == this.onlineServers.replace(sn, sl)) {
// Already have this host+port combo and its just different start code?
// Just let the server in. Presume master joining a running cluster.
// recordNewServer is what happens at the end of reportServerStartup.
// The only thing we are skipping is passing back to the regionserver
// the ServerName to use. Here we presume a master has already done
// that so we'll press on with whatever it gave us for ServerName.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -975,10 +975,9 @@ public void run() {
// node was created, in case any coprocessors want to use ZooKeeper
this.rsHost = new RegionServerCoprocessorHost(this, this.conf);

// Try and register with the Master; tell it we are here. Break if server is stopped or
// the clusterup flag is down or hdfs went wacky. Once registered successfully, go ahead and
// start up all Services. Use RetryCounter to get backoff in case Master is struggling to
// come up.
// Get configurations from the Master. Break if server is stopped or
// the clusterup flag is down or hdfs went wacky. Then start up all Services.
// Use RetryCounter to get backoff in case Master is struggling to come up.
LOG.debug("About to register with Master.");
RetryCounterFactory rcf =
new RetryCounterFactory(Integer.MAX_VALUE, this.sleeper.getPeriod(), 1000 * 60 * 5);
Expand Down Expand Up @@ -1011,7 +1010,7 @@ public void run() {
}
}

// We registered with the Master. Go into run mode.
// Run mode.
long lastMsg = System.currentTimeMillis();
long oldRequestCount = -1;
// The main run loop.
Expand Down Expand Up @@ -1045,7 +1044,14 @@ public void run() {
}
long now = System.currentTimeMillis();
if ((now - lastMsg) >= msgInterval) {
tryRegionServerReport(lastMsg, now);
// Register with the Master now that our setup is complete.
if (tryRegionServerReport(lastMsg, now) && !online.get()) {
// Wake up anyone waiting for this server to online
synchronized (online) {
online.set(true);
online.notifyAll();
}
}
lastMsg = System.currentTimeMillis();
}
if (!isStopped() && !isAborted()) {
Expand Down Expand Up @@ -1215,12 +1221,12 @@ private long getWriteRequestCount() {
}

@VisibleForTesting
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
RegionServerStatusService.BlockingInterface rss = rssStub;
if (rss == null) {
// the current server could be stopping.
return;
return false;
}
ClusterStatusProtos.ServerLoad sl = buildServerLoad(reportStartTime, reportEndTime);
try {
Expand All @@ -1240,7 +1246,9 @@ protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
// Couldn't connect to the master, get location from zk and reconnect
// Method blocks until new master is found or we are stopped
createRegionServerStatusStub(true);
return false;
}
return true;
}

/**
Expand Down Expand Up @@ -1603,11 +1611,6 @@ protected void handleReportForDutyResponse(final RegionServerStartupResponse c)
", sessionid=0x" +
Long.toHexString(this.zooKeeper.getRecoverableZooKeeper().getSessionId()));

// Wake up anyone waiting for this server to online
synchronized (online) {
online.set(true);
online.notifyAll();
}
} catch (Throwable e) {
stop("Failed initialization");
throw convertThrowableToIOE(cleanup(e, "Failed init"),
Expand Down Expand Up @@ -2726,10 +2729,9 @@ private boolean keepLooping() {
}

/*
* Let the master know we're here Run initialization using parameters passed
* us by the master.
* Run initialization using parameters passed us by the master.
* @return A Map of key/value configurations we got from the Master else
* null if we failed to register.
* null if we failed during report.
* @throws IOException
*/
private RegionServerStartupResponse reportForDuty() throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,9 @@ public JVMClusterUtil.RegionServerThread startRegionServerAndWait(long timeout)
ServerName rsServerName = t.getRegionServer().getServerName();

long start = System.currentTimeMillis();
ClusterStatus clusterStatus = getClusterStatus();
ClusterStatus clusterStatus;
while ((System.currentTimeMillis() - start) < timeout) {
clusterStatus = getClusterStatus();
if (clusterStatus != null && clusterStatus.getServers().contains(rsServerName)) {
return t;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ public MyMaster(Configuration conf) throws IOException, KeeperException, Interru
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime) {
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime) {
// do nothing
return true;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,6 @@ public static class MyMaster extends HMaster {
public MyMaster(Configuration conf) throws IOException, KeeperException, InterruptedException {
super(conf);
}
@Override
protected void tryRegionServerReport(
long reportStartTime, long reportEndTime) {
// do nothing
}

}

@BeforeClass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,14 @@ public IgnoreYouAreDeadRS(Configuration conf) throws IOException, InterruptedExc
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
try {
super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) {
// ignore, do not abort
}
return true;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,14 @@ public MyRegionServer(Configuration conf) throws IOException {
}

@Override
protected void tryRegionServerReport(long reportStartTime, long reportEndTime)
protected boolean tryRegionServerReport(long reportStartTime, long reportEndTime)
throws IOException {
try {
super.tryRegionServerReport(reportStartTime, reportEndTime);
} catch (YouAreDeadException e) {
LOG.info("Caught YouAreDeadException, ignore", e);
}
return true;
}

@Override
Expand Down

0 comments on commit 9bef8de

Please sign in to comment.