From 91519e8befd8507794d73dfcef1437dae4d26d96 Mon Sep 17 00:00:00 2001 From: Viraj Jasani Date: Thu, 4 Apr 2024 21:13:10 -0800 Subject: [PATCH] HBASE-28366 Mis-order of SCP and regionServerReport results into region inconsistencies (#5774) Signed-off-by: Duo Zhang Signed-off-by: Bryan Beaudreault --- .../hadoop/hbase/master/ServerManager.java | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index 8b8f42201bfc..f7115a5cefb1 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -293,8 +293,24 @@ public void regionServerReport(ServerName sn, ServerMetrics sl) throws YouAreDea // the ServerName to use. Here we presume a master has already done // that so we'll press on with whatever it gave us for ServerName. if (!checkAndRecordNewServer(sn, sl)) { - LOG.info("RegionServerReport ignored, could not record the server: " + sn); - return; // Not recorded, so no need to move on + // Master already registered server with same (host + port) and higher startcode. + // This can happen if regionserver report comes late from old server + // (possible race condition), by that time master has already processed SCP for that + // server and started accepting regionserver report from new server i.e. server with + // same (host + port) and higher startcode. + // The exception thrown here is not meant to tell the region server it is dead because if + // there is a new server on the same host and port, the old server should have already been + // dead in an ideal situation. + // The exception thrown here is to skip the later steps of the whole regionServerReport + // request processing. 
Usually, after recording it in ServerManager, we will call the + // related methods in AssignmentManager to record region states. If the region server + // is already dead, we should not do these steps anymore, so here we throw an exception + // to let the upper layer know that they should not continue processing anymore. + final String errorMsg = "RegionServerReport received from " + sn + + ", but another server with the same name and higher startcode is already registered," + + " ignoring"; + LOG.warn(errorMsg); + throw new YouAreDeadException(errorMsg); } } updateLastFlushedSequenceIds(sn, sl);