Skip to content

Commit

Permalink
HBASE-27277 TestRaceBetweenSCPAndTRSP fails in pre commit (#5248)
Browse files Browse the repository at this point in the history
Signed-off-by: GeorryHuang <huangzhuoyue@apache.org>
  • Loading branch information
Apache9 authored May 23, 2023
1 parent e4e7917 commit dc30ca5
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hbase.procedure2;

import com.google.errorprone.annotations.RestrictedApi;
import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.util.HashSet;
Expand Down Expand Up @@ -296,6 +297,12 @@ protected <T extends RemoteOperation> List<T> fetchType(
return (List<T>) requestByType.removeAll(type);
}

/**
 * Reports whether {@code nodeMap} currently holds an entry for the given remote key.
 * <p>
 * Usage is limited to test sources by the {@code RestrictedApi} annotation on this method.
 * @param key the remote identifier to look up in {@code nodeMap}
 * @return {@code true} if {@code nodeMap} contains {@code key}, {@code false} otherwise
 */
@RestrictedApi(explanation = "Should only be called in tests", link = "",
  allowedOnPath = ".*/src/test/.*")
public boolean hasNode(TRemote key) {
  final boolean registered = nodeMap.containsKey(key);
  return registered;
}

// ============================================================================================
// Timeout Helpers
// ============================================================================================
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.hadoop.hbase.master.HMaster;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.RegionPlan;
import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher;
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
import org.apache.hadoop.hbase.master.region.MasterRegion;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
Expand Down Expand Up @@ -147,16 +148,32 @@ public void test() throws Exception {
Future<byte[]> moveFuture = am.moveAsync(new RegionPlan(region, sn, sn));
arriveRegionOpening.await();

// Kill the region server and trigger a SCP
UTIL.getMiniHBaseCluster().killRegionServer(sn);
// Wait until the SCP reaches the getRegionsOnServer call
arriveGetRegionsOnServer.await();
RSProcedureDispatcher remoteDispatcher = UTIL.getMiniHBaseCluster().getMaster()
.getMasterProcedureExecutor().getEnvironment().getRemoteDispatcher();
// This wait is necessary to keep the UT stable. In ServerManager.expireServer we submit
// the SCP, which is then executed in another thread (the PEWorker). So when the SCP reaches
// the getRegionsOnServer call above, it is still possible that the expireServer call has not
// finished yet, and the remote dispatcher still thinks it can dispatch the TRSP. In that
// case we end up in a deadlock, as the TRSP will not schedule a new ORP since it relies on
// the SCP to wake it up after everything is OK. This is not what we want to test in this UT,
// so we need to wait here to prevent it from happening.
// See HBASE-27277 for a more detailed analysis.
UTIL.waitFor(15000, () -> !remoteDispatcher.hasNode(sn));

// Resume the TRSP, it should be able to finish
RESUME_REGION_OPENING.countDown();

moveFuture.get();

ProcedureExecutor<?> procExec =
UTIL.getMiniHBaseCluster().getMaster().getMasterProcedureExecutor();
long scpProcId =
procExec.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
.map(p -> (ServerCrashProcedure) p).findAny().get().getProcId();
// Resume the SCP and make sure it can finish too
RESUME_GET_REGIONS_ON_SERVER.countDown();
UTIL.waitFor(60000, () -> procExec.isFinished(scpProcId));
}
Expand Down

0 comments on commit dc30ca5

Please sign in to comment.