Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HBASE-25973 Balancer should explain progress in a better way in log -… #3483

Merged
merged 2 commits into from
Jul 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1877,6 +1877,8 @@ public List<RegionPlan> executeRegionPlansWithThrottling(List<RegionPlan> plans)
}
}
}
LOG.info("Balancer is going into sleep until next period in {}ms", getConfiguration()
.getInt(HConstants.HBASE_BALANCER_PERIOD, HConstants.DEFAULT_HBASE_BALANCER_PERIOD));
return successRegionPlans;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ public class StochasticLoadBalancer extends BaseLoadBalancer {

private List<CandidateGenerator> candidateGenerators;
private List<CostFunction> costFunctions; // FindBugs: Wants this protected; IS2_INCONSISTENT_SYNC

// Saves the sum of the currently configured multipliers. Defaults to 1 for cases that carry high cost
private float sumMultiplier = 1.0f;
// to save and report costs to JMX
private double curOverallCost = 0d;
private double[] tempFunctionCosts;
Expand Down Expand Up @@ -229,7 +230,6 @@ protected void loadConf(Configuration conf) {

regionReplicaHostCostFunction = new RegionReplicaHostCostFunction(conf);
regionReplicaRackCostFunction = new RegionReplicaRackCostFunction(conf);

costFunctions = new ArrayList<>();
addCostFunction(new RegionCountSkewCostFunction(conf));
addCostFunction(new PrimaryRegionCountSkewCostFunction(conf));
Expand Down Expand Up @@ -310,63 +310,66 @@ private boolean areSomeRegionReplicasColocated(BalancerClusterState c) {
boolean needsBalance(TableName tableName, BalancerClusterState cluster) {
ClusterLoadState cs = new ClusterLoadState(cluster.clusterState);
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
if (LOG.isDebugEnabled()) {
LOG.debug("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
}
if (this.isBalancerRejectionRecording) {
sendRejectionReasonToRingBuffer("The number of RegionServers " +
cs.getNumServers() + " < MIN_SERVER_BALANCE(" + MIN_SERVER_BALANCE + ")", null);
}
LOG.info("Not running balancer because only " + cs.getNumServers() +
" active regionserver(s)");
sendRejectionReasonToRingBuffer(
"The number of RegionServers " + cs.getNumServers() + " < MIN_SERVER_BALANCE(" +
MIN_SERVER_BALANCE + ")", null);
return false;
}
if (areSomeRegionReplicasColocated(cluster)) {
LOG.info("Running balancer because at least one server hosts replicas of the same region.");
return true;
}

if (idleRegionServerExist(cluster)){
LOG.info("Running balancer because cluster has idle server(s).");
return true;
}

sumMultiplier = 0.0f;
double total = 0.0;
float sumMultiplier = 0.0f;
for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier();
if (multiplier <= 0) {
LOG.trace("{} not needed because multiplier is <= 0", c.getClass().getSimpleName());
continue;
}
double cost = c.cost();
if (!c.isNeeded()) {
LOG.trace("{} not needed", c.getClass().getSimpleName());
continue;
}
total += cost * multiplier;
sumMultiplier += multiplier;
total += c.cost() * multiplier;
}

boolean balanced = total <= 0 || sumMultiplier <= 0 ||
(sumMultiplier > 0 && (total / sumMultiplier) < minCostNeedBalance);
if(balanced && isBalancerRejectionRecording){
String reason = "";
if (total <= 0) {
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " + total + " <= 0";
} else if (sumMultiplier <= 0) {
reason = "sumMultiplier = " + sumMultiplier + " <= 0";
} else if ((total / sumMultiplier) < minCostNeedBalance) {
reason =
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " + (total
/ sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
}
sendRejectionReasonToRingBuffer(reason, costFunctions);
}
if (LOG.isDebugEnabled()) {
LOG.debug("{} {}; total cost={}, sum multiplier={}; cost/multiplier to need a balance is {}",
balanced ? "Skipping load balancing because balanced" : "We need to load balance",
isByTable ? String.format("table (%s)", tableName) : "cluster",
total, sumMultiplier, minCostNeedBalance);
if (LOG.isTraceEnabled()) {
LOG.trace("Balance decision detailed function costs={}", functionCost());
}
if (sumMultiplier <= 0) {
LOG.error("At least one cost function needs a multiplier > 0. For example, set "
+ "hbase.master.balancer.stochastic.regionCountCost to a positive value or default");
return false;
}

boolean balanced = (total / sumMultiplier < minCostNeedBalance);
if (balanced) {
if (isBalancerRejectionRecording) {
String reason = "";
if (total <= 0) {
reason = "(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern) = " +
total + " <= 0";
} else if (sumMultiplier <= 0) {
reason = "sumMultiplier = " + sumMultiplier + " <= 0";
} else if ((total / sumMultiplier) < minCostNeedBalance) {
reason =
"[(cost1*multiplier1)+(cost2*multiplier2)+...+(costn*multipliern)]/sumMultiplier = " +
(total / sumMultiplier) + " <= minCostNeedBalance(" + minCostNeedBalance + ")";
}
sendRejectionReasonToRingBuffer(reason, costFunctions);
}
LOG.info("{} - skipping load balancing because weighted average imbalance={} <= "
+ "threshold({}). If you want more aggressive balancing, either lower "
+ "hbase.master.balancer.stochastic.minCostNeedBalance from {} or increase the relative "
+ "multiplier(s) of the specific cost function(s). functionCost={}",
isByTable ? "Table specific ("+tableName+")" : "Cluster wide", total / sumMultiplier,
minCostNeedBalance, minCostNeedBalance, functionCost());
} else {
LOG.info("{} - Calculating plan. may take up to {}ms to complete.",
isByTable ? "Table specific ("+tableName+")" : "Cluster wide", maxRunningTime);
}
return !balanced;
}
Expand Down Expand Up @@ -452,8 +455,9 @@ protected List<RegionPlan> balanceTable(TableName tableName, Map<ServerName,
maxSteps);
}
}
LOG.info("start StochasticLoadBalancer.balancer, initCost=" + currentCost + ", functionCost="
+ functionCost() + " computedMaxSteps: " + computedMaxSteps);
LOG.info("Start StochasticLoadBalancer.balancer, initial weighted average imbalance={}, "
+ "functionCost={} computedMaxSteps={}",
currentCost / sumMultiplier, functionCost(), computedMaxSteps);

final String initFunctionTotalCosts = totalCostsPerFunc();
// Perform a stochastic walk to see if we can get a good fit.
Expand Down Expand Up @@ -499,17 +503,19 @@ protected List<RegionPlan> balanceTable(TableName tableName, Map<ServerName,
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
if (initCost > currentCost) {
plans = createRegionPlans(cluster);
LOG.info("Finished computing new load balance plan. Computation took {}" +
" to try {} different iterations. Found a solution that moves " +
"{} regions; Going from a computed cost of {}" +
" to a new cost of {}", java.time.Duration.ofMillis(endTime - startTime),
step, plans.size(), initCost, currentCost);
LOG.info("Finished computing new moving plan. Computation took {} ms" +
" to try {} different iterations. Found a solution that moves " +
"{} regions; Going from a computed imbalance of {}" +
" to a new imbalance of {}. ",
endTime - startTime, step, plans.size(),
initCost / sumMultiplier, currentCost / sumMultiplier);

sendRegionPlansToRingBuffer(plans, currentCost, initCost, initFunctionTotalCosts, step);
return plans;
}
LOG.info("Could not find a better load balance plan. Tried {} different configurations in " +
"{}, and did not find anything with a computed cost less than {}", step,
java.time.Duration.ofMillis(endTime - startTime), initCost);
LOG.info("Could not find a better moving plan. Tried {} different configurations in " +
"{} ms, and did not find anything with an imbalance score less than {}", step,
endTime - startTime, initCost / sumMultiplier);
return null;
}

Expand All @@ -520,8 +526,7 @@ private void sendRejectionReasonToRingBuffer(String reason, List<CostFunction> c
.setReason(reason);
if (costFunctions != null) {
for (CostFunction c : costFunctions) {
float multiplier = c.getMultiplier();
if (multiplier <= 0 || !c.isNeeded()) {
if (!c.isNeeded()) {
continue;
}
builder.addCostFuncInfo(c.getClass().getName(), c.cost(), c.getMultiplier());
Expand Down Expand Up @@ -580,7 +585,8 @@ private void updateStochasticCosts(TableName tableName, double overall, double[]
}

private void addCostFunction(CostFunction costFunction) {
if (costFunction.getMultiplier() > 0) {
float multiplier = costFunction.getMultiplier();
if (multiplier > 0) {
costFunctions.add(costFunction);
}
}
Expand All @@ -591,9 +597,13 @@ private String functionCost() {
builder.append(c.getClass().getSimpleName());
builder.append(" : (");
if (c.isNeeded()) {
builder.append(c.getMultiplier());
builder.append("multiplier=" + c.getMultiplier());
builder.append(", ");
builder.append(c.cost());
double cost = c.cost();
builder.append("imbalance=" + cost);
if (cost < minCostNeedBalance) {
builder.append(", balanced");
}
} else {
builder.append("not needed");
}
Expand All @@ -605,7 +615,7 @@ private String functionCost() {
private String totalCostsPerFunc() {
StringBuilder builder = new StringBuilder();
for (CostFunction c : costFunctions) {
if (c.getMultiplier() <= 0 || !c.isNeeded()) {
if (!c.isNeeded()) {
continue;
}
double cost = c.getMultiplier() * c.cost();
Expand Down Expand Up @@ -689,7 +699,7 @@ void initCosts(BalancerClusterState cluster) {
allowedOnPath = ".*(/src/test/.*|StochasticLoadBalancer).java")
void updateCostsWithAction(BalancerClusterState cluster, BalanceAction action) {
for (CostFunction c : costFunctions) {
if (c.getMultiplier() > 0 && c.isNeeded()) {
if (c.isNeeded()) {
c.postAction(action);
}
}
Expand Down Expand Up @@ -728,7 +738,7 @@ String[] getCostFunctionNames() {
CostFunction c = costFunctions.get(i);
this.tempFunctionCosts[i] = 0.0;

if (c.getMultiplier() <= 0 || !c.isNeeded()) {
if (!c.isNeeded()) {
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,6 @@ public static void beforeAllTests() throws Exception {
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 0.75f);
conf.setFloat("hbase.regions.slop", 0.0f);
conf.setFloat("hbase.master.balancer.stochastic.localityCost", 0);
conf.setBoolean("hbase.master.balancer.stochastic.runMaxSteps", true);
loadBalancer = new StochasticLoadBalancer();
MasterServices services = mock(MasterServices.class);
when(services.getConfiguration()).thenReturn(conf);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ public void testNeedBalance() {
for (boolean isByTable : perTableBalancerConfigs) {
conf.setBoolean(HConstants.HBASE_MASTER_LOADBALANCE_BYTABLE, isByTable);
loadBalancer.onConfigurationChange(conf);

for (int[] mockCluster : clusterStateMocks) {
Map<ServerName, List<RegionInfo>> servers = mockClusterServers(mockCluster);
Map<TableName, Map<ServerName, List<RegionInfo>>> LoadOfAllTable =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ public class TestStochasticLoadBalancerBalanceCluster extends BalancerTestBase {
public void testBalanceCluster() throws Exception {
conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 3 * 60 * 1000); // 3 min
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f);
conf.setLong(StochasticLoadBalancer.MAX_STEPS_KEY, 20000000L);
loadBalancer.onConfigurationChange(conf);

for (int[] mockCluster : clusterStateMocks) {
Map<ServerName, List<RegionInfo>> servers = mockClusterServers(mockCluster);
List<ServerAndLoad> list = convertToList(servers);
LOG.info("Mock Cluster : " + printMock(list) + " " + printStats(list));

Map<TableName, Map<ServerName, List<RegionInfo>>> LoadOfAllTable =
(Map) mockClusterServersWithTables(servers);
List<RegionPlan> plans = loadBalancer.balanceCluster(LoadOfAllTable);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public void testLargeCluster() {
int numTables = 100;
int replication = 1;
conf.setLong("hbase.master.balancer.stochastic.maxRunningTime", 6 * 60 * 1000);
conf.setFloat("hbase.master.balancer.stochastic.maxMovePercent", 1.0f);
conf.setLong(StochasticLoadBalancer.MAX_STEPS_KEY, 20000000L);
loadBalancer.onConfigurationChange(conf);
testWithCluster(numNodes, numRegions, numRegionsPerServer, replication, numTables, true, true);
}
Expand Down
Loading