Skip to content

Commit

Permalink
CDPD-68639: HBASE-28419 Allow Action and Policies of ServerKillingMonkey to be configurable. (apache#5743)
Browse files Browse the repository at this point in the history

Signed-off-by: Nick Dimiduk <ndimiduk@apache.org>
Change-Id: I3023800b61e00820377578720af712bc0dfba760
  • Loading branch information
jojochuang authored and sdevineni committed Aug 29, 2024
1 parent 627af8f commit 7b424b2
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.TimeUnit;

public interface MonkeyConstants {

Expand All @@ -45,6 +46,11 @@ public interface MonkeyConstants {
String UNBALANCE_WAIT_AFTER_BALANCE_MS = "unbalance.action.wait.after.period";
String UNBALANCE_KILL_META_RS = "unbalance.action.kill.meta.rs";
String DECREASE_HFILE_SIZE_SLEEP_TIME = "decrease.hfile.size.sleep.time";
String RESTART_RANDOM_RS_EXCEPTION_SLEEP_TIME = "restart.random.rs.exception.sleep.time";
String RESTART_ACTIVE_NAMENODE_SLEEP_TIME = "restart.active.namenode.sleep.time";
String RESTART_RANDOM_DATANODE_SLEEP_TIME = "restart.random.datanode.sleep.time";
String RESTART_RANDOM_JOURNALNODE_SLEEP_TIME = "restart.random.journalnode.sleep.time";
String RESTART_RANDOM_ZKNODE_SLEEP_TIME = "restart.random.zknode.sleep.time";
String GRACEFUL_RESTART_RS_SLEEP_TIME = "graceful.restart.rs.sleep.time";
String ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = "rolling.batch.suspend.rs.sleep.time";
String ROLLING_BATCH_SUSPEND_RS_RATIO = "rolling.batch.suspend.rs.ratio";
Expand Down Expand Up @@ -92,6 +98,13 @@ public interface MonkeyConstants {
long DEFAULT_UNBALANCE_WAIT_AFTER_BALANCE_MS = 5 * 1000;
boolean DEFAULT_UNBALANCE_KILL_META_RS = true;
long DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME = 30 * 1000;

long DEFAULT_RESTART_RANDOM_RS_EXCEPTION_SLEEP_TIME = TimeUnit.MILLISECONDS.toMillis(60000);
long DEFAULT_RESTART_ACTIVE_NAMENODE_SLEEP_TIME = TimeUnit.MILLISECONDS.toMillis(60000);
long DEFAULT_RESTART_RANDOM_DATANODE_SLEEP_TIME = TimeUnit.MILLISECONDS.toMillis(60000);
long DEFAULT_RESTART_RANDOM_JOURNALNODE_SLEEP_TIME = TimeUnit.MILLISECONDS.toMillis(60000);
long DEFAULT_RESTART_RANDOM_ZKNODE_SLEEP_TIME = TimeUnit.MILLISECONDS.toMillis(60000);

long DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME = 5000;
long DEFAULT_ROLLING_BATCH_SUSPEND_RS_SLEEP_TIME = 30 * 1000;
float DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO = 1.0f;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,17 @@
*/
public class ServerAndDependenciesKillingMonkeyFactory extends MonkeyFactory {

private long restartRandomRsExceptMetaSleepTime;
private long restartActiveMasterSleepTime;
private long rollingBatchRestartRSSleepTime;
private long restartActiveNameNodeSleepTime;
private long restartRandomDataNodeSleepTime;
private long restartRandomJournalNodeSleepTime;
private long restartRandomZKNodeSleepTime;
private long gracefulRollingRestartTSSLeepTime;
private long rollingBatchSuspendRSSleepTime;
private float rollingBatchSuspendtRSRatio;
private long action1Period;

@Override
public ChaosMonkey build() {
Expand All @@ -53,15 +61,15 @@ public ChaosMonkey build() {
// Destructive actions to mess things around. Cannot run batch restart.
// @formatter:off
Action[] actions1 = new Action[] {
new RestartRandomRsExceptMetaAction(60000),
new RestartActiveMasterAction(5000),
new RestartRandomRsExceptMetaAction(restartRandomRsExceptMetaSleepTime),
new RestartActiveMasterAction(restartActiveMasterSleepTime),
// only allow 2 servers to be dead.
new RollingBatchRestartRsAction(5000, 1.0f, 2, true),
new RollingBatchRestartRsAction(rollingBatchRestartRSSleepTime, 1.0f, 2, true),
new ForceBalancerAction(),
new RestartActiveNameNodeAction(60000),
new RestartRandomDataNodeAction(60000),
new RestartRandomJournalNodeAction(60000),
new RestartRandomZKNodeAction(60000),
new RestartActiveNameNodeAction(restartActiveNameNodeSleepTime),
new RestartRandomDataNodeAction(restartRandomDataNodeSleepTime),
new RestartRandomJournalNodeAction(restartRandomJournalNodeSleepTime),
new RestartRandomZKNodeAction(restartRandomZKNodeSleepTime),
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
rollingBatchSuspendtRSRatio)
Expand All @@ -73,12 +81,33 @@ public ChaosMonkey build() {
new Action[] { new DumpClusterStatusAction(), new DumpHdfsClusterStatusAction() };

return new PolicyBasedChaosMonkey(properties, util,
new CompositeSequentialPolicy(new DoActionsOncePolicy(60 * 1000, actions1),
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
new PeriodicRandomActionPolicy(60 * 1000, actions2));
new CompositeSequentialPolicy(new DoActionsOncePolicy(action1Period, actions1),
new PeriodicRandomActionPolicy(action1Period, actions1)),
new PeriodicRandomActionPolicy(action1Period, actions2));
}

private void loadProperties() {
restartRandomRsExceptMetaSleepTime = Long
.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_RANDOM_RS_EXCEPTION_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_RANDOM_RS_EXCEPTION_SLEEP_TIME + ""));
restartActiveMasterSleepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_ACTIVE_MASTER_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_ACTIVE_MASTER_SLEEP_TIME + ""));
rollingBatchRestartRSSleepTime = Long
.parseLong(this.properties.getProperty(MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
restartActiveNameNodeSleepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_ACTIVE_NAMENODE_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_ACTIVE_NAMENODE_SLEEP_TIME + ""));
restartRandomDataNodeSleepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_RANDOM_DATANODE_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_RANDOM_DATANODE_SLEEP_TIME + ""));
restartRandomJournalNodeSleepTime = Long
.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_RANDOM_JOURNALNODE_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_RANDOM_JOURNALNODE_SLEEP_TIME + ""));
restartRandomZKNodeSleepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_RANDOM_ZKNODE_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_RANDOM_ZKNODE_SLEEP_TIME + ""));
gracefulRollingRestartTSSLeepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
Expand All @@ -88,5 +117,8 @@ private void loadProperties() {
rollingBatchSuspendtRSRatio =
Float.parseFloat(this.properties.getProperty(MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
action1Period =
Long.parseLong(this.properties.getProperty(MonkeyConstants.PERIODIC_ACTION1_PERIOD,
MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD + ""));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@
*/
public class ServerKillingMonkeyFactory extends MonkeyFactory {

private long restartRandomRsExceptMetaSleepTime;
private long restartActiveMasterSleepTime;
private long rollingBatchRestartRSSleepTime;
private long gracefulRollingRestartTSSLeepTime;
private long rollingBatchSuspendRSSleepTime;
private float rollingBatchSuspendtRSRatio;
private long action1Period;

@Override
public ChaosMonkey build() {
Expand All @@ -48,10 +52,10 @@ public ChaosMonkey build() {
// Destructive actions to mess things around. Cannot run batch restart
// @formatter:off
Action[] actions1 = new Action[] {
new RestartRandomRsExceptMetaAction(60000),
new RestartActiveMasterAction(5000),
new RestartRandomRsExceptMetaAction(restartRandomRsExceptMetaSleepTime),
new RestartActiveMasterAction(restartActiveMasterSleepTime),
// only allow 2 servers to be dead
new RollingBatchRestartRsAction(5000, 1.0f, 2, true),
new RollingBatchRestartRsAction(rollingBatchRestartRSSleepTime, 1.0f, 2, true),
new ForceBalancerAction(),
new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
new RollingBatchSuspendResumeRsAction(rollingBatchSuspendRSSleepTime,
Expand All @@ -63,12 +67,21 @@ public ChaosMonkey build() {
Action[] actions2 = new Action[] { new DumpClusterStatusAction() };

return new PolicyBasedChaosMonkey(properties, util,
new CompositeSequentialPolicy(new DoActionsOncePolicy(60 * 1000, actions1),
new PeriodicRandomActionPolicy(60 * 1000, actions1)),
new PeriodicRandomActionPolicy(60 * 1000, actions2));
new CompositeSequentialPolicy(new DoActionsOncePolicy(action1Period, actions1),
new PeriodicRandomActionPolicy(action1Period, actions1)),
new PeriodicRandomActionPolicy(action1Period, actions2));
}

private void loadProperties() {
restartRandomRsExceptMetaSleepTime = Long
.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_RANDOM_RS_EXCEPTION_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_RANDOM_RS_EXCEPTION_SLEEP_TIME + ""));
restartActiveMasterSleepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.RESTART_ACTIVE_MASTER_SLEEP_TIME,
MonkeyConstants.DEFAULT_RESTART_ACTIVE_MASTER_SLEEP_TIME + ""));
rollingBatchRestartRSSleepTime = Long
.parseLong(this.properties.getProperty(MonkeyConstants.ROLLING_BATCH_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_ROLLING_BATCH_RESTART_RS_SLEEP_TIME + ""));
gracefulRollingRestartTSSLeepTime =
Long.parseLong(this.properties.getProperty(MonkeyConstants.GRACEFUL_RESTART_RS_SLEEP_TIME,
MonkeyConstants.DEFAULT_GRACEFUL_RESTART_RS_SLEEP_TIME + ""));
Expand All @@ -78,5 +91,8 @@ private void loadProperties() {
rollingBatchSuspendtRSRatio =
Float.parseFloat(this.properties.getProperty(MonkeyConstants.ROLLING_BATCH_SUSPEND_RS_RATIO,
MonkeyConstants.DEFAULT_ROLLING_BATCH_SUSPEND_RS_RATIO + ""));
action1Period =
Long.parseLong(this.properties.getProperty(MonkeyConstants.PERIODIC_ACTION1_PERIOD,
MonkeyConstants.DEFAULT_PERIODIC_ACTION1_PERIOD + ""));
}
}

0 comments on commit 7b424b2

Please sign in to comment.