Skip to content

Commit f0dddd1

Browse files
BukrosSzabolcs authored and meszibalu committed
HBASE-22982: region server suspend/resume and graceful rolling restart actions (#592)
* Add chaos monkey action for suspend/resume region servers
* Add chaos monkey action for graceful rolling restart
* Add these to relevant chaos monkeys

Signed-off-by: Balazs Meszaros <meszibalu@apache.org>
Signed-off-by: Peter Somogyi <psomogyi@apache.org>
1 parent ea24ea7 commit f0dddd1

13 files changed

+481
-73
lines changed

hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java

Lines changed: 48 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -97,13 +97,13 @@ public void close() throws IOException {
9797

9898
@Override
9999
public void startRegionServer(String hostname, int port) throws IOException {
100-
LOG.info("Starting RS on: " + hostname);
100+
LOG.info("Starting RS on: {}", hostname);
101101
clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
102102
}
103103

104104
@Override
105105
public void killRegionServer(ServerName serverName) throws IOException {
106-
LOG.info("Aborting RS: " + serverName.getServerName());
106+
LOG.info("Aborting RS: {}", serverName.getServerName());
107107
killedRegionServers.add(serverName);
108108
clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
109109
serverName.getHostname(), serverName.getPort());
@@ -116,7 +116,7 @@ public boolean isKilledRS(ServerName serverName) {
116116

117117
@Override
118118
public void stopRegionServer(ServerName serverName) throws IOException {
119-
LOG.info("Stopping RS: " + serverName.getServerName());
119+
LOG.info("Stopping RS: {}", serverName.getServerName());
120120
clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
121121
serverName.getHostname(), serverName.getPort());
122122
}
@@ -126,22 +126,36 @@ public void waitForRegionServerToStop(ServerName serverName, long timeout) throw
126126
waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
127127
}
128128

129+
@Override
130+
public void suspendRegionServer(ServerName serverName) throws IOException {
131+
LOG.info("Suspend RS: {}", serverName.getServerName());
132+
clusterManager.suspend(ServiceType.HBASE_REGIONSERVER,
133+
serverName.getHostname(), serverName.getPort());
134+
}
135+
136+
@Override
137+
public void resumeRegionServer(ServerName serverName) throws IOException {
138+
LOG.info("Resume RS: {}", serverName.getServerName());
139+
clusterManager.resume(ServiceType.HBASE_REGIONSERVER,
140+
serverName.getHostname(), serverName.getPort());
141+
}
142+
129143
@Override
130144
public void startZkNode(String hostname, int port) throws IOException {
131-
LOG.info("Starting ZooKeeper node on: " + hostname);
145+
LOG.info("Starting ZooKeeper node on: {}", hostname);
132146
clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
133147
}
134148

135149
@Override
136150
public void killZkNode(ServerName serverName) throws IOException {
137-
LOG.info("Aborting ZooKeeper node on: " + serverName.getServerName());
151+
LOG.info("Aborting ZooKeeper node on: {}", serverName.getServerName());
138152
clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
139153
serverName.getHostname(), serverName.getPort());
140154
}
141155

142156
@Override
143157
public void stopZkNode(ServerName serverName) throws IOException {
144-
LOG.info("Stopping ZooKeeper node: " + serverName.getServerName());
158+
LOG.info("Stopping ZooKeeper node: {}", serverName.getServerName());
145159
clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
146160
serverName.getHostname(), serverName.getPort());
147161
}
@@ -158,21 +172,21 @@ public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOEx
158172

159173
@Override
160174
public void startDataNode(ServerName serverName) throws IOException {
161-
LOG.info("Starting data node on: " + serverName.getServerName());
175+
LOG.info("Starting data node on: {}", serverName.getServerName());
162176
clusterManager.start(ServiceType.HADOOP_DATANODE,
163177
serverName.getHostname(), serverName.getPort());
164178
}
165179

166180
@Override
167181
public void killDataNode(ServerName serverName) throws IOException {
168-
LOG.info("Aborting data node on: " + serverName.getServerName());
182+
LOG.info("Aborting data node on: {}", serverName.getServerName());
169183
clusterManager.kill(ServiceType.HADOOP_DATANODE,
170184
serverName.getHostname(), serverName.getPort());
171185
}
172186

173187
@Override
174188
public void stopDataNode(ServerName serverName) throws IOException {
175-
LOG.info("Stopping data node on: " + serverName.getServerName());
189+
LOG.info("Stopping data node on: {}", serverName.getServerName());
176190
clusterManager.stop(ServiceType.HADOOP_DATANODE,
177191
serverName.getHostname(), serverName.getPort());
178192
}
@@ -189,21 +203,21 @@ public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IO
189203

190204
@Override
191205
public void startNameNode(ServerName serverName) throws IOException {
192-
LOG.info("Starting name node on: " + serverName.getServerName());
206+
LOG.info("Starting name node on: {}", serverName.getServerName());
193207
clusterManager.start(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
194208
serverName.getPort());
195209
}
196210

197211
@Override
198212
public void killNameNode(ServerName serverName) throws IOException {
199-
LOG.info("Aborting name node on: " + serverName.getServerName());
213+
LOG.info("Aborting name node on: {}", serverName.getServerName());
200214
clusterManager.kill(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
201215
serverName.getPort());
202216
}
203217

204218
@Override
205219
public void stopNameNode(ServerName serverName) throws IOException {
206-
LOG.info("Stopping name node on: " + serverName.getServerName());
220+
LOG.info("Stopping name node on: {}", serverName.getServerName());
207221
clusterManager.stop(ServiceType.HADOOP_NAMENODE, serverName.getHostname(),
208222
serverName.getPort());
209223
}
@@ -220,7 +234,7 @@ public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IO
220234

221235
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
222236
throws IOException {
223-
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
237+
LOG.info("Waiting for service: {} to stop: {}", service, serverName.getServerName());
224238
long start = System.currentTimeMillis();
225239

226240
while ((System.currentTimeMillis() - start) < timeout) {
@@ -234,7 +248,7 @@ private void waitForServiceToStop(ServiceType service, ServerName serverName, lo
234248

235249
private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
236250
throws IOException {
237-
LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
251+
LOG.info("Waiting for service: {} to start: ", service, serverName.getServerName());
238252
long start = System.currentTimeMillis();
239253

240254
while ((System.currentTimeMillis() - start) < timeout) {
@@ -248,19 +262,19 @@ private void waitForServiceToStart(ServiceType service, ServerName serverName, l
248262

249263
@Override
250264
public void startMaster(String hostname, int port) throws IOException {
251-
LOG.info("Starting Master on: " + hostname + ":" + port);
265+
LOG.info("Starting Master on: {}:{}", hostname, port);
252266
clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
253267
}
254268

255269
@Override
256270
public void killMaster(ServerName serverName) throws IOException {
257-
LOG.info("Aborting Master: " + serverName.getServerName());
271+
LOG.info("Aborting Master: {}", serverName.getServerName());
258272
clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
259273
}
260274

261275
@Override
262276
public void stopMaster(ServerName serverName) throws IOException {
263-
LOG.info("Stopping Master: " + serverName.getServerName());
277+
LOG.info("Stopping Master: {}", serverName.getServerName());
264278
clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
265279
}
266280

@@ -294,7 +308,7 @@ public ServerName getServerHoldingRegion(TableName tn, byte[] regionName) throws
294308
regionLoc = locator.getRegionLocation(startKey, true);
295309
}
296310
if (regionLoc == null) {
297-
LOG.warn("Cannot find region server holding region " + Bytes.toStringBinary(regionName));
311+
LOG.warn("Cannot find region server holding region {}", Bytes.toStringBinary(regionName));
298312
return null;
299313
}
300314
return regionLoc.getServerName();
@@ -338,15 +352,15 @@ protected boolean restoreMasters(ClusterMetrics initial, ClusterMetrics current)
338352
//check whether current master has changed
339353
final ServerName initMaster = initial.getMasterName();
340354
if (!ServerName.isSameAddress(initMaster, current.getMasterName())) {
341-
LOG.info("Restoring cluster - Initial active master : " + initMaster.getAddress() +
342-
" has changed to : " + current.getMasterName().getAddress());
355+
LOG.info("Restoring cluster - Initial active master : {} has changed to : {}",
356+
initMaster.getAddress(), current.getMasterName().getAddress());
343357
// If initial master is stopped, start it, before restoring the state.
344358
// It will come up as a backup master, if there is already an active master.
345359
try {
346360
if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
347361
initMaster.getHostname(), initMaster.getPort())) {
348-
LOG.info("Restoring cluster - starting initial active master at:"
349-
+ initMaster.getAddress());
362+
LOG.info("Restoring cluster - starting initial active master at:{}",
363+
initMaster.getAddress());
350364
startMaster(initMaster.getHostname(), initMaster.getPort());
351365
}
352366

@@ -356,11 +370,11 @@ protected boolean restoreMasters(ClusterMetrics initial, ClusterMetrics current)
356370
// 3. Start backup masters
357371
for (ServerName currentBackup : current.getBackupMasterNames()) {
358372
if (!ServerName.isSameAddress(currentBackup, initMaster)) {
359-
LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
373+
LOG.info("Restoring cluster - stopping backup master: {}", currentBackup);
360374
stopMaster(currentBackup);
361375
}
362376
}
363-
LOG.info("Restoring cluster - stopping active master: " + current.getMasterName());
377+
LOG.info("Restoring cluster - stopping active master: {}", current.getMasterName());
364378
stopMaster(current.getMasterName());
365379
waitForActiveAndReadyMaster(); // wait so that active master takes over
366380
} catch (IOException ex) {
@@ -376,8 +390,8 @@ protected boolean restoreMasters(ClusterMetrics initial, ClusterMetrics current)
376390
if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
377391
backup.getHostname(),
378392
backup.getPort())) {
379-
LOG.info("Restoring cluster - starting initial backup master: "
380-
+ backup.getAddress());
393+
LOG.info("Restoring cluster - starting initial backup master: {}",
394+
backup.getAddress());
381395
startMaster(backup.getHostname(), backup.getPort());
382396
}
383397
} catch (IOException ex) {
@@ -401,7 +415,7 @@ protected boolean restoreMasters(ClusterMetrics initial, ClusterMetrics current)
401415
for (ServerName sn:toStart) {
402416
try {
403417
if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
404-
LOG.info("Restoring cluster - starting initial backup master: " + sn.getAddress());
418+
LOG.info("Restoring cluster - starting initial backup master: {}", sn.getAddress());
405419
startMaster(sn.getHostname(), sn.getPort());
406420
}
407421
} catch (IOException ex) {
@@ -412,7 +426,7 @@ protected boolean restoreMasters(ClusterMetrics initial, ClusterMetrics current)
412426
for (ServerName sn:toKill) {
413427
try {
414428
if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
415-
LOG.info("Restoring cluster - stopping backup master: " + sn.getAddress());
429+
LOG.info("Restoring cluster - stopping backup master: {}", sn.getAddress());
416430
stopMaster(sn);
417431
}
418432
} catch (IOException ex) {
@@ -421,8 +435,8 @@ protected boolean restoreMasters(ClusterMetrics initial, ClusterMetrics current)
421435
}
422436
}
423437
if (!deferred.isEmpty()) {
424-
LOG.warn("Restoring cluster - restoring region servers reported "
425-
+ deferred.size() + " errors:");
438+
LOG.warn("Restoring cluster - restoring region servers reported {} errors:",
439+
deferred.size());
426440
for (int i=0; i<deferred.size() && i < 3; i++) {
427441
LOG.warn(Objects.toString(deferred.get(i)));
428442
}
@@ -464,7 +478,7 @@ protected boolean restoreRegionServers(ClusterMetrics initial, ClusterMetrics cu
464478
try {
465479
if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, sn.getHostname(),
466480
sn.getPort()) && master.getPort() != sn.getPort()) {
467-
LOG.info("Restoring cluster - starting initial region server: " + sn.getAddress());
481+
LOG.info("Restoring cluster - starting initial region server: {}", sn.getAddress());
468482
startRegionServer(sn.getHostname(), sn.getPort());
469483
}
470484
} catch (IOException ex) {
@@ -476,16 +490,16 @@ protected boolean restoreRegionServers(ClusterMetrics initial, ClusterMetrics cu
476490
try {
477491
if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, sn.getHostname(),
478492
sn.getPort()) && master.getPort() != sn.getPort()) {
479-
LOG.info("Restoring cluster - stopping initial region server: " + sn.getAddress());
493+
LOG.info("Restoring cluster - stopping initial region server: {}", sn.getAddress());
480494
stopRegionServer(sn);
481495
}
482496
} catch (IOException ex) {
483497
deferred.add(ex);
484498
}
485499
}
486500
if (!deferred.isEmpty()) {
487-
LOG.warn("Restoring cluster - restoring region servers reported "
488-
+ deferred.size() + " errors:");
501+
LOG.warn("Restoring cluster - restoring region servers reported {} errors:",
502+
deferred.size());
489503
for (int i=0; i<deferred.size() && i < 3; i++) {
490504
LOG.warn(Objects.toString(deferred.get(i)));
491505
}

0 commit comments

Comments
 (0)