From b18c38899c408e291c8a0bf72af100c837c1c1d4 Mon Sep 17 00:00:00 2001
From: Nick Dimiduk
Date: Mon, 16 Jan 2023 14:40:50 +0100
Subject: [PATCH] HBASE-27567 Introduce ChaosMonkey Action to print HDFS
 Cluster status

Signed-off-by: Reid Chan
Signed-off-by: Duo Zhang
---
 .../actions/DumpHdfsClusterStatusAction.java  | 78 +++++++++++++++++++
 .../hbase/chaos/actions/HdfsActionUtils.java  | 73 +++++++++++++++++
 .../actions/RestartActiveNameNodeAction.java  | 64 +++++++++------
 .../actions/RestartRandomDataNodeAction.java  | 20 ++---
 ...erAndDependenciesKillingMonkeyFactory.java |  6 +-
 5 files changed, 202 insertions(+), 39 deletions(-)
 create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpHdfsClusterStatusAction.java
 create mode 100644 hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/HdfsActionUtils.java

diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpHdfsClusterStatusAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpHdfsClusterStatusAction.java
new file mode 100644
index 000000000000..932590f84b23
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/DumpHdfsClusterStatusAction.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.net.InetSocketAddress;
+import java.net.URI;
+import java.util.List;
+import org.apache.commons.io.FileUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.HAUtilClient;
+import org.apache.hadoop.hdfs.protocol.ClientProtocol;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class DumpHdfsClusterStatusAction extends Action {
+  private static final Logger LOG = LoggerFactory.getLogger(DumpHdfsClusterStatusAction.class);
+  private static final String PREFIX = "\n ";
+
+  @Override
+  protected Logger getLogger() {
+    return LOG;
+  }
+
+  @Override
+  public void perform() throws Exception {
+    StringBuilder sb = new StringBuilder();
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final Configuration dfsConf = dfs.getConf();
+      final URI dfsUri = dfs.getUri();
+      final boolean isHaAndLogicalUri = HAUtilClient.isLogicalUri(dfsConf, dfsUri);
+      sb.append("Cluster status").append('\n');
+      if (isHaAndLogicalUri) {
+        final String nsId = dfsUri.getHost();
+        final List<ClientProtocol> namenodes =
+          HAUtil.getProxiesForAllNameNodesInNameservice(dfsConf, nsId);
+        final boolean atLeastOneActive = HAUtil.isAtLeastOneActive(namenodes);
+        final InetSocketAddress activeAddress = HAUtil.getAddressOfActive(dfs);
+        sb.append("Active NameNode=").append(activeAddress).append(", isAtLeastOneActive=")
+          .append(atLeastOneActive).append('\n');
+      }
+      DatanodeInfo[] dns = dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.LIVE);
+      sb.append("Number of live DataNodes: ").append(dns.length);
+      for (DatanodeInfo dni : dns) {
+        sb.append(PREFIX).append("name=").append(dni.getName()).append(", used%=")
+          .append(dni.getDfsUsedPercent()).append(", capacity=")
+          .append(FileUtils.byteCountToDisplaySize(dni.getCapacity()));
+      }
+      sb.append('\n');
+      dns = dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.DEAD);
+      sb.append("Number of dead DataNodes: ").append(dns.length);
+      for (DatanodeInfo dni : dns) {
+        sb.append(PREFIX).append(dni.getName()).append("/").append(dni.getNetworkLocation());
+      }
+    }
+    // TODO: add more on NN, JNs, and ZK.
+    // TODO: Print how long process has been up.
+    getLogger().info(sb.toString());
+  }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/HdfsActionUtils.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/HdfsActionUtils.java
new file mode 100644
index 000000000000..b60f7bd54a50
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/HdfsActionUtils.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.io.IOException;
+import java.io.InterruptedIOException;
+import java.security.PrivilegedExceptionAction;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.util.CommonFSUtils;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
+import org.apache.hadoop.security.UserGroupInformation;
+
+/**
+ * Configuration common across the HDFS Actions.
+ */
+public final class HdfsActionUtils {
+
+  private HdfsActionUtils() {
+  }
+
+  /**
+   * Specify a user as whom HDFS actions should be run. The chaos process must have permissions
+   * sufficient to assume the role of the specified user.
+   * @see Proxy
+   *      user - Superusers Acting On Behalf Of Other Users
+   */
+  public static final String HDFS_USER_CONF_KEY =
+    "org.apache.hadoop.hbase.chaos.actions.hdfs_user";
+
+  private static DistributedFileSystem createUnproxiedDfs(final Configuration conf)
+    throws IOException {
+    final Path rootDir = CommonFSUtils.getRootDir(conf);
+    final FileSystem fs = rootDir.getFileSystem(conf);
+    return (DistributedFileSystem) fs;
+  }
+
+  /**
+   * Create an instance of {@link DistributedFileSystem} that honors {@value HDFS_USER_CONF_KEY}.
+   */
+  static DistributedFileSystem createDfs(final Configuration conf) throws IOException {
+    final String proxyUser = conf.get(HDFS_USER_CONF_KEY);
+    if (proxyUser == null) {
+      return createUnproxiedDfs(conf);
+    }
+    final UserGroupInformation proxyUgi =
+      UserGroupInformation.createProxyUser(proxyUser, UserGroupInformation.getLoginUser());
+    try {
+      return proxyUgi
+        .doAs((PrivilegedExceptionAction<DistributedFileSystem>) () -> createUnproxiedDfs(conf));
+    } catch (InterruptedException e) {
+      final InterruptedIOException iioe = new InterruptedIOException(e.getMessage());
+      iioe.setStackTrace(e.getStackTrace());
+      throw iioe;
+    }
+  }
+}
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
index 8fe3ec2cdffc..13e67b8e7eed 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
@@ -17,15 +17,17 @@
  */
 package org.apache.hadoop.hbase.chaos.actions;
 
+import java.util.Collections;
 import java.util.List;
+import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ServerName;
-import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
 import org.apache.hadoop.hbase.zookeeper.ZKUtil;
 import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
 import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.HAUtil;
 import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
 import org.slf4j.Logger;
@@ -57,39 +59,51 @@ protected Logger getLogger() {
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Restart active namenode");
-    Configuration conf = CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
-    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
-    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
-      throw new Exception("HA for namenode is not enabled");
-    }
-    ZKWatcher zkw = null;
-    RecoverableZooKeeper rzk = null;
+
+    final String hadoopHAZkNode;
     String activeNamenode = null;
-    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
-    try {
-      zkw = new ZKWatcher(conf, "get-active-namenode", null);
-      rzk = zkw.getRecoverableZooKeeper();
-      String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
-      List<String> subChildern = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
-      for (String eachEntry : subChildern) {
-        if (eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+    int activeNamenodePort = -1;
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final Configuration conf = dfs.getConf();
+      hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
+      final String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
+
+      if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
+        getLogger().info("HA for HDFS is not enabled; skipping");
+        return;
+      }
+      try (final ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
+        final RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
+        // If hadoopHAZkNode == '/', pass '' instead because then joinZNode will return '//' as a
+        // prefix
+        // which zk doesn't like as a prefix on the path.
+        final String hadoopHAZkNodePath = ZNodePaths.joinZNode(
+          (hadoopHAZkNode != null && hadoopHAZkNode.equals("/")) ? "" : hadoopHAZkNode,
+          nameServiceID);
+        final List<String> subChildren =
+          Optional.ofNullable(ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath))
+            .orElse(Collections.emptyList());
+        for (final String eachEntry : subChildren) {
+          if (!eachEntry.contains(ACTIVE_NN_LOCK_NAME)) {
+            continue;
+          }
           byte[] data =
             rzk.getData(ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME), false, null);
           ActiveNodeInfo proto = ActiveNodeInfo.parseFrom(data);
           activeNamenode = proto.getHostname();
+          activeNamenodePort = proto.getPort();
         }
       }
-    } finally {
-      if (zkw != null) {
-        zkw.close();
-      }
     }
+
     if (activeNamenode == null) {
-      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
+      getLogger().info("No active Name node found in zookeeper under '{}'", hadoopHAZkNode);
+      return;
     }
-    getLogger().info("Found active namenode host:" + activeNamenode);
-    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
-    getLogger().info("Restarting Active NameNode :" + activeNamenode);
-    restartNameNode(activeNNHost, sleepTime);
+
+    getLogger().info("Found Active NameNode host: {}", activeNamenode);
+    final ServerName activeNNHost = ServerName.valueOf(activeNamenode, activeNamenodePort, -1L);
+    getLogger().info("Restarting Active NameNode: {}", activeNamenode);
+    restartNameNode(activeNNHost, this.sleepTime);
   }
 }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
index b039738e3d3e..74ea5e50043f 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartRandomDataNodeAction.java
@@ -18,14 +18,11 @@
 package org.apache.hadoop.hbase.chaos.actions;
 
 import java.io.IOException;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.Arrays;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.chaos.monkies.PolicyBasedChaosMonkey;
-import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hdfs.DFSClient;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
-import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -48,18 +45,15 @@ protected Logger getLogger() {
   @Override
   public void perform() throws Exception {
     getLogger().info("Performing action: Restart random data node");
-    ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
+    final ServerName server = PolicyBasedChaosMonkey.selectRandomItem(getDataNodes());
     restartDataNode(server, sleepTime);
   }
 
-  public ServerName[] getDataNodes() throws IOException {
-    DistributedFileSystem fs =
-      (DistributedFileSystem) CommonFSUtils.getRootDir(getConf()).getFileSystem(getConf());
-    DFSClient dfsClient = fs.getClient();
-    List<ServerName> hosts = new LinkedList<>();
-    for (DatanodeInfo dataNode : dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE)) {
-      hosts.add(ServerName.valueOf(dataNode.getHostName(), -1, -1));
+  private ServerName[] getDataNodes() throws IOException {
+    try (final DistributedFileSystem dfs = HdfsActionUtils.createDfs(getConf())) {
+      final DFSClient dfsClient = dfs.getClient();
+      return Arrays.stream(dfsClient.datanodeReport(HdfsConstants.DatanodeReportType.LIVE))
+        .map(dn -> ServerName.valueOf(dn.getHostName(), -1, -1)).toArray(ServerName[]::new);
     }
-    return hosts.toArray(new ServerName[0]);
   }
 }
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
index 11115ee201d7..ab0e8cd20eb8 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/factories/ServerAndDependenciesKillingMonkeyFactory.java
@@ -19,9 +19,11 @@
 
 import org.apache.hadoop.hbase.chaos.actions.Action;
 import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
+import org.apache.hadoop.hbase.chaos.actions.DumpHdfsClusterStatusAction;
 import org.apache.hadoop.hbase.chaos.actions.ForceBalancerAction;
 import org.apache.hadoop.hbase.chaos.actions.GracefulRollingRestartRsAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartActiveMasterAction;
+import org.apache.hadoop.hbase.chaos.actions.RestartActiveNameNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomDataNodeAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomRsExceptMetaAction;
 import org.apache.hadoop.hbase.chaos.actions.RestartRandomZKNodeAction;
@@ -55,6 +57,7 @@ public ChaosMonkey build() {
       // only allow 2 servers to be dead.
       new RollingBatchRestartRsAction(5000, 1.0f, 2, true),
       new ForceBalancerAction(),
+      new RestartActiveNameNodeAction(60000),
       new RestartRandomDataNodeAction(60000),
       new RestartRandomZKNodeAction(60000),
       new GracefulRollingRestartRsAction(gracefulRollingRestartTSSLeepTime),
@@ -64,7 +67,8 @@ public ChaosMonkey build() {
     // @formatter:on
 
     // Action to log more info for debugging
-    Action[] actions2 = new Action[] { new DumpClusterStatusAction() };
+    Action[] actions2 =
+      new Action[] { new DumpClusterStatusAction(), new DumpHdfsClusterStatusAction() };
 
     return new PolicyBasedChaosMonkey(properties, util,
      new CompositeSequentialPolicy(new DoActionsOncePolicy(60 * 1000, actions1),
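
Reviewer note, not part of the patch: the proxy-user hook in HdfsActionUtils is the one piece of this change that needs cluster-side configuration, so a minimal sketch of how HDFS_USER_CONF_KEY is meant to be consumed may help when trying it out. Everything below is illustrative: the probe class, its main method, and the "hdfs" user are assumptions, not code from this change; only HDFS_USER_CONF_KEY and createDfs come from the patch, and since createDfs is package-private such a probe would have to live in org.apache.hadoop.hbase.chaos.actions. It also assumes hbase-site.xml and the HDFS client configuration are on the classpath, and that the login user is allowed to impersonate the target user via the hadoop.proxyuser settings in core-site.xml.

package org.apache.hadoop.hbase.chaos.actions;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;

/** Hypothetical stand-alone probe for the proxy-user hook; not part of the patch. */
public final class HdfsProxyUserProbe {
  public static void main(String[] args) throws Exception {
    final Configuration conf = HBaseConfiguration.create();
    // Ask the HDFS chaos actions to talk to HDFS as the "hdfs" superuser (illustrative value).
    conf.set(HdfsActionUtils.HDFS_USER_CONF_KEY, "hdfs");
    // createDfs honors the property above; without it, the login user is used directly.
    try (DistributedFileSystem dfs = HdfsActionUtils.createDfs(conf)) {
      System.out.println("Connected to " + dfs.getUri() + ", live DataNodes: "
        + dfs.getClient().datanodeReport(HdfsConstants.DatanodeReportType.LIVE).length);
    }
  }
}

Since createDfs reads the property straight from the action's Configuration, setting org.apache.hadoop.hbase.chaos.actions.hdfs_user in the hbase-site.xml used by the chaos process should have the same effect as the conf.set(...) call above.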