Skip to content
This repository was archived by the owner on Jun 23, 2022. It is now read-only.

Commit c4081c4

Browse files
committed
BUG-42254. HDFS-8880. NameNode metrics logging. (Arpit Agarwal)
1 parent ee47964 commit c4081c4

File tree

9 files changed

+448
-7
lines changed

9 files changed

+448
-7
lines changed

HDP-CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ Changes only in Hortonworks github and not in hadoop-2.7.0 and hence not covered
147147
YARN-3700. Made generic history service load a number of latest applications
148148
according to the parameter or the configuration. (Xuan Gong via zjshen)
149149

150+
BUG-42254. HDFS-8880. NameNode metrics logging. (Arpit Agarwal)
151+
150152
BUG FIXES
151153

152154
BUG-5700. NameNode should shut down if out of memory error occurs. (cnauroth)

hadoop-common-project/hadoop-common/src/main/conf/log4j.properties

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,19 @@ log4j.appender.RFAAUDIT.layout.ConversionPattern=%d{ISO8601} %p %c{2}: %m%n
148148
log4j.appender.RFAAUDIT.MaxFileSize=${hdfs.audit.log.maxfilesize}
149149
log4j.appender.RFAAUDIT.MaxBackupIndex=${hdfs.audit.log.maxbackupindex}
150150

151+
#
152+
# NameNode metrics logging.
153+
# The default is to retain two namenode-metrics.log files up to 64MB each.
154+
#
155+
log4j.logger.NameNodeMetricsLog=INFO,NNMETRICSRFA
156+
log4j.additivity.NameNodeMetricsLog=false
157+
log4j.appender.NNMETRICSRFA=org.apache.log4j.RollingFileAppender
158+
log4j.appender.NNMETRICSRFA.File=${hadoop.log.dir}/namenode-metrics.log
159+
log4j.appender.NNMETRICSRFA.layout=org.apache.log4j.PatternLayout
160+
log4j.appender.NNMETRICSRFA.layout.ConversionPattern=%d{ISO8601} %m%n
161+
log4j.appender.NNMETRICSRFA.MaxBackupIndex=1
162+
log4j.appender.NNMETRICSRFA.MaxFileSize=64MB
163+
151164
#
152165
# mapred audit logging
153166
#

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/util/MBeans.java

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
package org.apache.hadoop.metrics2.util;
1919

2020
import java.lang.management.ManagementFactory;
21+
import java.util.regex.Matcher;
22+
import java.util.regex.Pattern;
2123

2224
import javax.management.InstanceAlreadyExistsException;
2325
import javax.management.MBeanServer;
@@ -38,6 +40,13 @@
3840
@InterfaceStability.Stable
3941
public class MBeans {
4042
private static final Log LOG = LogFactory.getLog(MBeans.class);
43+
private static final String DOMAIN_PREFIX = "Hadoop:";
44+
private static final String SERVICE_PREFIX = "service=";
45+
private static final String NAME_PREFIX = "name=";
46+
47+
private static final Pattern MBEAN_NAME_PATTERN = Pattern.compile(
48+
"^" + DOMAIN_PREFIX + SERVICE_PREFIX + "([^,]+)," +
49+
NAME_PREFIX + "(.+)$");
4150

4251
/**
4352
* Register the MBean using our standard MBeanName format
@@ -70,6 +79,26 @@ static public ObjectName register(String serviceName, String nameName,
7079
return null;
7180
}
7281

82+
public static String getMbeanNameService(final ObjectName objectName) {
83+
Matcher matcher = MBEAN_NAME_PATTERN.matcher(objectName.toString());
84+
if (matcher.matches()) {
85+
return matcher.group(1);
86+
} else {
87+
throw new IllegalArgumentException(
88+
objectName + " is not a valid Hadoop mbean");
89+
}
90+
}
91+
92+
public static String getMbeanNameName(final ObjectName objectName) {
93+
Matcher matcher = MBEAN_NAME_PATTERN.matcher(objectName.toString());
94+
if (matcher.matches()) {
95+
return matcher.group(2);
96+
} else {
97+
throw new IllegalArgumentException(
98+
objectName + " is not a valid Hadoop mbean");
99+
}
100+
}
101+
73102
static public void unregister(ObjectName mbeanName) {
74103
LOG.debug("Unregistering "+ mbeanName);
75104
final MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
@@ -86,13 +115,13 @@ static public void unregister(ObjectName mbeanName) {
86115
}
87116

88117
static private ObjectName getMBeanName(String serviceName, String nameName) {
89-
ObjectName name = null;
90-
String nameStr = "Hadoop:service="+ serviceName +",name="+ nameName;
118+
String nameStr = DOMAIN_PREFIX + SERVICE_PREFIX + serviceName + "," +
119+
NAME_PREFIX + nameName;
91120
try {
92-
name = DefaultMetricsSystem.newMBeanName(nameStr);
121+
return DefaultMetricsSystem.newMBeanName(nameStr);
93122
} catch (Exception e) {
94123
LOG.warn("Error creating MBean object name: "+ nameStr, e);
124+
return null;
95125
}
96-
return name;
97126
}
98127
}

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ Release 2.8.0 - UNRELEASED
2020

2121
HDFS-8824. Do not use small blocks for balancing the cluster. (szetszwo)
2222

23-
2423
HDFS-8883. NameNode Metrics : Add FSNameSystem lock Queue Length.
2524
(Anu Engineer via xyao)
2625

26+
HDFS-8880. NameNode metrics logging. (Arpit Agarwal)
27+
2728
OPTIMIZATIONS
2829

2930
BUG FIXES

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,6 +404,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
404404
public static final boolean DFS_NAMENODE_AUDIT_LOG_TOKEN_TRACKING_ID_DEFAULT = false;
405405
public static final String DFS_NAMENODE_AUDIT_LOG_ASYNC_KEY = "dfs.namenode.audit.log.async";
406406
public static final boolean DFS_NAMENODE_AUDIT_LOG_ASYNC_DEFAULT = false;
407+
public static final String DFS_NAMENODE_METRICS_LOGGER_PERIOD_SECONDS_KEY =
408+
"dfs.namenode.metrics.logger.period.seconds";
409+
public static final int DFS_NAMENODE_METRICS_LOGGER_PERIOD_SECONDS_DEFAULT =
410+
600;
407411

408412
// Much code in hdfs is not yet updated to use these keys.
409413
public static final String DFS_CLIENT_BLOCK_WRITE_LOCATEFOLLOWINGBLOCK_RETRIES_KEY = "dfs.client.block.write.locateFollowingBlock.retries";

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

Lines changed: 175 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@
2121
import com.google.common.base.Joiner;
2222
import com.google.common.base.Preconditions;
2323
import com.google.common.collect.Lists;
24+
import org.apache.commons.logging.Log;
25+
import org.apache.commons.logging.LogFactory;
26+
import org.apache.commons.logging.impl.Log4JLogger;
2427
import org.apache.hadoop.HadoopIllegalArgumentException;
2528
import org.apache.hadoop.classification.InterfaceAudience;
2629
import org.apache.hadoop.conf.Configuration;
@@ -72,21 +75,34 @@
7275
import org.apache.hadoop.util.JvmPauseMonitor;
7376
import org.apache.hadoop.util.ServicePlugin;
7477
import org.apache.hadoop.util.StringUtils;
75-
import org.apache.log4j.LogManager;
78+
import org.apache.log4j.Appender;
79+
import org.apache.log4j.AsyncAppender;
7680
import org.slf4j.Logger;
7781
import org.slf4j.LoggerFactory;
7882

83+
import javax.management.Attribute;
84+
import javax.management.AttributeList;
85+
import javax.management.MBeanAttributeInfo;
86+
import javax.management.MBeanInfo;
87+
import javax.management.MBeanServer;
88+
import javax.management.MalformedObjectNameException;
7989
import javax.management.ObjectName;
8090

8191
import java.io.IOException;
8292
import java.io.PrintStream;
93+
import java.lang.management.ManagementFactory;
8394
import java.net.InetSocketAddress;
8495
import java.net.URI;
8596
import java.security.PrivilegedExceptionAction;
8697
import java.util.ArrayList;
8798
import java.util.Arrays;
8899
import java.util.Collection;
100+
import java.util.Collections;
101+
import java.util.HashSet;
89102
import java.util.List;
103+
import java.util.Set;
104+
import java.util.concurrent.ScheduledThreadPoolExecutor;
105+
import java.util.concurrent.TimeUnit;
90106
import java.util.concurrent.atomic.AtomicBoolean;
91107

92108
import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY;
@@ -112,6 +128,8 @@
112128
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KERBEROS_INTERNAL_SPNEGO_PRINCIPAL_KEY;
113129
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KERBEROS_PRINCIPAL_KEY;
114130
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_KEYTAB_FILE_KEY;
131+
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_METRICS_LOGGER_PERIOD_SECONDS_DEFAULT;
132+
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_METRICS_LOGGER_PERIOD_SECONDS_KEY;
115133
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY;
116134
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_PLUGINS_KEY;
117135
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
@@ -303,7 +321,10 @@ public long getProtocolVersion(String protocol,
303321
LoggerFactory.getLogger("BlockStateChange");
304322
public static final HAState ACTIVE_STATE = new ActiveState();
305323
public static final HAState STANDBY_STATE = new StandbyState();
306-
324+
325+
public static final Log MetricsLog =
326+
LogFactory.getLog("NameNodeMetricsLog");
327+
307328
protected FSNamesystem namesystem;
308329
protected final Configuration conf;
309330
protected final NamenodeRole role;
@@ -329,6 +350,8 @@ public long getProtocolVersion(String protocol,
329350
private JvmPauseMonitor pauseMonitor;
330351
private ObjectName nameNodeStatusBeanName;
331352
SpanReceiverHost spanReceiverHost;
353+
ScheduledThreadPoolExecutor metricsLoggerTimer;
354+
332355
/**
333356
* The namenode address that clients will use to access this namenode
334357
* or the name service. For HA configurations using logical URI, it
@@ -662,6 +685,68 @@ protected void initialize(Configuration conf) throws IOException {
662685
metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
663686

664687
startCommonServices(conf);
688+
startMetricsLogger(conf);
689+
}
690+
691+
/**
692+
* Start a timer to periodically write NameNode metrics to the log
693+
* file. This behavior can be disabled by configuration.
694+
* @param conf
695+
*/
696+
protected void startMetricsLogger(Configuration conf) {
697+
long metricsLoggerPeriodSec =
698+
conf.getInt(DFS_NAMENODE_METRICS_LOGGER_PERIOD_SECONDS_KEY,
699+
DFS_NAMENODE_METRICS_LOGGER_PERIOD_SECONDS_DEFAULT);
700+
701+
if (metricsLoggerPeriodSec <= 0) {
702+
return;
703+
}
704+
705+
makeMetricsLoggerAsync();
706+
707+
// Schedule the periodic logging.
708+
metricsLoggerTimer = new ScheduledThreadPoolExecutor(1);
709+
metricsLoggerTimer.setExecuteExistingDelayedTasksAfterShutdownPolicy(
710+
false);
711+
metricsLoggerTimer.scheduleWithFixedDelay(new MetricsLoggerTask(),
712+
metricsLoggerPeriodSec,
713+
metricsLoggerPeriodSec,
714+
TimeUnit.SECONDS);
715+
}
716+
717+
/**
718+
* Make the metrics logger async and add all pre-existing appenders
719+
* to the async appender.
720+
*/
721+
private static void makeMetricsLoggerAsync() {
722+
if (!(MetricsLog instanceof Log4JLogger)) {
723+
LOG.warn(
724+
"Metrics logging will not be async since the logger is not log4j");
725+
return;
726+
}
727+
org.apache.log4j.Logger logger = ((Log4JLogger) MetricsLog).getLogger();
728+
logger.setAdditivity(false); // Don't pollute NN logs with metrics dump
729+
730+
@SuppressWarnings("unchecked")
731+
List<Appender> appenders = Collections.list(logger.getAllAppenders());
732+
// failsafe against trying to async it more than once
733+
if (!appenders.isEmpty() && !(appenders.get(0) instanceof AsyncAppender)) {
734+
AsyncAppender asyncAppender = new AsyncAppender();
735+
// change logger to have an async appender containing all the
736+
// previously configured appenders
737+
for (Appender appender : appenders) {
738+
logger.removeAppender(appender);
739+
asyncAppender.addAppender(appender);
740+
}
741+
logger.addAppender(asyncAppender);
742+
}
743+
}
744+
745+
protected void stopMetricsLogger() {
746+
if (metricsLoggerTimer != null) {
747+
metricsLoggerTimer.shutdown();
748+
metricsLoggerTimer = null;
749+
}
665750
}
666751

667752
/**
@@ -867,6 +952,7 @@ public void stop() {
867952
} catch (ServiceFailedException e) {
868953
LOG.warn("Encountered exception while exiting state ", e);
869954
} finally {
955+
stopMetricsLogger();
870956
stopCommonServices();
871957
if (metrics != null) {
872958
metrics.shutdown();
@@ -1846,4 +1932,91 @@ void checkHaStateChange(StateChangeRequestInfo req)
18461932
break;
18471933
}
18481934
}
1935+
1936+
private static class MetricsLoggerTask implements Runnable {
1937+
private static final int MAX_LOGGED_VALUE_LEN = 128;
1938+
private static ObjectName OBJECT_NAME = null;
1939+
1940+
static {
1941+
try {
1942+
OBJECT_NAME = new ObjectName("Hadoop:*");
1943+
} catch (MalformedObjectNameException m) {
1944+
// This should not occur in practice since we pass
1945+
// a valid pattern to the constructor above.
1946+
}
1947+
}
1948+
1949+
/**
1950+
* Write NameNode metrics to the metrics appender when invoked.
1951+
*/
1952+
@Override
1953+
public void run() {
1954+
// Skip querying metrics if there are no known appenders.
1955+
if (!MetricsLog.isInfoEnabled() ||
1956+
!hasAppenders(MetricsLog) ||
1957+
OBJECT_NAME == null) {
1958+
return;
1959+
}
1960+
1961+
MetricsLog.info(" >> Begin NameNode metrics dump");
1962+
final MBeanServer server = ManagementFactory.getPlatformMBeanServer();
1963+
1964+
// Iterate over each MBean.
1965+
for (final ObjectName mbeanName : server.queryNames(OBJECT_NAME, null)) {
1966+
try {
1967+
MBeanInfo mBeanInfo = server.getMBeanInfo(mbeanName);
1968+
final String mBeanNameName = MBeans.getMbeanNameName(mbeanName);
1969+
final Set<String> attributeNames = getFilteredAttributes(mBeanInfo);
1970+
1971+
final AttributeList attributes =
1972+
server.getAttributes(mbeanName,
1973+
attributeNames.toArray(new String[attributeNames.size()]));
1974+
1975+
for (Object o : attributes) {
1976+
final Attribute attribute = (Attribute) o;
1977+
final Object value = attribute.getValue();
1978+
final String valueStr =
1979+
(value != null) ? value.toString() : "null";
1980+
// Truncate the value if it is too long
1981+
MetricsLog.info(mBeanNameName + ":" + attribute.getName() + "=" +
1982+
(valueStr.length() < MAX_LOGGED_VALUE_LEN ? valueStr :
1983+
valueStr.substring(0, MAX_LOGGED_VALUE_LEN) + "..."));
1984+
}
1985+
} catch (Exception e) {
1986+
MetricsLog.error("Failed to get NameNode metrics for mbean " +
1987+
mbeanName.toString(), e);
1988+
}
1989+
}
1990+
MetricsLog.info(" << End NameNode metrics dump");
1991+
}
1992+
1993+
private static boolean hasAppenders(Log logger) {
1994+
if (!(logger instanceof Log4JLogger)) {
1995+
// Don't bother trying to determine the presence of appenders.
1996+
return true;
1997+
}
1998+
Log4JLogger log4JLogger = ((Log4JLogger) MetricsLog);
1999+
return log4JLogger.getLogger().getAllAppenders().hasMoreElements();
2000+
}
2001+
2002+
/**
2003+
* Get the list of attributes for the MBean, filtering out a few
2004+
* attribute types.
2005+
*/
2006+
private static Set<String> getFilteredAttributes(
2007+
MBeanInfo mBeanInfo) {
2008+
Set<String> attributeNames = new HashSet<>();
2009+
for (MBeanAttributeInfo attributeInfo : mBeanInfo.getAttributes()) {
2010+
if (!attributeInfo.getType().equals(
2011+
"javax.management.openmbean.TabularData") &&
2012+
!attributeInfo.getType().equals(
2013+
"javax.management.openmbean.CompositeData") &&
2014+
!attributeInfo.getType().equals(
2015+
"[Ljavax.management.openmbean.CompositeData;")) {
2016+
attributeNames.add(attributeInfo.getName());
2017+
}
2018+
}
2019+
return attributeNames;
2020+
}
2021+
}
18492022
}

hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1528,6 +1528,18 @@
15281528
</description>
15291529
</property>
15301530

1531+
<property>
1532+
<name>dfs.namenode.metrics.logger.period.seconds</name>
1533+
<value>600</value>
1534+
<description>
1535+
This setting controls how frequently the NameNode logs its metrics. The
1536+
logging configuration must also define one or more appenders for
1537+
NameNodeMetricsLog for the metrics to be logged.
1538+
NameNode metrics logging is disabled if this value is set to zero or
1539+
less than zero.
1540+
</description>
1541+
</property>
1542+
15311543
<property>
15321544
<name>dfs.metrics.percentiles.intervals</name>
15331545
<value></value>

0 commit comments

Comments (0)