Skip to content

Commit 863689f

Browse files
author
Eric E Payne
committed
YARN-1529: Add Localization overhead metrics to NM. Contributed by Jim_Brennan.
(cherry picked from commit e0c9653)
1 parent 3a5f48a commit 863689f

File tree

11 files changed

+191
-5
lines changed

11 files changed

+191
-5
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/ApplicationConstants.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,15 @@ enum Environment {
202202
@Private
203203
CLASSPATH_PREPEND_DISTCACHE("CLASSPATH_PREPEND_DISTCACHE"),
204204

205+
/**
206+
* $LOCALIZATION_COUNTERS
207+
*
208+
* Since NM does not RPC Container JVM's we pass Localization counter
209+
* vector as an environment variable
210+
*
211+
*/
212+
LOCALIZATION_COUNTERS("LOCALIZATION_COUNTERS"),
213+
205214
/**
206215
* $CONTAINER_ID
207216
* Final, exported by NodeManager and non-modifiable by users.

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ private enum ReInitOp {
209209
private final ResourceLocalizationService rsrcLocalizationSrvc;
210210
private final AbstractContainersLauncher containersLauncher;
211211
private final AuxServices auxiliaryServices;
212-
private final NodeManagerMetrics metrics;
212+
@VisibleForTesting final NodeManagerMetrics metrics;
213213

214214
protected final NodeStatusUpdater nodeStatusUpdater;
215215

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/Container.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,14 @@ public interface Container extends EventHandler<ContainerEvent> {
119119
* @return true/false based on container's state
120120
*/
121121
boolean isContainerInFinalStates();
122+
123+
/**
124+
* Vector of localization counters to be passed from NM to application
125+
* container via environment variable {@code $LOCALIZATION_COUNTERS}. See
126+
* {@link org.apache.hadoop.yarn.api.ApplicationConstants.Environment#LOCALIZATION_COUNTERS}
127+
*
128+
* @return coma-separated counter values
129+
*/
130+
String localizationCountersAsString();
131+
122132
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
import org.apache.hadoop.conf.Configuration;
4545
import org.apache.hadoop.fs.Path;
4646
import org.apache.hadoop.security.Credentials;
47+
import org.apache.hadoop.util.Time;
4748
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
4849
import org.apache.hadoop.yarn.api.records.ContainerId;
4950
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
@@ -101,6 +102,14 @@
101102
import org.apache.hadoop.yarn.util.resource.Resources;
102103

103104
public class ContainerImpl implements Container {
105+
private enum LocalizationCounter {
106+
// 1-to-1 correspondence with MR TaskCounter.LOCALIZED_*
107+
BYTES_MISSED,
108+
BYTES_CACHED,
109+
FILES_MISSED,
110+
FILES_CACHED,
111+
MILLIS;
112+
}
104113

105114
private static final class ReInitializationContext {
106115
private final ContainerLaunchContext newLaunchContext;
@@ -154,6 +163,9 @@ private ReInitializationContext createContextForRollback() {
154163
private final NMStateStoreService stateStore;
155164
private final Credentials credentials;
156165
private final NodeManagerMetrics metrics;
166+
private final long[] localizationCounts =
167+
new long[LocalizationCounter.values().length];
168+
157169
private volatile ContainerLaunchContext launchContext;
158170
private volatile ContainerTokenIdentifier containerTokenIdentifier;
159171
private final ContainerId containerId;
@@ -1197,6 +1209,12 @@ public ContainerState transition(ContainerImpl container,
11971209
}
11981210

11991211
container.containerLocalizationStartTime = clock.getTime();
1212+
// duration = end - start;
1213+
// record in RequestResourcesTransition: -start
1214+
// add in LocalizedTransition: +end
1215+
//
1216+
container.localizationCounts[LocalizationCounter.MILLIS.ordinal()]
1217+
= -Time.monotonicNow();
12001218

12011219
// Send requests for public, private resources
12021220
Map<String,LocalResource> cntrRsrc = ctxt.getLocalResources();
@@ -1243,6 +1261,21 @@ public ContainerState transition(ContainerImpl container,
12431261
return ContainerState.LOCALIZING;
12441262
}
12451263

1264+
final long localizedSize = rsrcEvent.getSize();
1265+
if (localizedSize > 0) {
1266+
container.localizationCounts
1267+
[LocalizationCounter.BYTES_MISSED.ordinal()] += localizedSize;
1268+
container.localizationCounts
1269+
[LocalizationCounter.FILES_MISSED.ordinal()]++;
1270+
} else if (localizedSize < 0) {
1271+
// cached: recorded negative, restore the sign
1272+
container.localizationCounts
1273+
[LocalizationCounter.BYTES_CACHED.ordinal()] -= localizedSize;
1274+
container.localizationCounts
1275+
[LocalizationCounter.FILES_CACHED.ordinal()]++;
1276+
}
1277+
container.metrics.localizationCacheHitMiss(localizedSize);
1278+
12461279
// check to see if this resource should be uploaded to the shared cache
12471280
// as well
12481281
if (shouldBeUploadedToSharedCache(container, resourceRequest)) {
@@ -1253,6 +1286,14 @@ public ContainerState transition(ContainerImpl container,
12531286
return ContainerState.LOCALIZING;
12541287
}
12551288

1289+
// duration = end - start;
1290+
// record in RequestResourcesTransition: -start
1291+
// add in LocalizedTransition: +end
1292+
//
1293+
container.localizationCounts[LocalizationCounter.MILLIS.ordinal()]
1294+
+= Time.monotonicNow();
1295+
container.metrics.localizationComplete(
1296+
container.localizationCounts[LocalizationCounter.MILLIS.ordinal()]);
12561297
container.dispatcher.getEventHandler().handle(
12571298
new ContainerLocalizationEvent(LocalizationEventType.
12581299
CONTAINER_RESOURCES_LOCALIZED, container));
@@ -2270,4 +2311,14 @@ public boolean isContainerInFinalStates() {
22702311
|| state == ContainerState.EXITED_WITH_FAILURE
22712312
|| state == ContainerState.EXITED_WITH_SUCCESS;
22722313
}
2314+
2315+
@Override
2316+
public String localizationCountersAsString() {
2317+
StringBuilder result =
2318+
new StringBuilder(String.valueOf(localizationCounts[0]));
2319+
for (int i = 1; i < localizationCounts.length; i++) {
2320+
result.append(',').append(localizationCounts[i]);
2321+
}
2322+
return result.toString();
2323+
}
22732324
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerResourceLocalizedEvent.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,11 @@ public class ContainerResourceLocalizedEvent extends ContainerResourceEvent {
2525

2626
private final Path loc;
2727

28+
// > 0: downloaded
29+
// < 0: cached
30+
//
31+
private long size;
32+
2833
public ContainerResourceLocalizedEvent(ContainerId container, LocalResourceRequest rsrc,
2934
Path loc) {
3035
super(container, ContainerEventType.RESOURCE_LOCALIZED, rsrc);
@@ -35,4 +40,12 @@ public Path getLocation() {
3540
return loc;
3641
}
3742

43+
public long getSize() {
44+
return size;
45+
}
46+
47+
public void setSize(long size) {
48+
this.size = size;
49+
}
50+
3851
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/ContainerLaunch.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,6 +1556,9 @@ public void sanitizeEnv(Map<String, String> environment, Path pwd,
15561556

15571557
addToEnvMap(environment, nmVars, Environment.PWD.name(), pwd.toString());
15581558

1559+
addToEnvMap(environment, nmVars, Environment.LOCALIZATION_COUNTERS.name(),
1560+
container.localizationCountersAsString());
1561+
15591562
if (!Shell.WINDOWS) {
15601563
addToEnvMap(environment, nmVars, "JVM_PID", "$$");
15611564
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/LocalizedResource.java

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -249,9 +249,11 @@ public void transition(LocalizedResource rsrc, ResourceEvent event) {
249249
Path.getPathWithoutSchemeAndAuthority(locEvent.getLocation());
250250
rsrc.size = locEvent.getSize();
251251
for (ContainerId container : rsrc.ref) {
252-
rsrc.dispatcher.getEventHandler().handle(
252+
final ContainerResourceLocalizedEvent localizedEvent =
253253
new ContainerResourceLocalizedEvent(
254-
container, rsrc.rsrc, rsrc.localPath));
254+
container, rsrc.rsrc, rsrc.localPath);
255+
localizedEvent.setSize(rsrc.size);
256+
rsrc.dispatcher.getEventHandler().handle(localizedEvent);
255257
}
256258
}
257259
}
@@ -286,9 +288,11 @@ public void transition(LocalizedResource rsrc, ResourceEvent event) {
286288
ResourceRequestEvent reqEvent = (ResourceRequestEvent) event;
287289
ContainerId container = reqEvent.getContext().getContainerId();
288290
rsrc.ref.add(container);
289-
rsrc.dispatcher.getEventHandler().handle(
291+
final ContainerResourceLocalizedEvent localizedEvent =
290292
new ContainerResourceLocalizedEvent(
291-
container, rsrc.rsrc, rsrc.localPath));
293+
container, rsrc.rsrc, rsrc.localPath);
294+
localizedEvent.setSize(-rsrc.size);
295+
rsrc.dispatcher.getEventHandler().handle(localizedEvent);
292296
}
293297
}
294298

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
import org.apache.hadoop.metrics2.annotation.Metrics;
2323
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
2424
import org.apache.hadoop.metrics2.lib.MutableCounterInt;
25+
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
2526
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
2627
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
2728
import org.apache.hadoop.metrics2.lib.MutableGaugeFloat;
@@ -91,6 +92,21 @@ public class NodeManagerMetrics {
9192
@Metric("Current CPU utilization")
9293
MutableGaugeFloat nodeCpuUtilization;
9394

95+
@Metric("Missed localization requests in bytes")
96+
MutableCounterLong localizedCacheMissBytes;
97+
@Metric("Cached localization requests in bytes")
98+
MutableCounterLong localizedCacheHitBytes;
99+
@Metric("Localization cache hit ratio (bytes)")
100+
MutableGaugeInt localizedCacheHitBytesRatio;
101+
@Metric("Missed localization requests (files)")
102+
MutableCounterLong localizedCacheMissFiles;
103+
@Metric("Cached localization requests (files)")
104+
MutableCounterLong localizedCacheHitFiles;
105+
@Metric("Localization cache hit ratio (files)")
106+
MutableGaugeInt localizedCacheHitFilesRatio;
107+
@Metric("Container localization time in milliseconds")
108+
MutableRate localizationDurationMillis;
109+
94110
// CHECKSTYLE:ON:VisibilityModifier
95111

96112
private JvmMetrics jvmMetrics = null;
@@ -377,4 +393,38 @@ public float getNodeCpuUtilization() {
377393
public void setNodeCpuUtilization(float cpuUtilization) {
378394
this.nodeCpuUtilization.set(cpuUtilization);
379395
}
396+
397+
private void updateLocalizationHitRatios() {
398+
updateLocalizationHitRatio(localizedCacheHitBytes, localizedCacheMissBytes,
399+
localizedCacheHitBytesRatio);
400+
updateLocalizationHitRatio(localizedCacheHitFiles, localizedCacheMissFiles,
401+
localizedCacheHitFilesRatio);
402+
}
403+
404+
private static void updateLocalizationHitRatio(MutableCounterLong hitCounter,
405+
MutableCounterLong missedCounter, MutableGaugeInt ratioGauge) {
406+
final long hits = hitCounter.value();
407+
final long misses = missedCounter.value();
408+
final long total = hits + misses;
409+
if (total > 0) {
410+
ratioGauge.set((int)(100 * hits / total));
411+
}
412+
}
413+
414+
public void localizationCacheHitMiss(long size) {
415+
if (size > 0) {
416+
localizedCacheMissBytes.incr(size);
417+
localizedCacheMissFiles.incr();
418+
updateLocalizationHitRatios();
419+
} else if (size < 0) {
420+
// cached: recorded negative, restore the sign
421+
localizedCacheHitBytes.incr(-size);
422+
localizedCacheHitFiles.incr();
423+
updateLocalizationHitRatios();
424+
}
425+
}
426+
427+
public void localizationComplete(long downloadMillis) {
428+
localizationDurationMillis.add(downloadMillis);
429+
}
380430
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020

2121
import org.apache.hadoop.yarn.server.api.AuxiliaryLocalPathHandler;
2222
import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
23+
import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
24+
import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
25+
import static org.apache.hadoop.test.MetricsAsserts.assertGaugeGt;
26+
import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
2327
import static org.junit.Assert.assertEquals;
2428
import static org.junit.Assert.assertTrue;
2529
import static org.junit.Assert.fail;
@@ -38,15 +42,18 @@
3842
import java.nio.ByteBuffer;
3943
import java.util.ArrayList;
4044
import java.util.Arrays;
45+
import java.util.Collections;
4146
import java.util.HashMap;
4247
import java.util.HashSet;
4348
import java.util.List;
4449
import java.util.Map;
4550

4651
import java.util.function.Supplier;
4752
import org.apache.hadoop.fs.FileContext;
53+
import org.apache.hadoop.fs.FileUtil;
4854
import org.apache.hadoop.fs.Path;
4955
import org.apache.hadoop.fs.UnsupportedFileSystemException;
56+
import org.apache.hadoop.metrics2.MetricsRecordBuilder;
5057
import org.apache.hadoop.security.UserGroupInformation;
5158
import org.apache.hadoop.service.Service;
5259
import org.apache.hadoop.test.GenericTestUtils;
@@ -316,6 +323,39 @@ public void testContainerSetup() throws Exception {
316323
BufferedReader reader = new BufferedReader(new FileReader(targetFile));
317324
Assert.assertEquals("Hello World!", reader.readLine());
318325
Assert.assertEquals(null, reader.readLine());
326+
327+
//
328+
// check the localization counter
329+
//
330+
long targetFileSize =
331+
FileUtil.getDU(targetFile.getCanonicalFile().getParentFile());
332+
MetricsRecordBuilder rb = getMetrics("NodeManagerMetrics");
333+
assertCounter("LocalizedCacheMissBytes", targetFileSize, rb);
334+
assertCounter("LocalizedCacheHitBytes", 0L, rb);
335+
assertCounter("LocalizedCacheMissFiles", 1L, rb);
336+
assertCounter("LocalizedCacheHitFiles", 0L, rb);
337+
assertGaugeGt("LocalizationDurationMillisAvgTime", 0, rb);
338+
assertGauge("LocalizedCacheHitBytesRatio", 0, rb);
339+
assertGauge("LocalizedCacheHitFilesRatio", 0, rb);
340+
341+
// test cache being used
342+
final ContainerId cid1 = createContainerId(1);
343+
containerManager.startContainers(StartContainersRequest.newInstance(
344+
Collections.singletonList(
345+
StartContainerRequest.newInstance(
346+
containerLaunchContext,
347+
createContainerToken(cid1, DUMMY_RM_IDENTIFIER,
348+
context.getNodeId(),
349+
user,
350+
context.getContainerTokenSecretManager())))));
351+
waitForContainerState(containerManager, cid1, ContainerState.COMPLETE);
352+
rb = getMetrics("NodeManagerMetrics");
353+
assertCounter("LocalizedCacheMissBytes", targetFileSize, rb);
354+
assertCounter("LocalizedCacheHitBytes", targetFileSize, rb);
355+
assertCounter("LocalizedCacheMissFiles", 1L, rb);
356+
assertCounter("LocalizedCacheHitFiles", 1L, rb);
357+
assertGauge("LocalizedCacheHitBytesRatio", 50, rb);
358+
assertGauge("LocalizedCacheHitFilesRatio", 50, rb);
319359
}
320360

321361
@Test (timeout = 10000L)

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -856,6 +856,7 @@ private void verifyTailErrorLogOnContainerExit(Configuration conf,
856856
.newContainerId(ApplicationAttemptId.newInstance(appId, 1), 1);
857857
when(container.getContainerId()).thenReturn(containerId);
858858
when(container.getUser()).thenReturn("test");
859+
when(container.localizationCountersAsString()).thenReturn("");
859860
String relativeContainerLogDir = ContainerLaunch.getRelativeContainerLogDir(
860861
appId.toString(), containerId.toString());
861862
Path containerLogDir =

0 commit comments

Comments
 (0)