Skip to content

Commit 40b2569

Browse files
committed
YARN-3850. NM fails to read files from full disks which can lead to container logs being lost and other issues. Contributed by Varun Saxena
1 parent 8ef07f7 commit 40b2569

File tree

7 files changed

+93
-19
lines changed

7 files changed

+93
-19
lines changed

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,9 @@ Release 2.7.1 - UNRELEASED
716716
YARN-3832. Resource Localization fails on a cluster due to existing cache
717717
directories (Brahma Reddy Battula via jlowe)
718718

719+
YARN-3850. NM fails to read files from full disks which can lead to
720+
container logs being lost and other issues (Varun Saxena via jlowe)
721+
719722
Release 2.7.0 - 2015-04-20
720723

721724
INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,18 @@ public List<String> getDiskFullLogDirs() {
237237
return logDirs.getFullDirs();
238238
}
239239

240+
/**
241+
* Function to get the local dirs which should be considered for reading
242+
* existing files on disk. Contains the good local dirs and the local dirs
243+
* that have reached the disk space limit
244+
*
245+
* @return the local dirs which should be considered for reading
246+
*/
247+
public List<String> getLocalDirsForRead() {
248+
return DirectoryCollection.concat(localDirs.getGoodDirs(),
249+
localDirs.getFullDirs());
250+
}
251+
240252
/**
241253
* Function to get the local dirs which should be considered when cleaning up
242254
* resources. Contains the good local dirs and the local dirs that have reached
@@ -249,6 +261,18 @@ public List<String> getLocalDirsForCleanup() {
249261
localDirs.getFullDirs());
250262
}
251263

264+
/**
265+
* Function to get the log dirs which should be considered for reading
266+
* existing files on disk. Contains the good log dirs and the log dirs that
267+
* have reached the disk space limit
268+
*
269+
* @return the log dirs which should be considered for reading
270+
*/
271+
public List<String> getLogDirsForRead() {
272+
return DirectoryCollection.concat(logDirs.getGoodDirs(),
273+
logDirs.getFullDirs());
274+
}
275+
252276
/**
253277
* Function to get the log dirs which should be considered when cleaning up
254278
* resources. Contains the good log dirs and the log dirs that have reached

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ public Integer call() {
126126

127127
private File locatePidFile(String appIdStr, String containerIdStr) {
128128
String pidSubpath= getPidFileSubpath(appIdStr, containerIdStr);
129-
for (String dir : getContext().getLocalDirsHandler().getLocalDirs()) {
129+
for (String dir : getContext().getLocalDirsHandler().
130+
getLocalDirsForRead()) {
130131
File pidFile = new File(dir, pidSubpath);
131132
if (pidFile.exists()) {
132133
return pidFile;

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -595,10 +595,10 @@ public Set<Path> doContainerLogAggregation(LogWriter writer,
595595
boolean appFinished) {
596596
LOG.info("Uploading logs for container " + containerId
597597
+ ". Current good log dirs are "
598-
+ StringUtils.join(",", dirsHandler.getLogDirs()));
598+
+ StringUtils.join(",", dirsHandler.getLogDirsForRead()));
599599
final LogKey logKey = new LogKey(containerId);
600600
final LogValue logValue =
601-
new LogValue(dirsHandler.getLogDirs(), containerId,
601+
new LogValue(dirsHandler.getLogDirsForRead(), containerId,
602602
userUgi.getShortUserName(), logAggregationContext,
603603
this.uploadedFileMeta, appFinished);
604604
try {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ public static List<File> getContainerLogDirs(ContainerId containerId,
7474

7575
static List<File> getContainerLogDirs(ContainerId containerId,
7676
LocalDirsHandlerService dirsHandler) throws YarnException {
77-
List<String> logDirs = dirsHandler.getLogDirs();
77+
List<String> logDirs = dirsHandler.getLogDirsForRead();
7878
List<File> containerLogDirs = new ArrayList<File>(logDirs.size());
7979
for (String logDir : logDirs) {
8080
logDir = new File(logDir).toURI().getPath();

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -177,22 +177,11 @@ public void tearDown() throws IOException, InterruptedException {
177177
dispatcher.close();
178178
}
179179

180-
@Test
181-
public void testLocalFileDeletionAfterUpload() throws Exception {
182-
this.delSrvc = new DeletionService(createContainerExecutor());
183-
delSrvc = spy(delSrvc);
184-
this.delSrvc.init(conf);
185-
this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath());
186-
this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
187-
this.remoteRootLogDir.getAbsolutePath());
188-
189-
LogAggregationService logAggregationService = spy(
190-
new LogAggregationService(dispatcher, this.context, this.delSrvc,
191-
super.dirsHandler));
180+
private void verifyLocalFileDeletion(
181+
LogAggregationService logAggregationService) throws Exception {
192182
logAggregationService.init(this.conf);
193183
logAggregationService.start();
194184

195-
196185
ApplicationId application1 = BuilderUtils.newApplicationId(1234, 1);
197186

198187
// AppLogDir should be created
@@ -252,9 +241,46 @@ public void testLocalFileDeletionAfterUpload() throws Exception {
252241
ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED)
253242
};
254243

255-
checkEvents(appEventHandler, expectedEvents, true, "getType", "getApplicationID");
244+
checkEvents(appEventHandler, expectedEvents, true, "getType",
245+
"getApplicationID");
256246
}
257247

248+
@Test
249+
public void testLocalFileDeletionAfterUpload() throws Exception {
250+
this.delSrvc = new DeletionService(createContainerExecutor());
251+
delSrvc = spy(delSrvc);
252+
this.delSrvc.init(conf);
253+
this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath());
254+
this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
255+
this.remoteRootLogDir.getAbsolutePath());
256+
257+
LogAggregationService logAggregationService = spy(
258+
new LogAggregationService(dispatcher, this.context, this.delSrvc,
259+
super.dirsHandler));
260+
verifyLocalFileDeletion(logAggregationService);
261+
}
262+
263+
@Test
264+
public void testLocalFileDeletionOnDiskFull() throws Exception {
265+
this.delSrvc = new DeletionService(createContainerExecutor());
266+
delSrvc = spy(delSrvc);
267+
this.delSrvc.init(conf);
268+
this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath());
269+
this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR,
270+
this.remoteRootLogDir.getAbsolutePath());
271+
List<String> logDirs = super.dirsHandler.getLogDirs();
272+
LocalDirsHandlerService dirsHandler = spy(super.dirsHandler);
273+
// Simulate disk being full by returning no good log dirs but having a
274+
// directory in full log dirs.
275+
when(dirsHandler.getLogDirs()).thenReturn(new ArrayList<String>());
276+
when(dirsHandler.getLogDirsForRead()).thenReturn(logDirs);
277+
LogAggregationService logAggregationService = spy(
278+
new LogAggregationService(dispatcher, this.context, this.delSrvc,
279+
dirsHandler));
280+
verifyLocalFileDeletion(logAggregationService);
281+
}
282+
283+
258284
@Test
259285
public void testNoContainerOnNode() throws Exception {
260286
this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath());

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import static org.junit.Assume.assumeTrue;
2222
import static org.mockito.Mockito.mock;
23+
import static org.mockito.Mockito.spy;
2324
import static org.mockito.Mockito.when;
2425
import static org.mockito.Mockito.verify;
2526

@@ -29,6 +30,7 @@
2930
import java.io.IOException;
3031
import java.io.PrintWriter;
3132
import java.util.ArrayList;
33+
import java.util.Arrays;
3234
import java.util.HashMap;
3335
import java.util.List;
3436
import java.util.Map;
@@ -122,6 +124,24 @@ public void testContainerLogDirs() throws IOException, YarnException {
122124
Assert.assertNull(nmContext.getContainers().get(container1));
123125
files = ContainerLogsUtils.getContainerLogDirs(container1, user, nmContext);
124126
Assert.assertTrue(!(files.get(0).toString().contains("file:")));
127+
128+
// Create a new context to check if correct container log dirs are fetched
129+
// on full disk.
130+
LocalDirsHandlerService dirsHandlerForFullDisk = spy(dirsHandler);
131+
// good log dirs are empty and nm log dir is in the full log dir list.
132+
when(dirsHandlerForFullDisk.getLogDirs()).
133+
thenReturn(new ArrayList<String>());
134+
when(dirsHandlerForFullDisk.getLogDirsForRead()).
135+
thenReturn(Arrays.asList(new String[] {absLogDir.getAbsolutePath()}));
136+
nmContext = new NodeManager.NMContext(null, null, dirsHandlerForFullDisk,
137+
new ApplicationACLsManager(conf), new NMNullStateStoreService());
138+
nmContext.getApplications().put(appId, app);
139+
container.setState(ContainerState.RUNNING);
140+
nmContext.getContainers().put(container1, container);
141+
List<File> dirs =
142+
ContainerLogsUtils.getContainerLogDirs(container1, user, nmContext);
143+
File containerLogDir = new File(absLogDir, appId + "/" + container1);
144+
Assert.assertTrue(dirs.contains(containerLogDir));
125145
}
126146

127147
@Test(timeout = 10000)
@@ -231,7 +251,7 @@ public void testLogDirWithDriveLetter() throws Exception {
231251
LocalDirsHandlerService localDirs = mock(LocalDirsHandlerService.class);
232252
List<String> logDirs = new ArrayList<String>();
233253
logDirs.add("F:/nmlogs");
234-
when(localDirs.getLogDirs()).thenReturn(logDirs);
254+
when(localDirs.getLogDirsForRead()).thenReturn(logDirs);
235255

236256
ApplicationIdPBImpl appId = mock(ApplicationIdPBImpl.class);
237257
when(appId.toString()).thenReturn("app_id_1");

0 commit comments

Comments
 (0)