Skip to content

Commit 3651fe1

Browse files
committed
YARN-2853. Fixed a bug in ResourceManager causing apps to hang when the user kill request races with ApplicationMaster finish. Contributed by Jian He.
1 parent 33ea5ae commit 3651fe1

File tree

5 files changed

+136
-9
lines changed

5 files changed

+136
-9
lines changed

hadoop-yarn-project/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ Release 2.7.0 - UNRELEASED
8787
YARN-2603. ApplicationConstants missing HADOOP_MAPRED_HOME (Ray Chiang via
8888
aw)
8989

90+
YARN-2853. Fixed a bug in ResourceManager causing apps to hang when the user
91+
kill request races with ApplicationMaster finish. (Jian He via vinodkv)
92+
9093
Release 2.6.0 - 2014-11-15
9194

9295
INCOMPATIBLE CHANGES

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ApplicationMasterService.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ public FinishApplicationMasterResponse finishApplicationMaster(
348348
// ApplicationDoesNotExistInCacheException before and after
349349
// RM work-preserving restart.
350350
if (rmApp.isAppFinalStateStored()) {
351+
LOG.info(rmApp.getApplicationId() + " unregistered successfully. ");
351352
return FinishApplicationMasterResponse.newInstance(true);
352353
}
353354

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -293,13 +293,23 @@ RMAppEventType.KILL, new KillAttemptTransition())
293293
RMAppEventType.ATTEMPT_KILLED,
294294
new FinalSavingTransition(
295295
new AppKilledTransition(), RMAppState.KILLED))
296+
.addTransition(RMAppState.KILLING, RMAppState.FINAL_SAVING,
297+
RMAppEventType.ATTEMPT_UNREGISTERED,
298+
new FinalSavingTransition(
299+
new AttemptUnregisteredTransition(),
300+
RMAppState.FINISHING, RMAppState.FINISHED))
301+
.addTransition(RMAppState.KILLING, RMAppState.FINISHED,
302+
// UnManagedAM directly jumps to finished
303+
RMAppEventType.ATTEMPT_FINISHED, FINISHED_TRANSITION)
304+
.addTransition(RMAppState.KILLING,
305+
EnumSet.of(RMAppState.FINAL_SAVING),
306+
RMAppEventType.ATTEMPT_FAILED,
307+
new AttemptFailedTransition(RMAppState.KILLING))
308+
296309
.addTransition(RMAppState.KILLING, RMAppState.KILLING,
297310
EnumSet.of(
298311
RMAppEventType.NODE_UPDATE,
299312
RMAppEventType.ATTEMPT_REGISTERED,
300-
RMAppEventType.ATTEMPT_UNREGISTERED,
301-
RMAppEventType.ATTEMPT_FINISHED,
302-
RMAppEventType.ATTEMPT_FAILED,
303313
RMAppEventType.APP_UPDATE_SAVED,
304314
RMAppEventType.KILL, RMAppEventType.MOVE))
305315

@@ -1199,6 +1209,14 @@ public RMAppState transition(RMAppImpl app, RMAppEvent event) {
11991209
+ app.maxAppAttempts);
12001210
if (!app.submissionContext.getUnmanagedAM()
12011211
&& numberOfFailure < app.maxAppAttempts) {
1212+
if (initialState.equals(RMAppState.KILLING)) {
1213+
// If this is not last attempt, app should be killed instead of
1214+
// launching a new attempt
1215+
app.rememberTargetTransitionsAndStoreState(event,
1216+
new AppKilledTransition(), RMAppState.KILLED, RMAppState.KILLED);
1217+
return RMAppState.FINAL_SAVING;
1218+
}
1219+
12021220
boolean transferStateFromPreviousAttempt;
12031221
RMAppFailedAttemptEvent failedEvent = (RMAppFailedAttemptEvent) event;
12041222
transferStateFromPreviousAttempt =

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRM.java

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import org.junit.Before;
2222
import static org.mockito.Matchers.argThat;
2323
import static org.mockito.Mockito.doNothing;
24+
import static org.mockito.Mockito.doAnswer;
2425
import static org.mockito.Mockito.spy;
2526

2627
import java.util.ArrayList;
@@ -37,16 +38,19 @@
3738
import org.apache.commons.logging.Log;
3839
import org.apache.commons.logging.LogFactory;
3940
import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
41+
import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
4042
import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationReportRequest;
4143
import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationsRequest;
4244
import org.apache.hadoop.yarn.api.protocolrecords.GetApplicationsResponse;
4345
import org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse;
4446
import org.apache.hadoop.yarn.api.protocolrecords.KillApplicationRequest;
4547
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
48+
import org.apache.hadoop.yarn.api.records.ApplicationId;
4649
import org.apache.hadoop.yarn.api.records.ApplicationReport;
4750
import org.apache.hadoop.yarn.api.records.Container;
4851
import org.apache.hadoop.yarn.api.records.ContainerId;
4952
import org.apache.hadoop.yarn.api.records.ContainerState;
53+
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
5054
import org.apache.hadoop.yarn.api.records.NMToken;
5155
import org.apache.hadoop.yarn.api.records.NodeId;
5256
import org.apache.hadoop.yarn.api.records.ResourceRequest;
@@ -57,6 +61,8 @@
5761
import org.apache.hadoop.yarn.event.AsyncDispatcher;
5862
import org.apache.hadoop.yarn.event.Dispatcher;
5963
import org.apache.hadoop.yarn.event.EventHandler;
64+
import org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM;
65+
import org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore;
6066
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
6167
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEvent;
6268
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppEventType;
@@ -73,6 +79,8 @@
7379
import org.apache.log4j.Logger;
7480
import org.junit.Test;
7581
import org.mockito.ArgumentMatcher;
82+
import org.mockito.invocation.InvocationOnMock;
83+
import org.mockito.stubbing.Answer;
7684

7785
@SuppressWarnings({"unchecked", "rawtypes"})
7886
public class TestRM extends ParameterizedSchedulerTestBase {
@@ -638,4 +646,107 @@ protected Dispatcher createDispatcher() {
638646
Assert.assertEquals(appsSubmitted + 1, metrics.getAppsSubmitted());
639647
}
640648

649+
// Test killing an app while the app is concurrently finishing.
650+
@Test (timeout = 30000)
651+
public void testKillFinishingApp() throws Exception{
652+
653+
// this dispatcher ignores RMAppAttemptEventType.KILL event
654+
final Dispatcher dispatcher = new AsyncDispatcher() {
655+
@Override
656+
public EventHandler getEventHandler() {
657+
658+
class EventArgMatcher extends ArgumentMatcher<AbstractEvent> {
659+
@Override
660+
public boolean matches(Object argument) {
661+
if (argument instanceof RMAppAttemptEvent) {
662+
if (((RMAppAttemptEvent) argument).getType().equals(
663+
RMAppAttemptEventType.KILL)) {
664+
return true;
665+
}
666+
}
667+
return false;
668+
}
669+
}
670+
671+
EventHandler handler = spy(super.getEventHandler());
672+
doNothing().when(handler).handle(argThat(new EventArgMatcher()));
673+
return handler;
674+
}
675+
};
676+
677+
MockRM rm1 = new MockRM(conf){
678+
@Override
679+
protected Dispatcher createDispatcher() {
680+
return dispatcher;
681+
}
682+
};
683+
rm1.start();
684+
MockNM nm1 =
685+
new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
686+
nm1.registerNode();
687+
RMApp app1 = rm1.submitApp(200);
688+
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
689+
690+
rm1.killApp(app1.getApplicationId());
691+
692+
FinishApplicationMasterRequest req =
693+
FinishApplicationMasterRequest.newInstance(
694+
FinalApplicationStatus.SUCCEEDED, "", "");
695+
am1.unregisterAppAttempt(req,true);
696+
697+
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FINISHING);
698+
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
699+
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
700+
rm1.waitForState(app1.getApplicationId(), RMAppState.FINISHED);
701+
}
702+
703+
// Test killing an app while the app is concurrently failing.
704+
@Test (timeout = 30000)
705+
public void testKillFailingApp() throws Exception{
706+
707+
// this dispatcher ignores RMAppAttemptEventType.KILL event
708+
final Dispatcher dispatcher = new AsyncDispatcher() {
709+
@Override
710+
public EventHandler getEventHandler() {
711+
712+
class EventArgMatcher extends ArgumentMatcher<AbstractEvent> {
713+
@Override
714+
public boolean matches(Object argument) {
715+
if (argument instanceof RMAppAttemptEvent) {
716+
if (((RMAppAttemptEvent) argument).getType().equals(
717+
RMAppAttemptEventType.KILL)) {
718+
return true;
719+
}
720+
}
721+
return false;
722+
}
723+
}
724+
725+
EventHandler handler = spy(super.getEventHandler());
726+
doNothing().when(handler).handle(argThat(new EventArgMatcher()));
727+
return handler;
728+
}
729+
};
730+
731+
MockRM rm1 = new MockRM(conf){
732+
@Override
733+
protected Dispatcher createDispatcher() {
734+
return dispatcher;
735+
}
736+
};
737+
rm1.start();
738+
MockNM nm1 =
739+
new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
740+
nm1.registerNode();
741+
RMApp app1 = rm1.submitApp(200);
742+
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
743+
744+
rm1.killApp(app1.getApplicationId());
745+
746+
// fail the app by sending container_finished event.
747+
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
748+
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
749+
// app is killed, not launching a new attempt
750+
rm1.waitForState(app1.getApplicationId(), RMAppState.KILLED);
751+
}
641752
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -726,12 +726,6 @@ public void testAppRunningKill() throws IOException {
726726
application.handle(event);
727727
rmDispatcher.await();
728728

729-
// Ignore Attempt_Finished if we were supposed to go to Finished.
730-
assertAppState(RMAppState.KILLING, application);
731-
RMAppEvent finishEvent =
732-
new RMAppFinishedAttemptEvent(application.getApplicationId(), null);
733-
application.handle(finishEvent);
734-
assertAppState(RMAppState.KILLING, application);
735729
sendAttemptUpdateSavedEvent(application);
736730
sendAppUpdateSavedEvent(application);
737731
assertKilled(application);

0 commit comments

Comments
 (0)