Skip to content

Commit 4d4b099

Browse files
authored
YARN-11534. Fixed exception handling when container signalling is interrupted (#5864)
1 parent 130bd03 commit 4d4b099

File tree

5 files changed

+67
-9
lines changed

5 files changed

+67
-9
lines changed

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import java.io.File;
4949
import java.io.FileOutputStream;
5050
import java.io.IOException;
51+
import java.io.InterruptedIOException;
5152
import java.net.InetSocketAddress;
5253
import java.util.ArrayList;
5354
import java.util.Arrays;
@@ -787,9 +788,18 @@ public boolean signalContainer(ContainerSignalContext ctx)
787788
LOG.warn("Error in signalling container {} with {}; exit = {}",
788789
pid, signal, retCode, e);
789790
logOutput(e.getOutput());
790-
throw new IOException("Problem signalling container " + pid + " with "
791-
+ signal + "; output: " + e.getOutput() + " and exitCode: "
792-
+ retCode, e);
791+
792+
// In ContainerExecutionException -1 is the default value for the exit code.
793+
// If it remained unset, we can treat the signalling as interrupted.
794+
if (retCode == ContainerExecutionException.getDefaultExitCode()) {
795+
throw new InterruptedIOException("Signalling container " + pid + " with "
796+
+ signal + " is interrupted; output: " + e.getOutput() + " and exitCode: "
797+
+ retCode);
798+
} else {
799+
throw new IOException("Problem signalling container " + pid + " with "
800+
+ signal + "; output: " + e.getOutput() + " and exitCode: "
801+
+ retCode, e);
802+
}
793803
}
794804
return true;
795805
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public Integer call() {
6868

6969
dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
7070
ContainerEventType.RECOVER_PAUSED_CONTAINER));
71-
boolean notInterrupted = true;
71+
boolean interrupted = false;
7272
try {
7373
File pidFile = locatePidFile(appIdStr, containerIdStr);
7474
if (pidFile != null) {
@@ -87,11 +87,11 @@ public Integer call() {
8787

8888
} catch (InterruptedException | InterruptedIOException e) {
8989
LOG.warn("Interrupted while waiting for exit code from " + containerId);
90-
notInterrupted = false;
90+
interrupted = true;
9191
} catch (IOException e) {
9292
LOG.error("Unable to kill the paused container " + containerIdStr, e);
9393
} finally {
94-
if (notInterrupted) {
94+
if (!interrupted) {
9595
this.completed.set(true);
9696
exec.deactivateContainer(containerId);
9797
try {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ public Integer call() {
7474
dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
7575
ContainerEventType.CONTAINER_LAUNCHED));
7676

77-
boolean notInterrupted = true;
77+
boolean interrupted = false;
7878
try {
7979
File pidFile = locatePidFile(appIdStr, containerIdStr);
8080
if (pidFile != null) {
@@ -92,11 +92,11 @@ public Integer call() {
9292
}
9393
} catch (InterruptedException | InterruptedIOException e) {
9494
LOG.warn("Interrupted while waiting for exit code from " + containerId);
95-
notInterrupted = false;
95+
interrupted = true;
9696
} catch (IOException e) {
9797
LOG.error("Unable to recover container " + containerIdStr, e);
9898
} finally {
99-
if (notInterrupted) {
99+
if (!interrupted) {
100100
this.completed.set(true);
101101
exec.deactivateContainer(containerId);
102102
try {

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,8 @@ public String getErrorOutput() {
8888
return errorOutput;
8989
}
9090

91+
/**
 * Exposes the sentinel this class uses for an exit code that was never set,
 * so callers can compare against it without reaching for the constant.
 *
 * @return the value of {@code EXIT_CODE_UNSET}
 */
public static int getDefaultExitCode() {
92+
return EXIT_CODE_UNSET;
93+
}
94+
9195
}

hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,19 @@
2525
import static org.junit.Assert.assertNotNull;
2626
import static org.junit.Assert.assertTrue;
2727
import static org.junit.Assert.fail;
28+
import static org.junit.jupiter.api.Assertions.assertThrows;
2829
import static org.mockito.ArgumentMatchers.any;
30+
import static org.mockito.Mockito.doNothing;
31+
import static org.mockito.Mockito.doThrow;
2932
import static org.mockito.Mockito.mock;
3033
import static org.mockito.Mockito.spy;
3134
import static org.mockito.Mockito.times;
3235
import static org.mockito.Mockito.verify;
3336
import static org.mockito.Mockito.when;
3437

3538
import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntime;
39+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
40+
import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeContext;
3641
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerExecContext;
3742
import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerReapContext;
3843
import org.slf4j.Logger;
@@ -41,6 +46,7 @@
4146
import java.io.File;
4247
import java.io.FileOutputStream;
4348
import java.io.IOException;
49+
import java.io.InterruptedIOException;
4450
import java.io.PrintWriter;
4551
import java.net.InetSocketAddress;
4652
import java.util.ArrayList;
@@ -725,6 +731,44 @@ public void testGetLocalResources() throws Exception {
725731
verify(lce, times(1)).getLocalResources(container);
726732
}
727733

734+
@Test
735+
public void testSignalContainerFailureWhenExitCodeIsPresentInTheException()
736+
throws ContainerExecutionException {
737+
LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class);
738+
LinuxContainerExecutor containerExecutor = spy(new LinuxContainerExecutor(
739+
containerRuntime));
740+
ContainerSignalContext signalContext = new ContainerSignalContext.Builder().build();
741+
ContainerExecutionException testException =
742+
new ContainerExecutionException("exceptionWithExitCode", 123);
743+
744+
doNothing().when(containerExecutor).verifyUsernamePattern(any());
745+
doThrow(testException)
746+
.when(containerRuntime)
747+
.signalContainer(any(ContainerRuntimeContext.class));
748+
749+
assertThrows(IOException.class,
750+
() -> containerExecutor.signalContainer(signalContext));
751+
}
752+
753+
@Test
754+
public void testSignalContainerFailureWhenExitCodeIsNotPresentInTheException()
755+
throws ContainerExecutionException {
756+
LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class);
757+
LinuxContainerExecutor containerExecutor = spy(new LinuxContainerExecutor(
758+
containerRuntime));
759+
ContainerSignalContext signalContext = new ContainerSignalContext.Builder().build();
760+
ContainerExecutionException testException =
761+
new ContainerExecutionException("exceptionWithoutExitCode");
762+
763+
doNothing().when(containerExecutor).verifyUsernamePattern(any());
764+
doThrow(testException)
765+
.when(containerRuntime)
766+
.signalContainer(any(ContainerRuntimeContext.class));
767+
768+
assertThrows(InterruptedIOException.class,
769+
() -> containerExecutor.signalContainer(signalContext));
770+
}
771+
728772
@Deprecated
729773
private static class TestResourceHandler implements LCEResourcesHandler {
730774
static Set<ContainerId> postExecContainers = new HashSet<ContainerId>();

0 commit comments

Comments
 (0)