From 5069cc15c593abdbad1b27791f92b1f1db9a80d5 Mon Sep 17 00:00:00 2001 From: Peter Szucs Date: Thu, 20 Jul 2023 13:06:45 +0200 Subject: [PATCH 1/4] YARN-11534. Fixed exception handling when container signalling is interrupted Change-Id: I5cc10436be452ee51c604acd60d92b49fa10a75b --- .../nodemanager/LinuxContainerExecutor.java | 16 +++++++++++++--- .../launcher/RecoverPausedContainerLaunch.java | 6 +++--- .../launcher/RecoveredContainerLaunch.java | 6 +++--- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index ea4595dffc48d..14321accaa5f2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -48,6 +48,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InterruptedIOException; import java.net.InetSocketAddress; import java.util.ArrayList; import java.util.Arrays; @@ -787,9 +788,18 @@ public boolean signalContainer(ContainerSignalContext ctx) LOG.warn("Error in signalling container {} with {}; exit = {}", pid, signal, retCode, e); logOutput(e.getOutput()); - throw new IOException("Problem signalling container " + pid + " with " - + signal + "; output: " + e.getOutput() + " and exitCode: " - + retCode, e); + + // In ContainerExecutionException -1 is the default value for the exit code. + // If it remained unset, we can treat the signalling as interrupted. + if (retCode == -1) { + throw new InterruptedIOException("Signalling container " + pid + " with " + + signal + " is interrupted; output: " + e.getOutput() + " and exitCode: " + + retCode); + } else { + throw new IOException("Problem signalling container " + pid + " with " + + signal + "; output: " + e.getOutput() + " and exitCode: " + + retCode, e); + } } return true; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java index c678c91860d6e..a56a4531de413 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java @@ -68,7 +68,7 @@ public Integer call() { dispatcher.getEventHandler().handle(new ContainerEvent(containerId, ContainerEventType.RECOVER_PAUSED_CONTAINER)); - boolean notInterrupted = true; + boolean interrupted = false; try { File pidFile = locatePidFile(appIdStr, containerIdStr); if (pidFile != null) { @@ -87,11 +87,11 @@ public Integer call() { } catch (InterruptedException | InterruptedIOException e) { LOG.warn("Interrupted while waiting for exit code from " + containerId); - notInterrupted = false; + interrupted = true; } catch (IOException e) { LOG.error("Unable to kill the paused container " + containerIdStr, e); } finally { - if (notInterrupted) { + if (!interrupted) { this.completed.set(true); exec.deactivateContainer(containerId); try { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java index 17ddd77857fbd..0dcffe6afe538 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java @@ -74,7 +74,7 @@ public Integer call() { dispatcher.getEventHandler().handle(new ContainerEvent(containerId, ContainerEventType.CONTAINER_LAUNCHED)); - boolean notInterrupted = true; + boolean interrupted = false; try { File pidFile = locatePidFile(appIdStr, containerIdStr); if (pidFile != null) { @@ -92,11 +92,11 @@ public Integer call() { } } catch (InterruptedException | InterruptedIOException e) { LOG.warn("Interrupted while waiting for exit code from " + containerId); - notInterrupted = false; + interrupted = true; } catch (IOException e) { LOG.error("Unable to recover container " + containerIdStr, e); } finally { - if (notInterrupted) { + if (!interrupted) { this.completed.set(true); exec.deactivateContainer(containerId); try { From ce2c183455df5710ae2655565f0312b6ffd82589 Mon Sep 17 00:00:00 2001 From: Peter Szucs Date: Thu, 20 Jul 2023 17:07:56 +0200 Subject: [PATCH 2/4] YARN-11534. Added unit tests Change-Id: I379f76d5c45f7371a81fb6a01bbb4f2807f27ef6 --- .../nodemanager/LinuxContainerExecutor.java | 2 +- .../TestLinuxContainerExecutor.java | 44 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 14321accaa5f2..77ae9e2b1ad51 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -788,7 +788,7 @@ public boolean signalContainer(ContainerSignalContext ctx) LOG.warn("Error in signalling container {} with {}; exit = {}", pid, signal, retCode, e); logOutput(e.getOutput()); - + // In ContainerExecutionException -1 is the default value for the exit code. // If it remained unset, we can treat the signalling as interrupted. if (retCode == -1) { diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java index 8918588bf373f..e506b71fed7eb 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java @@ -25,7 +25,10 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; @@ -33,6 +36,8 @@ import static org.mockito.Mockito.when; import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntime; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException; +import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeContext; import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerExecContext; import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerReapContext; import org.slf4j.Logger; @@ -41,6 +46,7 @@ import java.io.File; import java.io.FileOutputStream; import java.io.IOException; +import java.io.InterruptedIOException; import java.io.PrintWriter; import java.net.InetSocketAddress; import java.util.ArrayList; @@ -725,6 +731,44 @@ public void testGetLocalResources() throws Exception { verify(lce, times(1)).getLocalResources(container); } + @Test + public void testSignalContainerFailureWhenExitCodeIsPresentInTheException() + throws ContainerExecutionException { + LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class); + LinuxContainerExecutor containerExecutor = spy(new LinuxContainerExecutor( + containerRuntime)); + ContainerSignalContext signalContext = new ContainerSignalContext.Builder().build(); + ContainerExecutionException testException = + new ContainerExecutionException("exceptionWithExitCode", 123); + + doNothing().when(containerExecutor).verifyUsernamePattern(any()); + doThrow(testException) + .when(containerRuntime) + .signalContainer(any(ContainerRuntimeContext.class)); + + assertThrows(IOException.class, + () -> containerExecutor.signalContainer(signalContext)); + } + + @Test + public void testSignalContainerFailureWhenExitCodeIsNotPresentInTheException() + throws ContainerExecutionException { + LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class); + LinuxContainerExecutor containerExecutor = spy(new LinuxContainerExecutor( + containerRuntime)); + ContainerSignalContext signalContext = new ContainerSignalContext.Builder().build(); + ContainerExecutionException testException = + new ContainerExecutionException("exceptionWithoutExitCode"); + + doNothing().when(containerExecutor).verifyUsernamePattern(any()); + doThrow(testException) + .when(containerRuntime) + .signalContainer(any(ContainerRuntimeContext.class)); + + assertThrows(InterruptedIOException.class, + () -> containerExecutor.signalContainer(signalContext)); + } + @Deprecated private static class TestResourceHandler implements LCEResourcesHandler { static Set postExecContainers = new HashSet(); From 6ed8f29c1501afd65a2439a40034fdcb79917aba Mon Sep 17 00:00:00 2001 From: Peter Szucs Date: Thu, 20 Jul 2023 17:43:59 +0200 Subject: [PATCH 3/4] YARN-11534. Code improvements Change-Id: Idce41dfff72f0f209683a75b5743672446288f91 --- .../yarn/server/nodemanager/LinuxContainerExecutor.java | 2 +- .../containermanager/runtime/ContainerExecutionException.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java index 77ae9e2b1ad51..a28a6fc4110e9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java @@ -791,7 +791,7 @@ public boolean signalContainer(ContainerSignalContext ctx) // In ContainerExecutionException -1 is the default value for the exit code. // If it remained unset, we can treat the signalling as interrupted. - if (retCode == -1) { + if (retCode == ContainerExecutionException.getDefaultExitCode()) { throw new InterruptedIOException("Signalling container " + pid + " with " + signal + " is interrupted; output: " + e.getOutput() + " and exitCode: " + retCode); diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java index 735db1f608299..77d63b2b0b208 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java @@ -87,5 +87,9 @@ public String getOutput() { public String getErrorOutput() { return errorOutput; } + + public static int getDefaultExitCode() { + return EXIT_CODE_UNSET; + } } \ No newline at end of file From f64a9234791cf78eac94c1eaefa705cf5a481c23 Mon Sep 17 00:00:00 2001 From: Peter Szucs Date: Fri, 21 Jul 2023 08:48:32 +0200 Subject: [PATCH 4/4] YARN-11534. Removed blank line endings Change-Id: I4933b6102a3bd283206ce9980ad037e750eef4a7 --- .../containermanager/runtime/ContainerExecutionException.java | 2 +- .../yarn/server/nodemanager/TestLinuxContainerExecutor.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java index 77d63b2b0b208..dbb2b5138f4f5 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java @@ -87,7 +87,7 @@ public String getOutput() { public String getErrorOutput() { return errorOutput; } - + public static int getDefaultExitCode() { return EXIT_CODE_UNSET; } diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java index e506b71fed7eb..69e7b3bc008aa 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java @@ -759,12 +759,12 @@ public void testSignalContainerFailureWhenExitCodeIsNotPresentInTheException() ContainerSignalContext signalContext = new ContainerSignalContext.Builder().build(); ContainerExecutionException testException = new ContainerExecutionException("exceptionWithoutExitCode"); - + doNothing().when(containerExecutor).verifyUsernamePattern(any()); doThrow(testException) .when(containerRuntime) .signalContainer(any(ContainerRuntimeContext.class)); - + assertThrows(InterruptedIOException.class, () -> containerExecutor.signalContainer(signalContext)); }