Skip to content

Commit 06b0cd6

Browse files
committed
[JENKINS-46680] Disconnect computer on ping timeout (#3005)
* [JENKINS-46680] Reproduce in unittest * [FIX JENKINS-46680] Reset SlaveComputer channel before closing it on ping timeout * [JENKINS-46680] Attach channel termination offline cause on ping timeouts (cherry picked from commit dbb5e44)
1 parent 8ec2510 commit 06b0cd6

File tree

3 files changed

+141
-24
lines changed

3 files changed

+141
-24
lines changed

core/src/main/java/hudson/slaves/ChannelPinger.java

+34-16
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
*/
2424
package hudson.slaves;
2525

26+
import com.google.common.annotations.VisibleForTesting;
2627
import hudson.Extension;
2728
import hudson.FilePath;
2829
import jenkins.util.SystemProperties;
@@ -34,6 +35,7 @@
3435
import jenkins.security.MasterToSlaveCallable;
3536
import jenkins.slaves.PingFailureAnalyzer;
3637

38+
import javax.annotation.CheckForNull;
3739
import java.io.IOException;
3840
import java.util.concurrent.atomic.AtomicBoolean;
3941
import java.util.logging.Level;
@@ -86,28 +88,38 @@ public ChannelPinger() {
8688

8789
@Override
8890
public void preOnline(Computer c, Channel channel, FilePath root, TaskListener listener) {
89-
install(channel);
91+
SlaveComputer slaveComputer = null;
92+
if (c instanceof SlaveComputer) {
93+
slaveComputer = (SlaveComputer) c;
94+
}
95+
install(channel, slaveComputer);
9096
}
9197

9298
public void install(Channel channel) {
99+
install(channel, null);
100+
}
101+
102+
@VisibleForTesting
103+
/*package*/ void install(Channel channel, @CheckForNull SlaveComputer c) {
93104
if (pingTimeoutSeconds < 1 || pingIntervalSeconds < 1) {
94105
LOGGER.warning("Agent ping is disabled");
95106
return;
96107
}
97108

109+
// set up ping from both directions, so that in case of a router dropping a connection,
110+
// both sides can notice it and take compensation actions.
98111
try {
99112
channel.call(new SetUpRemotePing(pingTimeoutSeconds, pingIntervalSeconds));
100113
LOGGER.fine("Set up a remote ping for " + channel.getName());
101114
} catch (Exception e) {
102-
LOGGER.severe("Failed to set up a ping for " + channel.getName());
115+
LOGGER.log(Level.SEVERE, "Failed to set up a ping for " + channel.getName(), e);
103116
}
104117

105-
// set up ping from both directions, so that in case of a router dropping a connection,
106-
// both sides can notice it and take compensation actions.
107-
setUpPingForChannel(channel, pingTimeoutSeconds, pingIntervalSeconds, true);
118+
setUpPingForChannel(channel, c, pingTimeoutSeconds, pingIntervalSeconds, true);
108119
}
109120

110-
static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {
121+
@VisibleForTesting
122+
/*package*/ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {
111123
private static final long serialVersionUID = -2702219700841759872L;
112124
@Deprecated
113125
private transient int pingInterval;
@@ -121,7 +133,7 @@ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> {
121133

122134
@Override
123135
public Void call() throws IOException {
124-
setUpPingForChannel(Channel.current(), pingTimeoutSeconds, pingIntervalSeconds, false);
136+
setUpPingForChannel(Channel.current(), null, pingTimeoutSeconds, pingIntervalSeconds, false);
125137
return null;
126138
}
127139

@@ -163,30 +175,36 @@ protected Object readResolve() {
163175
}
164176
}
165177

166-
static void setUpPingForChannel(final Channel channel, int timeoutSeconds, int intervalSeconds, final boolean analysis) {
178+
@VisibleForTesting
179+
/*package*/ static void setUpPingForChannel(final Channel channel, final SlaveComputer computer, int timeoutSeconds, int intervalSeconds, final boolean analysis) {
167180
LOGGER.log(Level.FINE, "setting up ping on {0} with a {1} seconds interval and {2} seconds timeout", new Object[] {channel.getName(), intervalSeconds, timeoutSeconds});
168181
final AtomicBoolean isInClosed = new AtomicBoolean(false);
169182
final PingThread t = new PingThread(channel, timeoutSeconds * 1000L, intervalSeconds * 1000L) {
170183
@Override
171184
protected void onDead(Throwable cause) {
172-
try {
173185
if (analysis) {
174186
analyze(cause);
175187
}
176-
if (isInClosed.get()) {
188+
boolean inClosed = isInClosed.get();
189+
// Disassociate computer channel before closing it
190+
if (computer != null) {
191+
Exception exception = cause instanceof Exception ? (Exception) cause: new IOException(cause);
192+
computer.disconnect(new OfflineCause.ChannelTermination(exception));
193+
}
194+
if (inClosed) {
177195
LOGGER.log(Level.FINE,"Ping failed after the channel "+channel.getName()+" is already partially closed.",cause);
178196
} else {
179197
LOGGER.log(Level.INFO,"Ping failed. Terminating the channel "+channel.getName()+".",cause);
180-
channel.close(cause);
181198
}
182-
} catch (IOException e) {
183-
LOGGER.log(Level.SEVERE,"Failed to terminate the channel "+channel.getName(),e);
184-
}
185199
}
186200
/** Keep in a separate method so we do not even try to do class loading on {@link PingFailureAnalyzer} from an agent JVM. */
187-
private void analyze(Throwable cause) throws IOException {
201+
private void analyze(Throwable cause) {
188202
for (PingFailureAnalyzer pfa : PingFailureAnalyzer.all()) {
189-
pfa.onPingFailure(channel,cause);
203+
try {
204+
pfa.onPingFailure(channel, cause);
205+
} catch (IOException ex) {
206+
LOGGER.log(Level.WARNING, "Ping failure analyzer " + pfa.getClass().getName() + " failed for " + channel.getName(), ex);
207+
}
190208
}
191209
}
192210
@Deprecated

core/src/test/java/hudson/slaves/ChannelPingerTest.java

+8-8
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,12 @@ private void preserveSystemProperty(String propertyName) {
5959
@Test
6060
public void testDefaults() throws Exception {
6161
ChannelPinger channelPinger = new ChannelPinger();
62-
channelPinger.install(mockChannel);
62+
channelPinger.install(mockChannel, null);
6363

6464
verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
6565
ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT)));
6666
verifyStatic();
67-
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
67+
ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT,
6868
ChannelPinger.PING_INTERVAL_SECONDS_DEFAULT, true);
6969
}
7070

@@ -74,23 +74,23 @@ public void testFromSystemProperties() throws Exception {
7474
System.setProperty("hudson.slaves.ChannelPinger.pingIntervalSeconds", "73");
7575

7676
ChannelPinger channelPinger = new ChannelPinger();
77-
channelPinger.install(mockChannel);
77+
channelPinger.install(mockChannel, null);
7878

7979
verify(mockChannel).call(new ChannelPinger.SetUpRemotePing(42, 73));
8080
verifyStatic();
81-
ChannelPinger.setUpPingForChannel(mockChannel, 42, 73, true);
81+
ChannelPinger.setUpPingForChannel(mockChannel, null, 42, 73, true);
8282
}
8383

8484
@Test
8585
public void testFromOldSystemProperty() throws Exception {
8686
System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7");
8787

8888
ChannelPinger channelPinger = new ChannelPinger();
89-
channelPinger.install(mockChannel);
89+
channelPinger.install(mockChannel, null);
9090

9191
verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420)));
9292
verifyStatic();
93-
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420, true);
93+
ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 420, true);
9494
}
9595

9696
@Test
@@ -99,11 +99,11 @@ public void testNewSystemPropertyTrumpsOld() throws Exception {
9999
System.setProperty("hudson.slaves.ChannelPinger.pingInterval", "7");
100100

101101
ChannelPinger channelPinger = new ChannelPinger();
102-
channelPinger.install(mockChannel);
102+
channelPinger.install(mockChannel, null);
103103

104104
verify(mockChannel).call(eq(new ChannelPinger.SetUpRemotePing(ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73)));
105105
verifyStatic();
106-
ChannelPinger.setUpPingForChannel(mockChannel, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73, true);
106+
ChannelPinger.setUpPingForChannel(mockChannel, null, ChannelPinger.PING_TIMEOUT_SECONDS_DEFAULT, 73, true);
107107
}
108108

109109
@Test
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright (c) Red Hat, Inc.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
package hudson.slaves;
25+
26+
import hudson.Functions;
27+
import hudson.model.Computer;
28+
import hudson.remoting.Channel;
29+
import hudson.remoting.ChannelClosedException;
30+
import hudson.remoting.PingThread;
31+
import jenkins.security.MasterToSlaveCallable;
32+
import org.junit.Rule;
33+
import org.junit.Test;
34+
import org.jvnet.hudson.test.JenkinsRule;
35+
36+
import java.io.IOException;
37+
import java.lang.management.ManagementFactory;
38+
import java.lang.reflect.Method;
39+
import java.util.Date;
40+
import java.util.concurrent.TimeoutException;
41+
42+
import static org.junit.Assert.assertNotNull;
43+
import static org.junit.Assert.assertNull;
44+
import static org.junit.Assert.fail;
45+
import static org.junit.Assume.assumeFalse;
46+
47+
/**
48+
* @author ogondza.
49+
*/
50+
public class PingThreadTest {
51+
52+
@Rule
53+
public JenkinsRule j = new JenkinsRule();
54+
55+
@Test
56+
public void failedPingThreadResetsComputerChannel() throws Exception {
57+
assumeFalse("We simulate hung agent by sending the SIGTSTP signal", Functions.isWindows());
58+
59+
DumbSlave slave = j.createOnlineSlave();
60+
Computer computer = slave.toComputer();
61+
Channel channel = (Channel) slave.getChannel();
62+
String pid = channel.call(new GetPid());
63+
64+
PingThread pingThread = null;
65+
for (Thread it: Thread.getAllStackTraces().keySet()) {
66+
if (it instanceof PingThread && it.getName().endsWith(channel.toString())) {
67+
pingThread = (PingThread) it;
68+
}
69+
}
70+
assertNotNull(pingThread);
71+
72+
// Simulate lost connection
73+
assert new ProcessBuilder("kill", "-TSTP", pid).start().waitFor() == 0;
74+
try {
75+
// ... do not wait for Ping Thread to notice
76+
Method onDead = PingThread.class.getDeclaredMethod("onDead", Throwable.class);
77+
onDead.setAccessible(true);
78+
onDead.invoke(pingThread, new TimeoutException("No ping"));
79+
80+
try {
81+
channel.call(new GetPid());
82+
fail();
83+
} catch (ChannelClosedException ex) {
84+
// Expected
85+
}
86+
87+
assertNull(slave.getComputer().getChannel());
88+
assertNull(computer.getChannel());
89+
} finally {
90+
assert new ProcessBuilder("kill", "-CONT", pid).start().waitFor() == 0;
91+
}
92+
}
93+
94+
private static final class GetPid extends MasterToSlaveCallable<String, IOException> {
95+
@Override public String call() throws IOException {
96+
return ManagementFactory.getRuntimeMXBean().getName().replaceAll("@.*", "");
97+
}
98+
}
99+
}

0 commit comments

Comments
 (0)