-
-
Notifications
You must be signed in to change notification settings - Fork 9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JENKINS-46680] Disconnect computer on ping timeout #3005
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
*/ | ||
package hudson.slaves; | ||
|
||
import com.google.common.annotations.VisibleForTesting; | ||
import hudson.Extension; | ||
import hudson.FilePath; | ||
import jenkins.util.SystemProperties; | ||
|
@@ -34,6 +35,7 @@ | |
import jenkins.security.MasterToSlaveCallable; | ||
import jenkins.slaves.PingFailureAnalyzer; | ||
|
||
import javax.annotation.CheckForNull; | ||
import java.io.IOException; | ||
import java.util.concurrent.atomic.AtomicBoolean; | ||
import java.util.logging.Level; | ||
|
@@ -86,28 +88,38 @@ public ChannelPinger() { | |
|
||
@Override | ||
public void preOnline(Computer c, Channel channel, FilePath root, TaskListener listener) { | ||
install(channel); | ||
SlaveComputer slaveComputer = null; | ||
if (c instanceof SlaveComputer) { | ||
slaveComputer = (SlaveComputer) c; | ||
} | ||
install(channel, slaveComputer); | ||
} | ||
|
||
public void install(Channel channel) { | ||
install(channel, null); | ||
} | ||
|
||
@VisibleForTesting | ||
/*package*/ void install(Channel channel, @CheckForNull SlaveComputer c) { | ||
if (pingTimeoutSeconds < 1 || pingIntervalSeconds < 1) { | ||
LOGGER.warning("Agent ping is disabled"); | ||
return; | ||
} | ||
|
||
// set up ping from both directions, so that in case of a router dropping a connection, | ||
// both sides can notice it and take compensation actions. | ||
try { | ||
channel.call(new SetUpRemotePing(pingTimeoutSeconds, pingIntervalSeconds)); | ||
LOGGER.fine("Set up a remote ping for " + channel.getName()); | ||
} catch (Exception e) { | ||
LOGGER.severe("Failed to set up a ping for " + channel.getName()); | ||
LOGGER.log(Level.SEVERE, "Failed to set up a ping for " + channel.getName(), e); | ||
} | ||
|
||
// set up ping from both directions, so that in case of a router dropping a connection, | ||
// both sides can notice it and take compensation actions. | ||
setUpPingForChannel(channel, pingTimeoutSeconds, pingIntervalSeconds, true); | ||
setUpPingForChannel(channel, c, pingTimeoutSeconds, pingIntervalSeconds, true); | ||
} | ||
|
||
static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> { | ||
@VisibleForTesting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never used in jenkinsci group. |
||
/*package*/ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> { | ||
private static final long serialVersionUID = -2702219700841759872L; | ||
@Deprecated | ||
private transient int pingInterval; | ||
|
@@ -121,7 +133,7 @@ static class SetUpRemotePing extends MasterToSlaveCallable<Void, IOException> { | |
|
||
@Override | ||
public Void call() throws IOException { | ||
setUpPingForChannel(Channel.current(), pingTimeoutSeconds, pingIntervalSeconds, false); | ||
setUpPingForChannel(Channel.current(), null, pingTimeoutSeconds, pingIntervalSeconds, false); | ||
return null; | ||
} | ||
|
||
|
@@ -163,30 +175,36 @@ protected Object readResolve() { | |
} | ||
} | ||
|
||
static void setUpPingForChannel(final Channel channel, int timeoutSeconds, int intervalSeconds, final boolean analysis) { | ||
@VisibleForTesting | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never used in jenkinsci group. |
||
/*package*/ static void setUpPingForChannel(final Channel channel, final SlaveComputer computer, int timeoutSeconds, int intervalSeconds, final boolean analysis) { | ||
LOGGER.log(Level.FINE, "setting up ping on {0} with a {1} seconds interval and {2} seconds timeout", new Object[] {channel.getName(), intervalSeconds, timeoutSeconds}); | ||
final AtomicBoolean isInClosed = new AtomicBoolean(false); | ||
final PingThread t = new PingThread(channel, timeoutSeconds * 1000L, intervalSeconds * 1000L) { | ||
@Override | ||
protected void onDead(Throwable cause) { | ||
try { | ||
if (analysis) { | ||
analyze(cause); | ||
} | ||
if (isInClosed.get()) { | ||
boolean inClosed = isInClosed.get(); | ||
// Disassociate computer channel before closing it | ||
if (computer != null) { | ||
Exception exception = cause instanceof Exception ? (Exception) cause: new IOException(cause); | ||
computer.disconnect(new OfflineCause.ChannelTermination(exception)); | ||
} | ||
if (inClosed) { | ||
LOGGER.log(Level.FINE,"Ping failed after the channel "+channel.getName()+" is already partially closed.",cause); | ||
} else { | ||
LOGGER.log(Level.INFO,"Ping failed. Terminating the channel "+channel.getName()+".",cause); | ||
channel.close(cause); | ||
} | ||
} catch (IOException e) { | ||
LOGGER.log(Level.SEVERE,"Failed to terminate the channel "+channel.getName(),e); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The close is now done in |
||
} | ||
} | ||
/** Keep in a separate method so we do not even try to do class loading on {@link PingFailureAnalyzer} from an agent JVM. */ | ||
private void analyze(Throwable cause) throws IOException { | ||
private void analyze(Throwable cause) { | ||
for (PingFailureAnalyzer pfa : PingFailureAnalyzer.all()) { | ||
pfa.onPingFailure(channel,cause); | ||
try { | ||
pfa.onPingFailure(channel, cause); | ||
} catch (IOException ex) { | ||
LOGGER.log(Level.WARNING, "Ping failure analyzer " + pfa.getClass().getName() + " failed for " + channel.getName(), ex); | ||
} | ||
} | ||
} | ||
@Deprecated | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) Red Hat, Inc. | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
package hudson.slaves; | ||
|
||
import hudson.Functions; | ||
import hudson.model.Computer; | ||
import hudson.remoting.Channel; | ||
import hudson.remoting.ChannelClosedException; | ||
import hudson.remoting.PingThread; | ||
import jenkins.security.MasterToSlaveCallable; | ||
import org.junit.Rule; | ||
import org.junit.Test; | ||
import org.jvnet.hudson.test.JenkinsRule; | ||
|
||
import java.io.IOException; | ||
import java.lang.management.ManagementFactory; | ||
import java.lang.reflect.Method; | ||
import java.util.Date; | ||
import java.util.concurrent.TimeoutException; | ||
|
||
import static org.junit.Assert.assertNotNull; | ||
import static org.junit.Assert.assertNull; | ||
import static org.junit.Assert.fail; | ||
import static org.junit.Assume.assumeFalse; | ||
|
||
/** | ||
* @author ogondza. | ||
*/ | ||
public class PingThreadTest { | ||
|
||
@Rule | ||
public JenkinsRule j = new JenkinsRule(); | ||
|
||
@Test | ||
public void failedPingThreadResetsComputerChannel() throws Exception { | ||
assumeFalse("We simulate hung agent by sending the SIGTSTP signal", Functions.isWindows()); | ||
|
||
DumbSlave slave = j.createOnlineSlave(); | ||
Computer computer = slave.toComputer(); | ||
Channel channel = (Channel) slave.getChannel(); | ||
String pid = channel.call(new GetPid()); | ||
|
||
PingThread pingThread = null; | ||
for (Thread it: Thread.getAllStackTraces().keySet()) { | ||
if (it instanceof PingThread && it.getName().endsWith(channel.toString())) { | ||
pingThread = (PingThread) it; | ||
} | ||
} | ||
assertNotNull(pingThread); | ||
|
||
// Simulate lost connection | ||
assert new ProcessBuilder("kill", "-TSTP", pid).start().waitFor() == 0; | ||
try { | ||
// ... do not wait for Ping Thread to notice | ||
Method onDead = PingThread.class.getDeclaredMethod("onDead", Throwable.class); | ||
onDead.setAccessible(true); | ||
onDead.invoke(pingThread, new TimeoutException("No ping")); | ||
|
||
try { | ||
channel.call(new GetPid()); | ||
fail(); | ||
} catch (ChannelClosedException ex) { | ||
// Expected | ||
} | ||
|
||
assertNull(slave.getComputer().getChannel()); | ||
assertNull(computer.getChannel()); | ||
} finally { | ||
assert new ProcessBuilder("kill", "-CONT", pid).start().waitFor() == 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @olivergondza this seems to be flaky: #3961 (comment) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This comment remained unacknowledged until #7149 fixed the flaky test 3 years, 5 months, and 18 days after the flakiness was first reported. |
||
} | ||
} | ||
|
||
private static final class GetPid extends MasterToSlaveCallable<String, IOException> { | ||
@Override public String call() throws IOException { | ||
return ManagementFactory.getRuntimeMXBean().getName().replaceAll("@.*", ""); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Never used in jenkinsci group.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I confirm it's being used in one of private-source plugins. I would not deprecate the old method since there is no replacement in public API
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. I am just wondering, is it used for
SlaveComputer
channel from master's side so it can be migrated to new API?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nope, it's another Remoting channel type. In new API it would be possible to pass
null
for sure (would not make it worse), but our use-case would probably require a special callback support in API. Injenkinsci
org there is a similar use-case in Maven Plugin, where we could need a custom Remoting connection termination logic as well, e.g. force termination of the build (CC @aheritier)We could do it API for sure, but IMHO it's out of the scope of this PR. I would rather prefer t keep it in the current state (just without API deprecation) so we could easily backport it to 2.73.x.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alright, so let's un-deprecate the old call and keep the new one private so we can give the API design a proper though later.