Skip to content

Commit

Permalink
Merge pull request #1190 from jglick/retry-JENKINS-49707-base
Browse files Browse the repository at this point in the history
  • Loading branch information
Vlatombe authored Jul 13, 2022
2 parents 908e7db + c38fe0f commit 1c1e0ec
Show file tree
Hide file tree
Showing 11 changed files with 348 additions and 49 deletions.
29 changes: 10 additions & 19 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>plugin</artifactId>
<version>4.40</version>
<version>4.42</version>
<relativePath />
</parent>

Expand Down Expand Up @@ -45,7 +45,7 @@
<connectorHost />
<jenkins.host.address />
<slaveAgentPort />
<jenkins.version>2.303.3</jenkins.version>
<jenkins.version>2.332.1</jenkins.version>
<no-test-jar>false</no-test-jar>
<useBeta>true</useBeta>
<gitHubRepo>jenkinsci/${project.artifactId}-plugin</gitHubRepo>
Expand Down Expand Up @@ -124,7 +124,6 @@
<dependency>
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>metrics</artifactId>
<version>4.0.2.8.1</version>
</dependency>
<dependency>
<groupId>io.jenkins.plugins</groupId>
Expand All @@ -134,6 +133,11 @@
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>credentials-binding</artifactId>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-durable-task-step</artifactId>
<version>1174.v73a_9a_17edce0</version> <!-- TODO until in BOM -->
</dependency>

<!-- for testing -->
<dependency>
Expand All @@ -146,11 +150,6 @@
<artifactId>workflow-basic-steps</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-durable-task-step</artifactId>
<scope>test</scope>
</dependency>
<dependency> <!-- SemaphoreStep -->
<groupId>org.jenkins-ci.plugins.workflow</groupId>
<artifactId>workflow-support</artifactId>
Expand Down Expand Up @@ -194,15 +193,7 @@
<dependency>
<groupId>org.jenkins-ci.plugins</groupId>
<artifactId>ssh-agent</artifactId>
<version>1.23</version>
<scope>test</scope>
<exclusions>
<!-- conflict with sshd module -->
<exclusion>
<groupId>org.apache.sshd</groupId>
<artifactId>sshd-core</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.jenkins-ci.plugins</groupId>
Expand Down Expand Up @@ -261,8 +252,8 @@
<dependencies>
<dependency>
<groupId>io.jenkins.tools.bom</groupId>
<artifactId>bom-2.303.x</artifactId>
<version>1090.v0a_33df40457a_</version>
<artifactId>bom-2.332.x</artifactId>
<version>1478.v81d3dc4f9a_43</version>
<scope>import</scope>
<type>pom</type>
</dependency>
Expand All @@ -281,7 +272,7 @@
<dependency><!-- pipeline-model-extensions vs. io.jenkins.configuration-as-code:test-harness -->
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.10.2</version>
<version>2.10.5</version>
</dependency>
<dependency><!-- io.jenkins:configuration-as-code vs. org.jenkins-ci.plugins:junit -->
<groupId>org.apache.commons</groupId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright 2021 CloudBees, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.csanchez.jenkins.plugins.kubernetes.pipeline;

import hudson.Extension;
import hudson.ExtensionList;
import hudson.model.Node;
import hudson.model.labels.LabelAtom;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Logger;
import jenkins.model.Jenkins;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave;
import org.csanchez.jenkins.plugins.kubernetes.pod.retention.Reaper;
import org.jenkinsci.Symbol;
import org.jenkinsci.plugins.workflow.actions.ErrorAction;
import org.jenkinsci.plugins.workflow.actions.WorkspaceAction;
import org.jenkinsci.plugins.workflow.flow.ErrorCondition;
import org.jenkinsci.plugins.workflow.flow.FlowExecution;
import org.jenkinsci.plugins.workflow.graph.BlockEndNode;
import org.jenkinsci.plugins.workflow.graph.FlowNode;
import org.jenkinsci.plugins.workflow.graphanalysis.LinearBlockHoppingScanner;
import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException;
import org.jenkinsci.plugins.workflow.steps.StepContext;
import org.jenkinsci.plugins.workflow.support.steps.AgentErrorCondition;
import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution;
import org.kohsuke.stapler.DataBoundConstructor;
import org.kohsuke.stapler.DataBoundSetter;

/**
 * Qualifies {@code node} blocks associated with {@link KubernetesSlave} to be retried if the node was deleted.
 * A more specific version of {@link AgentErrorCondition}.
 *
 * <p>Whenever the failure cannot be tied to a Kubernetes agent (no context, no error origin,
 * no workspace found, or the workspace belonged to some other kind of node), the verdict is the value of
 * {@link #isHandleNonKubernetes}.
 */
public class KubernetesAgentErrorCondition extends ErrorCondition {

    private static final Logger LOGGER = Logger.getLogger(KubernetesAgentErrorCondition.class.getName());

    /**
     * Termination reasons, as reported by {@link Reaper#terminationReasons}, for which {@link #test}
     * answers {@code false}. Unmodifiable so that the shared constant cannot be altered after class
     * initialization.
     */
    private static final Set<String> IGNORED_CONTAINER_TERMINATION_REASONS = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList("OOMKilled", "Completed", "DeadlineExceeded")));

    /** Verdict to return when the error cannot be attributed to a Kubernetes agent. */
    private boolean handleNonKubernetes;

    @DataBoundConstructor public KubernetesAgentErrorCondition() {}

    /**
     * @return the verdict {@link #test} gives for errors it cannot attribute to a Kubernetes agent
     */
    public boolean isHandleNonKubernetes() {
        return handleNonKubernetes;
    }

    /**
     * @param handleNonKubernetes the verdict {@link #test} should give for errors it cannot attribute
     *                            to a Kubernetes agent
     */
    @DataBoundSetter public void setHandleNonKubernetes(boolean handleNonKubernetes) {
        this.handleNonKubernetes = handleNonKubernetes;
    }

    /**
     * Decides whether the given error matches this condition.
     *
     * <p>The error must first be accepted by {@link AgentErrorCondition}. Its origin node is then located and
     * the flow graph is scanned from there for the first node carrying a {@link WorkspaceAction}. If that
     * workspace belonged to a Kubernetes agent, the result is {@code true} unless one of the termination
     * reasons recorded by {@link Reaper} for that agent is in the ignored set.
     *
     * @param t the error being examined
     * @param context the step context, or {@code null} if unavailable
     * @return whether the error matches; {@link #isHandleNonKubernetes} when no Kubernetes agent can be identified
     */
    @Override
    public boolean test(Throwable t, StepContext context) throws IOException, InterruptedException {
        if (context == null) {
            LOGGER.fine("Cannot check error without context");
            return handleNonKubernetes;
        }
        if (!new AgentErrorCondition().test(t, context)) {
            LOGGER.fine(() -> "Not a recognized failure: " + t);
            return false;
        }
        FlowNode _origin = ErrorAction.findOrigin(t, context.get(FlowExecution.class));
        if (_origin == null) {
            LOGGER.fine(() -> "No recognized origin of error: " + t);
            return handleNonKubernetes;
        }
        // Start scanning from the beginning of the block when the error was attached to its end node.
        FlowNode origin = _origin instanceof BlockEndNode ? ((BlockEndNode<?>) _origin).getStartNode() : _origin;
        LOGGER.fine(() -> "Found origin " + origin + " " + origin.getDisplayFunctionName());
        LinearBlockHoppingScanner scanner = new LinearBlockHoppingScanner();
        scanner.setup(origin);
        for (FlowNode callStack : scanner) {
            WorkspaceAction ws = callStack.getPersistentAction(WorkspaceAction.class);
            if (ws == null) {
                continue;
            }
            // Only the first workspace encountered is considered; every path below returns.
            String node = ws.getNode();
            if (!wasKubernetesAgent(node, ws)) {
                return handleNonKubernetes;
            }
            Set<String> terminationReasons = ExtensionList.lookupSingleton(Reaper.class).terminationReasons(node);
            if (terminationReasons.stream().anyMatch(IGNORED_CONTAINER_TERMINATION_REASONS::contains)) {
                LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons);
                return false;
            }
            LOGGER.fine(() -> "active on " + node + " (termination reasons: " + terminationReasons + ")");
            return true;
        }
        LOGGER.fine(() -> "found no WorkspaceAction starting from " + origin);
        return handleNonKubernetes;
    }

    /**
     * Determines whether the workspace was allocated on a Kubernetes agent.
     * If the node still exists its type is checked directly; otherwise the labels recorded on the
     * workspace are matched against the pod templates of the configured Kubernetes clouds.
     */
    private static boolean wasKubernetesAgent(String node, WorkspaceAction ws) {
        Jenkins jenkins = Jenkins.get();
        Node n = jenkins.getNode(node);
        if (n != null) {
            if (n instanceof KubernetesSlave) {
                return true;
            }
            LOGGER.fine(() -> node + " was not a K8s agent");
            return false;
        }
        // May have been removed already, but we can look up the labels to see what it was.
        Set<LabelAtom> labels = ws.getLabels();
        boolean templateFound = labels.stream().anyMatch(l -> jenkins.clouds.stream().anyMatch(
                c -> c instanceof KubernetesCloud && ((KubernetesCloud) c).getTemplate(l) != null));
        if (!templateFound) {
            LOGGER.fine(() -> node + " was not a K8s agent judging by " + labels);
        }
        return templateFound;
    }

    @Symbol("kubernetesAgent")
    @Extension public static final class DescriptorImpl extends ErrorConditionDescriptor {

        @Override public String getDisplayName() {
            return "Kubernetes agent errors";
        }

    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@

package org.csanchez.jenkins.plugins.kubernetes.pod.retention;

import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import edu.umd.cs.findbugs.annotations.CheckForNull;
import edu.umd.cs.findbugs.annotations.NonNull;
import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
import hudson.Extension;
import hudson.ExtensionList;
import hudson.ExtensionPoint;
Expand All @@ -32,20 +35,36 @@
import hudson.security.ACLContext;
import hudson.slaves.ComputerListener;
import hudson.slaves.EphemeralNode;
import io.fabric8.kubernetes.api.model.*;
import io.fabric8.kubernetes.api.model.ContainerStateTerminated;
import io.fabric8.kubernetes.api.model.ContainerStateWaiting;
import io.fabric8.kubernetes.api.model.ContainerStatus;
import io.fabric8.kubernetes.api.model.Pod;
import io.fabric8.kubernetes.api.model.PodStatus;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.Watch;
import io.fabric8.kubernetes.client.Watcher;
import io.fabric8.kubernetes.client.WatcherException;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;

import io.fabric8.kubernetes.client.WatcherException;
import jenkins.model.Jenkins;
import org.csanchez.jenkins.plugins.kubernetes.*;
import jenkins.util.Timer;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesClientProvider;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave;
import org.csanchez.jenkins.plugins.kubernetes.PodUtils;
import org.jenkinsci.plugins.kubernetes.auth.KubernetesAuthException;

/**
Expand Down Expand Up @@ -85,10 +104,14 @@ public static Reaper getInstance() {

private final Map<String, CloudPodWatcher> watchers = new ConcurrentHashMap<>();

private final LoadingCache<String, Set<String>> terminationReasons = Caffeine.newBuilder().
expireAfterAccess(1, TimeUnit.DAYS).
build(k -> new ConcurrentSkipListSet<>());

@Override
public void onOnline(Computer c, TaskListener listener) throws IOException, InterruptedException {
public void preLaunch(Computer c, TaskListener taskListener) throws IOException, InterruptedException {
if (c instanceof KubernetesComputer) {
maybeActivate();
Timer.get().schedule(this::maybeActivate, 10, TimeUnit.SECONDS);

// ensure associated cloud is being watched. the watch may have been closed due to exception or
// failure to register on initial activation.
Expand Down Expand Up @@ -292,7 +315,7 @@ public void eventReceived(Action action, Pod pod) {

ExtensionList.lookup(Listener.class).forEach(listener -> { // TODO 2.324+ jenkins.util.Listeners
try {
listener.onEvent(action, optionalNode.get(), pod);
listener.onEvent(action, optionalNode.get(), pod, terminationReasons.get(optionalNode.get().getNodeName()));
} catch (Exception x) {
LOGGER.log(Level.WARNING, "Listener " + listener + " failed for " + ns + "/" + name, x);
}
Expand Down Expand Up @@ -328,6 +351,19 @@ public void onClose(WatcherException e) {
}
}

/**
 * Reports the reason(s), if any, that listeners recorded when terminating a node.
 * The caller receives its own copy of the recorded set.
 * @param node a {@link Node#getNodeName}
 * @return a possibly empty set of {@link ContainerStateTerminated#getReason} or {@link PodStatus#getReason}
 */
@SuppressFBWarnings(value = "NP_NULL_ON_SOME_PATH_FROM_RETURN_VALUE", justification = "Confused by @org.checkerframework.checker.nullness.qual.Nullable on LoadingCache.get? Never null here.")
@NonNull
public Set<String> terminationReasons(@NonNull String node) {
    final Set<String> snapshot = new HashSet<>();
    synchronized (terminationReasons) {
        // Copy while holding the monitor on the cache, then hand back the detached copy.
        snapshot.addAll(terminationReasons.get(node));
    }
    return snapshot;
}

/**
* Listener called when a Kubernetes event related to a Kubernetes agent happens.
*/
Expand All @@ -338,13 +374,13 @@ public interface Listener extends ExtensionPoint {
* @param node The affected node
* @param pod The affected pod
*/
void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException;
void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set<String> terminationReaons) throws IOException, InterruptedException;
}

@Extension
public static class RemoveAgentOnPodDeleted implements Listener {
@Override
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException {
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set<String> terminationReasons) throws IOException {
if (action != Watcher.Action.DELETED) {
return;
}
Expand All @@ -359,8 +395,9 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod

@Extension
public static class TerminateAgentOnContainerTerminated implements Listener {

@Override
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException {
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set<String> terminationReasons) throws IOException, InterruptedException {
if (action != Watcher.Action.MODIFIED) {
return;
}
Expand All @@ -372,7 +409,11 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod
terminatedContainers.forEach(c -> {
ContainerStateTerminated t = c.getState().getTerminated();
LOGGER.info(() -> ns + "/" + name + " Container " + c.getName() + " was just terminated, so removing the corresponding Jenkins agent");
runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), t.getReason());
String reason = t.getReason();
runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), reason);
if (reason != null) {
terminationReasons.add(reason);
}
});
logLastLinesThenTerminateNode(node, pod, runListener);
try (ACLContext _ = ACL.as(ACL.SYSTEM)) {
Expand All @@ -385,16 +426,20 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod
@Extension
public static class TerminateAgentOnPodFailed implements Listener {
@Override
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException {
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set<String> terminationReasons) throws IOException, InterruptedException {
if (action != Watcher.Action.MODIFIED) {
return;
}
if ("Failed".equals(pod.getStatus().getPhase())) {
String ns = pod.getMetadata().getNamespace();
String name = pod.getMetadata().getName();
TaskListener runListener = node.getTemplate().getListener();
LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + pod.getStatus().getReason() + ", Message: " + pod.getStatus().getMessage());
runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, pod.getStatus().getReason(), pod.getStatus().getMessage());
String reason = pod.getStatus().getReason();
LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + reason + ", Message: " + pod.getStatus().getMessage());
runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, reason, pod.getStatus().getMessage());
if (reason != null) {
terminationReasons.add(reason);
}
logLastLinesThenTerminateNode(node, pod, runListener);
}
}
Expand All @@ -417,7 +462,7 @@ private static void logLastLinesThenTerminateNode(KubernetesSlave node, Pod pod,
public static class TerminateAgentOnImagePullBackOff implements Listener {

@Override
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException {
public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set<String> terminationReasons) throws IOException, InterruptedException {
if (action != Watcher.Action.MODIFIED) {
return;
}
Expand All @@ -433,6 +478,7 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod
TaskListener runListener = node.getTemplate().getListener();
runListener.error("Unable to pull Docker image \""+cs.getImage()+"\". Check if image tag name is spelled correctly.");
});
terminationReasons.add("ImagePullBackOff");
try (ACLContext _ = ACL.as(ACL.SYSTEM)) {
PodUtils.cancelQueueItemFor(pod, "ImagePullBackOff");
}
Expand Down
Loading

0 comments on commit 1c1e0ec

Please sign in to comment.