From 2e86829ee021784fec24ec3de1fce6d3391481c9 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Fri, 3 Jun 2022 16:48:21 -0400 Subject: [PATCH 1/8] [JENKINS-49707] Allow `node` blocks from deleted pods to be retried --- pom.xml | 20 +-- .../KubernetesAgentErrorCondition.java | 127 ++++++++++++++++++ .../kubernetes/pod/retention/Reaper.java | 62 +++++++-- .../AbstractKubernetesPipelineTest.java | 3 +- .../pipeline/KubernetesPipelineTest.java | 7 +- .../kubernetes/pipeline/terminatedPod.groovy | 19 ++- 6 files changed, 209 insertions(+), 29 deletions(-) create mode 100644 src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java diff --git a/pom.xml b/pom.xml index 66c27b3946..ad986a20c2 100644 --- a/pom.xml +++ b/pom.xml @@ -45,7 +45,7 @@ - 2.303.3 + 2.332.1 false true jenkinsci/${project.artifactId}-plugin @@ -104,6 +104,7 @@ org.jenkins-ci.plugins.workflow workflow-api + 1171.vc28585a_716e7 org.jenkinsci.plugins @@ -134,6 +135,11 @@ org.jenkins-ci.plugins credentials-binding + + org.jenkins-ci.plugins.workflow + workflow-durable-task-step + 1145.ve842533f4d9a_ + @@ -144,11 +150,7 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps - test - - - org.jenkins-ci.plugins.workflow - workflow-durable-task-step + 958.vde579c5541ed test @@ -249,8 +251,8 @@ io.jenkins.tools.bom - bom-2.303.x - 1090.v0a_33df40457a_ + bom-2.332.x + 1370.vfa_e23fe119c3 import pom @@ -269,7 +271,7 @@ joda-time joda-time - 2.10.2 + 2.10.5 org.apache.commons diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java new file mode 100644 index 0000000000..345ea013b1 --- /dev/null +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java @@ -0,0 +1,127 @@ +/* + * Copyright 2021 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.csanchez.jenkins.plugins.kubernetes.pipeline; + +import hudson.Extension; +import hudson.ExtensionList; +import hudson.model.Node; +import hudson.model.labels.LabelAtom; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; +import jenkins.model.Jenkins; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; +import org.csanchez.jenkins.plugins.kubernetes.pod.retention.Reaper; +import org.jenkinsci.Symbol; +import org.jenkinsci.plugins.workflow.actions.ErrorAction; +import org.jenkinsci.plugins.workflow.actions.WorkspaceAction; +import org.jenkinsci.plugins.workflow.flow.ErrorCondition; +import org.jenkinsci.plugins.workflow.flow.FlowExecution; +import org.jenkinsci.plugins.workflow.graph.BlockEndNode; +import org.jenkinsci.plugins.workflow.graph.FlowNode; +import org.jenkinsci.plugins.workflow.graphanalysis.LinearBlockHoppingScanner; +import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException; +import org.jenkinsci.plugins.workflow.steps.StepContext; +import org.jenkinsci.plugins.workflow.support.steps.AgentErrorCondition; +import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; +import org.kohsuke.stapler.DataBoundConstructor; + +/** + * Qualifies {@code node} blocks associated with {@link KubernetesSlave} to be retried if the node was deleted. + * A more specific version of {@link AgentErrorCondition}. + */ +public class KubernetesAgentErrorCondition extends ErrorCondition { + + private static final Logger LOGGER = Logger.getLogger(KubernetesAgentErrorCondition.class.getName()); + + private static final Set IGNORED_CONTAINER_TERMINATION_REASONS = new HashSet<>(); + static { + IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKilled"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("Completed"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("DeadlineExceeded"); + } + + @DataBoundConstructor public KubernetesAgentErrorCondition() {} + + @Override + public boolean test(Throwable t, StepContext context) throws IOException, InterruptedException { + if (context == null) { + LOGGER.fine("Cannot check error without context"); + return false; + } + if (!new AgentErrorCondition().test(t, context)) { + if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(ExecutorStepExecution.QueueTaskCancelled.class::isInstance)) { + LOGGER.fine(() -> "QueueTaskCancelled normally ignored by AgentErrorCondition but might be delivered here from Reaper.TerminateAgentOnContainerTerminated"); + // TODO cleaner to somehow suppress that QueueTaskCancelled and let the underlying RemovedNodeCause be delivered + // (or just let AgentErrorCondition trigger on QueueTaskCancelled) + } else { + LOGGER.fine(() -> "Not a recognized failure: " + t); + return false; + } + } + FlowNode _origin = ErrorAction.findOrigin(t, context.get(FlowExecution.class)); + if (_origin == null) { + LOGGER.fine(() -> "No recognized origin of error: " + t); + return false; + } + FlowNode origin = _origin instanceof BlockEndNode ? ((BlockEndNode) _origin).getStartNode() : _origin; + LOGGER.fine(() -> "Found origin " + origin + " " + origin.getDisplayFunctionName()); + LinearBlockHoppingScanner scanner = new LinearBlockHoppingScanner(); + scanner.setup(origin); + for (FlowNode callStack : scanner) { + WorkspaceAction ws = callStack.getPersistentAction(WorkspaceAction.class); + if (ws != null) { + String node = ws.getNode(); + Node n = Jenkins.get().getNode(node); + if (n != null) { + if (!(n instanceof KubernetesSlave)) { + LOGGER.fine(() -> node + " was not a K8s agent"); + return false; + } + } else { + // May have been removed already, but we can look up the labels to see what it was. + Set labels = ws.getLabels(); + if (labels.stream().noneMatch(l -> Jenkins.get().clouds.stream().anyMatch(c -> c instanceof KubernetesCloud && ((KubernetesCloud) c).getTemplate(l) != null))) { + LOGGER.fine(() -> node + " was not a K8s agent judging by " + labels); + return false; + } + } + Set terminationReasons = ExtensionList.lookupSingleton(Reaper.class).terminationReasons(node); + if (terminationReasons.stream().anyMatch(r -> IGNORED_CONTAINER_TERMINATION_REASONS.contains(r))) { + LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons); + return false; + } + LOGGER.fine(() -> "active on " + node + " (termination reasons: " + terminationReasons + ")"); + return true; + } + } + LOGGER.fine(() -> "found no WorkspaceAction starting from " + origin); + return false; + } + + @Symbol("kubernetesAgent") + @Extension public static final class DescriptorImpl extends ErrorConditionDescriptor { + + @Override public String getDisplayName() { + return "Kubernetes agent errors"; + } + + } + +} diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index 6c7fe64a8b..d513219606 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -16,7 +16,11 @@ package org.csanchez.jenkins.plugins.kubernetes.pod.retention; +import com.github.benmanes.caffeine.cache.CacheLoader; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; import edu.umd.cs.findbugs.annotations.NonNull; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import hudson.Extension; import hudson.ExtensionList; import hudson.ExtensionPoint; @@ -33,6 +37,7 @@ import io.fabric8.kubernetes.api.model.ContainerStateWaiting; import io.fabric8.kubernetes.api.model.ContainerStatus; import io.fabric8.kubernetes.api.model.Pod; +import io.fabric8.kubernetes.api.model.PodStatus; import io.fabric8.kubernetes.client.KubernetesClient; import io.fabric8.kubernetes.client.Watch; import io.fabric8.kubernetes.client.Watcher; @@ -46,7 +51,15 @@ import java.util.logging.Logger; import io.fabric8.kubernetes.client.WatcherException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentSkipListSet; +import java.util.concurrent.TimeUnit; import jenkins.model.Jenkins; +import jenkins.util.Timer; import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer; import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; @@ -90,10 +103,14 @@ public static Reaper getInstance() { private Watch watch; + private final LoadingCache> terminationReasons = Caffeine.newBuilder(). + expireAfterAccess(1, TimeUnit.DAYS). + build(k -> new ConcurrentSkipListSet<>()); + @Override - public void onOnline(Computer c, TaskListener listener) throws IOException, InterruptedException { + public void preLaunch(Computer c, TaskListener taskListener) throws IOException, InterruptedException { if (c instanceof KubernetesComputer) { - maybeActivate(); + Timer.get().schedule(this::maybeActivate, 10, TimeUnit.SECONDS); } } @@ -164,7 +181,7 @@ public void eventReceived(Watcher.Action action, Pod pod) { } ExtensionList.lookup(Listener.class).forEach(listener -> { try { - listener.onEvent(action, optionalNode.get(), pod); + listener.onEvent(action, optionalNode.get(), pod, terminationReasons.get(optionalNode.get().getNodeName())); } catch (Exception x) { LOGGER.log(Level.WARNING, "Listener " + listener + " failed for " + ns + "/" + name, x); } @@ -194,6 +211,19 @@ private void closeWatch() { } } + /** + * Get any reason(s) why a node was terminated by a listener. + * @param node a {@link Node#getNodeName} + * @return a possibly empty set of {@link ContainerStateTerminated#getReason} or {@link PodStatus#getReason} + */ + @SuppressFBWarnings(value = "NP_NULL_ON_SOME_PATH_FROM_RETURN_VALUE", justification = "Confused by @org.checkerframework.checker.nullness.qual.Nullable on LoadingCache.get? Never null here.") + @NonNull + public Set terminationReasons(@NonNull String node) { + synchronized (terminationReasons) { + return new HashSet<>(terminationReasons.get(node)); + } + } + /** * Listener called when a Kubernetes event related to a Kubernetes agent happens. */ @@ -204,13 +234,13 @@ public interface Listener extends ExtensionPoint { * @param node The affected node * @param pod The affected pod */ - void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException; + void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReaons) throws IOException, InterruptedException; } @Extension public static class RemoveAgentOnPodDeleted implements Listener { @Override - public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException { + public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException { if (action != Action.DELETED) { return; } @@ -225,8 +255,9 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod @Extension public static class TerminateAgentOnContainerTerminated implements Listener { + @Override - public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { + public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException, InterruptedException { if (action != Action.MODIFIED) { return; } @@ -238,7 +269,11 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN terminatedContainers.forEach(c -> { ContainerStateTerminated t = c.getState().getTerminated(); LOGGER.info(() -> ns + "/" + name + " Container " + c.getName() + " was just terminated, so removing the corresponding Jenkins agent"); - runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), t.getReason()); + String reason = t.getReason(); + runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), reason); + if (reason != null) { + terminationReasons.add(reason); + } }); try (ACLContext _ = ACL.as(ACL.SYSTEM)) { PodUtils.cancelQueueItemFor(pod, "ContainerError"); @@ -251,7 +286,7 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN @Extension public static class TerminateAgentOnPodFailed implements Listener { @Override - public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { + public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException, InterruptedException { if (action != Action.MODIFIED) { return; } @@ -259,8 +294,12 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN String ns = pod.getMetadata().getNamespace(); String name = pod.getMetadata().getName(); TaskListener runListener = node.getTemplate().getListener(); - LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + pod.getStatus().getReason() + ", Message: " + pod.getStatus().getMessage()); - runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, pod.getStatus().getReason(), pod.getStatus().getMessage()); + String reason = pod.getStatus().getReason(); + LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + reason + ", Message: " + pod.getStatus().getMessage()); + runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, reason, pod.getStatus().getMessage()); + if (reason != null) { + terminationReasons.add(reason); + } logLastLinesThenTerminateNode(node, pod, runListener); } } @@ -283,7 +322,7 @@ private static void logLastLinesThenTerminateNode(KubernetesSlave node, Pod pod, public static class TerminateAgentOnImagePullBackOff implements Listener { @Override - public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { + public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException, InterruptedException { List backOffContainers = PodUtils.getContainers(pod, cs -> { ContainerStateWaiting waiting = cs.getState().getWaiting(); return waiting != null && waiting.getMessage() != null && waiting.getMessage().contains("Back-off pulling image"); @@ -295,6 +334,7 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN TaskListener runListener = node.getTemplate().getListener(); runListener.error("Unable to pull Docker image \""+cs.getImage()+"\". Check if image tag name is spelled correctly."); }); + terminationReasons.add("ImagePullBackOff"); try (ACLContext _ = ACL.as(ACL.SYSTEM)) { PodUtils.cancelQueueItemFor(pod, "ImagePullBackOff"); } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java index 0e5113af96..81ab16ba1f 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java @@ -81,7 +81,8 @@ public abstract class AbstractKubernetesPipelineTest { public LoggerRule logs = new LoggerRule() .recordPackage(KubernetesCloud.class, Level.FINE) .recordPackage(NoDelayProvisionerStrategy.class, Level.FINE) - .record(NodeProvisioner.class, Level.FINE); + .record(NodeProvisioner.class, Level.FINE) + .record(KubernetesAgentErrorCondition.class, Level.FINE); @BeforeClass public static void isKubernetesConfigured() throws Exception { diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index 506fd2166e..d9cd29faaf 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -73,7 +73,6 @@ import org.hamcrest.MatcherAssert; import org.hamcrest.Matchers; import org.jenkinsci.plugins.workflow.job.WorkflowRun; -import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep; import org.junit.After; import org.junit.Before; @@ -472,12 +471,14 @@ public void runInPodWithRetention() throws Exception { @Issue("JENKINS-49707") @Test public void terminatedPod() throws Exception { + logs.record(KubernetesAgentErrorCondition.class, Level.FINE); r.waitForMessage("+ sleep", b); deletePods(cloud.connect(), getLabels(this, name), false); - r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); - r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); r.waitForMessage("busybox --", b); r.waitForMessage("jnlp --", b); + r.waitForMessage("was deleted; cancelling node body", b); + r.waitForMessage("Retrying", b); + r.assertBuildStatusSuccess(r.waitForCompletion(b)); } @Issue("JENKINS-59340") diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy index d757048650..73f1f453ee 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy @@ -1,10 +1,19 @@ -podTemplate(label: '$NAME', containers: [ - containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'), - ]) { - node ('$NAME') { +podTemplate(yaml: ''' +spec: + containers: + - name: busybox + image: busybox + command: + - sleep + - 99d + terminationGracePeriodSeconds: 3 +''') { + retry(count: 2, conditions: [kubernetesAgent()]) { + node(POD_LABEL) { container('busybox') { sh 'echo hello world' - sh 'sleep 9999999' + sh 'sleep 15' } } + } } From 68c2cc93bd50a17af500ce1ba477a52d965ef63c Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 7 Jun 2022 17:32:51 -0400 Subject: [PATCH 2/8] Help for `KubernetesAgentErrorCondition` --- .../pipeline/KubernetesAgentErrorCondition/help.html | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help.html diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help.html b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help.html new file mode 100644 index 0000000000..c1c6163408 --- /dev/null +++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help.html @@ -0,0 +1,6 @@ +
+ Similar to agent() (Agent errors) but tailored to agents provisioned from a Kubernetes cloud. + Unlike the generic agent error condition, + this will ignore certain pod termination reasons which are likely to be under the control of the Pipeline author (e.g., OOMKilled) + while still allowing retry to recover after common cases of pod deletion. +
From 55af8d4c8e6abafedca3b8d069cf76a30c28927d Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 13 Jun 2022 15:39:12 -0400 Subject: [PATCH 3/8] Bumps --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 92ac528330..28fd1c1fed 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ org.jenkins-ci.plugins.workflow workflow-api - 1171.vc28585a_716e7 + 1174.va_3d1b_702a_07b_ org.jenkinsci.plugins @@ -138,7 +138,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1145.ve842533f4d9a_ + 1147.vf46a_1da_37680 @@ -150,7 +150,7 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps - 958.vde579c5541ed + 962.v44eb_d807841a_ test From e6f9e42d64de1e8efe1c513e9ca195e178b19eef Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 13 Jun 2022 15:48:18 -0400 Subject: [PATCH 4/8] Adapting `ReaperTest` to changed activation timing --- .../plugins/kubernetes/pod/retention/ReaperTest.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java index 0344a82c61..0de0797215 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/ReaperTest.java @@ -143,10 +143,12 @@ public void testActivateOnNewComputer() throws IOException, InterruptedException assertFalse("should not be watching cloud", r.isWatchingCloud(cloud.name)); // fire compute on-line event - r.onOnline(kc, tl); + r.preLaunch(kc, tl); // expect new cloud registered - assertTrue("should be watching cloud", r.isWatchingCloud(cloud.name)); + while (!r.isWatchingCloud(cloud.name)) { + Thread.sleep(100); + } kubeClientRequests() .assertRequestCountAtLeast("/api/v1/namespaces/foo/pods?allowWatchBookmarks=true&watch=true", 1); } @@ -180,10 +182,12 @@ public void testReconnectOnNewComputer() throws InterruptedException, IOExceptio KubernetesSlave n2 = addNode(cloud, "p1-123", "p1"); TaskListener tl = mock(TaskListener.class); KubernetesComputer kc = new KubernetesComputer(n2); - r.onOnline(kc, tl); + r.preLaunch(kc, tl); // should have started new watch - assertTrue("watcher is restarted", r.isWatchingCloud(cloud.name)); + while (!r.isWatchingCloud(cloud.name)) { + Thread.sleep(100); + } } @Test(timeout = 10_000) From 34b75bc0db5ae49365660ddc93391aacafa2a1cc Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 7 Jul 2022 16:54:19 -0400 Subject: [PATCH 5/8] https://github.com/jenkinsci/workflow-api-plugin/pull/217 & https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/231 & https://github.com/jenkinsci/workflow-basic-steps-plugin/pull/203 released --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 28fd1c1fed..52f44b0c3e 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ org.jenkins-ci.plugins.workflow workflow-api - 1174.va_3d1b_702a_07b_ + 1182.v41475e53ea_43 org.jenkinsci.plugins @@ -138,7 +138,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1147.vf46a_1da_37680 + 1164.v2334ddcf48d0 @@ -150,7 +150,7 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps - 962.v44eb_d807841a_ + 969.vc4ec3e4854b_f test From 87b6a0243ad1987fd06283cf4379834fe0ef6df8 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Fri, 8 Jul 2022 17:09:16 -0400 Subject: [PATCH 6/8] Picking up various versions from BOM --- pom.xml | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/pom.xml b/pom.xml index 52f44b0c3e..e39167a3af 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,6 @@ org.jenkins-ci.plugins.workflow workflow-api - 1182.v41475e53ea_43 org.jenkinsci.plugins @@ -125,7 +124,6 @@ org.jenkins-ci.plugins metrics - 4.0.2.8.1 io.jenkins.plugins @@ -138,7 +136,6 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1164.v2334ddcf48d0 @@ -150,7 +147,6 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps - 969.vc4ec3e4854b_f test @@ -196,15 +192,7 @@ org.jenkins-ci.plugins ssh-agent - 1.23 test - - - - org.apache.sshd - sshd-core - - org.jenkins-ci.plugins @@ -264,7 +252,7 @@ io.jenkins.tools.bom bom-2.332.x - 1370.vfa_e23fe119c3 + 1478.v81d3dc4f9a_43 import pom From 71b93ed865c7b7d55757eb367fd5577886cb2062 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 11 Jul 2022 09:13:41 -0400 Subject: [PATCH 7/8] `QueueTaskCancelled` https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/249/commits/76432f0f737f2b28becb9aeca14ad45f2bbf22c4 --- pom.xml | 1 + .../pipeline/KubernetesAgentErrorCondition.java | 10 ++-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pom.xml b/pom.xml index e39167a3af..4c89ea7451 100644 --- a/pom.xml +++ b/pom.xml @@ -136,6 +136,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step + 1174.v73a_9a_17edce0 diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java index 345ea013b1..9fb1d5667f 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java @@ -66,14 +66,8 @@ public boolean test(Throwable t, StepContext context) throws IOException, Interr return false; } if (!new AgentErrorCondition().test(t, context)) { - if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(ExecutorStepExecution.QueueTaskCancelled.class::isInstance)) { - LOGGER.fine(() -> "QueueTaskCancelled normally ignored by AgentErrorCondition but might be delivered here from Reaper.TerminateAgentOnContainerTerminated"); - // TODO cleaner to somehow suppress that QueueTaskCancelled and let the underlying RemovedNodeCause be delivered - // (or just let AgentErrorCondition trigger on QueueTaskCancelled) - } else { - LOGGER.fine(() -> "Not a recognized failure: " + t); - return false; - } + LOGGER.fine(() -> "Not a recognized failure: " + t); + return false; } FlowNode _origin = ErrorAction.findOrigin(t, context.get(FlowExecution.class)); if (_origin == null) { From c38fe0fabf27d0583c9b546aa701dc075245e72d Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 12 Jul 2022 16:50:14 -0400 Subject: [PATCH 8/8] `handleNonKubernetes` https://github.com/jenkins-infra/pipeline-library/pull/405#discussion_r916058375 --- pom.xml | 2 +- .../KubernetesAgentErrorCondition.java | 21 +++-- .../config.jelly | 23 ++++++ .../help-handleNonKubernetes.html | 4 + .../KubernetesAgentErrorConditionTest.java | 81 +++++++++++++++++++ 5 files changed, 125 insertions(+), 6 deletions(-) create mode 100644 src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/config.jelly create mode 100644 src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help-handleNonKubernetes.html create mode 100644 src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorConditionTest.java diff --git a/pom.xml b/pom.xml index 4c89ea7451..5be9d2e374 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.jenkins-ci.plugins plugin - 4.40 + 4.42 diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java index 9fb1d5667f..a362e0b68f 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java @@ -41,6 +41,7 @@ import org.jenkinsci.plugins.workflow.support.steps.AgentErrorCondition; import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; import org.kohsuke.stapler.DataBoundConstructor; +import org.kohsuke.stapler.DataBoundSetter; /** * Qualifies {@code node} blocks associated with {@link KubernetesSlave} to be retried if the node was deleted. @@ -57,13 +58,23 @@ public class KubernetesAgentErrorCondition extends ErrorCondition { IGNORED_CONTAINER_TERMINATION_REASONS.add("DeadlineExceeded"); } + private boolean handleNonKubernetes; + @DataBoundConstructor public KubernetesAgentErrorCondition() {} + public boolean isHandleNonKubernetes() { + return handleNonKubernetes; + } + + @DataBoundSetter public void setHandleNonKubernetes(boolean handleNonKubernetes) { + this.handleNonKubernetes = handleNonKubernetes; + } + @Override public boolean test(Throwable t, StepContext context) throws IOException, InterruptedException { if (context == null) { LOGGER.fine("Cannot check error without context"); - return false; + return handleNonKubernetes; } if (!new AgentErrorCondition().test(t, context)) { LOGGER.fine(() -> "Not a recognized failure: " + t); @@ -72,7 +83,7 @@ public boolean test(Throwable t, StepContext context) throws IOException, Interr FlowNode _origin = ErrorAction.findOrigin(t, context.get(FlowExecution.class)); if (_origin == null) { LOGGER.fine(() -> "No recognized origin of error: " + t); - return false; + return handleNonKubernetes; } FlowNode origin = _origin instanceof BlockEndNode ? ((BlockEndNode) _origin).getStartNode() : _origin; LOGGER.fine(() -> "Found origin " + origin + " " + origin.getDisplayFunctionName()); @@ -86,14 +97,14 @@ public boolean test(Throwable t, StepContext context) throws IOException, Interr if (n != null) { if (!(n instanceof KubernetesSlave)) { LOGGER.fine(() -> node + " was not a K8s agent"); - return false; + return handleNonKubernetes; } } else { // May have been removed already, but we can look up the labels to see what it was. Set labels = ws.getLabels(); if (labels.stream().noneMatch(l -> Jenkins.get().clouds.stream().anyMatch(c -> c instanceof KubernetesCloud && ((KubernetesCloud) c).getTemplate(l) != null))) { LOGGER.fine(() -> node + " was not a K8s agent judging by " + labels); - return false; + return handleNonKubernetes; } } Set terminationReasons = ExtensionList.lookupSingleton(Reaper.class).terminationReasons(node); @@ -106,7 +117,7 @@ public boolean test(Throwable t, StepContext context) throws IOException, Interr } } LOGGER.fine(() -> "found no WorkspaceAction starting from " + origin); - return false; + return handleNonKubernetes; } @Symbol("kubernetesAgent") diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/config.jelly b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/config.jelly new file mode 100644 index 0000000000..53def7b9af --- /dev/null +++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/config.jelly @@ -0,0 +1,23 @@ + + + + + + + + + diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help-handleNonKubernetes.html b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help-handleNonKubernetes.html new file mode 100644 index 0000000000..cee421d506 --- /dev/null +++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition/help-handleNonKubernetes.html @@ -0,0 +1,4 @@ +
+ Behave like the generic agent() (Agent errors) when applied to a non-Kubernetes agent. + Useful in cases where it is hard to predict in a job definition whether a Kubernetes or other sort of agent will be used. +
diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorConditionTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorConditionTest.java new file mode 100644 index 0000000000..4a84d8551a --- /dev/null +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorConditionTest.java @@ -0,0 +1,81 @@ +/* + * Copyright 2022 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.csanchez.jenkins.plugins.kubernetes.pipeline; + +import hudson.model.BooleanParameterDefinition; +import hudson.model.BooleanParameterValue; +import hudson.model.Label; +import hudson.model.ParametersAction; +import hudson.model.ParametersDefinitionProperty; +import hudson.model.Result; +import hudson.model.Slave; +import hudson.slaves.OfflineCause; +import java.util.logging.Level; +import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition; +import org.jenkinsci.plugins.workflow.job.WorkflowJob; +import org.jenkinsci.plugins.workflow.job.WorkflowRun; +import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep; +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.jvnet.hudson.test.BuildWatcher; +import org.jvnet.hudson.test.InboundAgentRule; +import org.jvnet.hudson.test.Issue; +import org.jvnet.hudson.test.JenkinsRule; +import org.jvnet.hudson.test.LoggerRule; + +@Issue("JENKINS-49707") +public class KubernetesAgentErrorConditionTest { + + @ClassRule public static BuildWatcher buildWatcher = new BuildWatcher(); + @Rule public JenkinsRule r = new JenkinsRule(); + @Rule public InboundAgentRule inboundAgents = new InboundAgentRule(); + @Rule public LoggerRule logging = new LoggerRule().record(KubernetesAgentErrorCondition.class, Level.FINE); + + @Test public void handleNonKubernetes() throws Exception { + Slave s = r.createSlave(Label.get("remote")); // *not* a KubernetesSlave + WorkflowJob p = r.createProject(WorkflowJob.class, "p"); + p.addProperty(new ParametersDefinitionProperty(new BooleanParameterDefinition("HNK"))); + p.setDefinition(new CpsFlowDefinition( + "retry(count: 2, conditions: [kubernetesAgent(handleNonKubernetes: params.HNK)]) {\n" + + " node('remote') {\n" + + " semaphore 'wait'\n" + + " pwd()\n" + + " }\n" + + "}", true)); + WorkflowRun b = p.scheduleBuild2(0, new ParametersAction(new BooleanParameterValue("HNK", false))).waitForStart(); + SemaphoreStep.waitForStart("wait/1", b); + s.toComputer().disconnect(new OfflineCause.UserCause(null, null)); + while (s.toComputer().isOnline()) { + Thread.sleep(100); + } + SemaphoreStep.success("wait/1", null); + s.toComputer().connect(false); + r.assertBuildStatus(Result.FAILURE, r.waitForCompletion(b)); + b = p.scheduleBuild2(0, new ParametersAction(new BooleanParameterValue("HNK", true))).waitForStart(); + SemaphoreStep.waitForStart("wait/2", b); + s.toComputer().disconnect(new OfflineCause.UserCause(null, null)); + while (s.toComputer().isOnline()) { + Thread.sleep(100); + } + SemaphoreStep.success("wait/2", null); + SemaphoreStep.success("wait/3", null); + s.toComputer().connect(false); + r.assertBuildStatusSuccess(r.waitForCompletion(b)); + } + +}