Skip to content

Commit

Permalink
Merge pull request #916 from dwnusbaum/cps-vm-thread-metrics
Browse files Browse the repository at this point in the history
Improve support bundle data for running Pipelines, update support bundle component names and file paths, and update log warning for a particular type of error during step completion
  • Loading branch information
dwnusbaum authored Aug 15, 2024
2 parents 7141c52 + e8af1ac commit 3519a32
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,10 @@ enum TimingKind {
* Running inside {@link CpsVmExecutorService}, which includes many other things.
*/
run,
/**
* Time spent waiting in the queue for {@link CpsVmExecutorService}.
*/
runQueue,
/**
* Saving the program state.
* @see CpsThreadGroup#saveProgram(File)
Expand All @@ -400,6 +404,8 @@ enum TimingKind {

/** accumulated time in ns of a given {@link TimingKind#name}; {@link String} key for pretty XStream form */
transient @NonNull Map<String, LongAdder> liveTimings = new ConcurrentHashMap<>();
/** instances of {@link Timing} which have not yet completed for reporting counts and durations in support bundles. Never persisted. */
transient @NonNull Set<Timing> liveIncompleteTimings = ConcurrentHashMap.newKeySet();
/** XStream simplified form of {@link #liveTimings} */
private Map<String, Long> timings;

Expand Down Expand Up @@ -433,7 +439,16 @@ private Timing(TimingKind kind) {
start = System.nanoTime();
}

TimingKind getKind() {
return kind;
}

long getStartNanos() {
return start;
}

@Override public void close() {
liveIncompleteTimings.remove(this);
liveTimings.computeIfAbsent(kind.name(), k -> new LongAdder()).add(System.nanoTime() - start);
}
}
Expand All @@ -444,7 +459,9 @@ private Timing(TimingKind kind) {
* @return something to {@link Timing#close} when finished
*/
Timing time(TimingKind kind) {
return new Timing(kind);
var timing = new Timing(kind);
liveIncompleteTimings.add(timing);
return timing;
}

static final Logger TIMING_LOGGER = Logger.getLogger(CpsFlowExecution.class.getName() + ".timing");
Expand Down Expand Up @@ -1879,6 +1896,7 @@ public Object unmarshal(HierarchicalStreamReader reader, final UnmarshallingCont
la.add(kv.getValue());
return la;
}));
result.liveIncompleteTimings = ConcurrentHashMap.newKeySet();
return result;
} catch (Exception ex) {
LOGGER.log(Level.SEVERE, "Failed to even load the FlowExecution", ex);
Expand Down Expand Up @@ -2025,15 +2043,15 @@ public void autopersist(@NonNull FlowNode n) throws IOException {
}

@Override public String getDisplayName() {
return "Timing data about recently completed Pipeline builds";
return "Recently completed Pipeline builds";
}

@Override public ComponentCategory getCategory() {
return ComponentCategory.BUILDS;
}

@Override public void addContents(Container container) {
container.add(new Content("nodes/master/pipeline-timings.txt") {
container.add(new Content("nodes/master/pipeline-recent-builds.txt") {
@Override public void writeTo(OutputStream outputStream) throws IOException {
PrintWriter pw = new PrintWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8));
for (Job<?, ?> job : Jenkins.get().getAllItems(Job.class)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,9 @@ public void onFailure(Throwable t) {
}
});
} catch (IOException x) {
LOGGER.log(Level.FINE, null, x);
// TODO: If the problem is with the FlowNode and not the CpsFlowExecution, should we try to call

Check warning on line 438 in plugin/src/main/java/org/jenkinsci/plugins/workflow/cps/CpsStepContext.java

View check run for this annotation

ci.jenkins.io / Open Tasks Scanner

TODO

NORMAL: If the problem is with the FlowNode and not the CpsFlowExecution, should we try to call
// CpsVmExecutorService.reportProblem or CpsFlowExecution.croak to kill the build right away?
LOGGER.log(Level.WARNING, "Unable to load FlowNode or CpsFlowExecution when completing " + this + ", which is likely to cause its execution to hang indefinitely", x);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,29 @@
import com.cloudbees.jenkins.support.api.Content;
import com.google.common.util.concurrent.FutureCallback;
import hudson.Extension;
import hudson.Functions;
import hudson.model.Action;
import hudson.model.Queue;
import hudson.model.Run;
import hudson.security.Permission;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import jenkins.model.Jenkins;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.util.HashSet;
import java.util.Map;
import java.util.Optional;
import java.util.TreeMap;
import java.util.concurrent.atomic.LongAdder;
import java.util.stream.Collectors;
import org.jenkinsci.plugins.workflow.flow.FlowExecution;
import org.jenkinsci.plugins.workflow.flow.FlowExecutionList;
import org.kohsuke.stapler.HttpResponses;
Expand Down Expand Up @@ -99,22 +109,56 @@ public CpsThreadDump getThreadDump() {
}

@Override public String getDisplayName() {
return "Thread dumps of running Pipeline builds";
return "Running Pipeline builds";
}

@Override public ComponentCategory getCategory() {
return ComponentCategory.BUILDS;
}

@Override public void addContents(Container container) {
container.add(new Content("nodes/master/pipeline-thread-dump.txt") {
@Override public void writeTo(OutputStream outputStream) throws IOException {
container.add(new Content("nodes/master/pipeline-running-builds.txt") {
@Override public void writeTo(OutputStream outputStream) {
PrintWriter pw = new PrintWriter(new OutputStreamWriter(outputStream, StandardCharsets.UTF_8));
for (FlowExecution flow : FlowExecutionList.get()) {
if (flow instanceof CpsFlowExecution) {
pw.println("Build: " + flow.getOwner().getExecutable());
((CpsFlowExecution) flow).getThreadDump().print(pw);
pw.println("Approximate graph size: " + ((CpsFlowExecution) flow).approximateNodeCount());
Queue.Executable ownerExec;
try {
ownerExec = flow.getOwner().getExecutable();
} catch (IOException e) {
pw.println("No data available for " + flow);
Functions.printStackTrace(e, pw);
pw.println();
continue;
}
pw.println("Build: " + ownerExec);
if (ownerExec instanceof Run<?, ?>) {
var run = (Run<?, ?>) ownerExec;
var started = Instant.ofEpochMilli(run.getStartTimeInMillis());
pw.println("Started: " + started);
var duration = Duration.between(started, Instant.now());
pw.print("Duration: " + duration);
if (duration.toDays() > 3) {
pw.println(" (Running for more than 3 days!)");
} else {
pw.println();
}
}
var cpsFlow = (CpsFlowExecution) flow;
Map<String, LongAdder> sortedTimings = new TreeMap<>(cpsFlow.liveTimings);
pw.println("Timings:");
sortedTimings.forEach((k, v) -> pw.println(" " + k + "\t" + v.longValue() / 1000 / 1000 + "ms"));
pw.println("Active operations:");
long nanos = System.nanoTime();
Map<String, Optional<CountAndDuration>> sortedIncompleteTimings = new HashSet<>(cpsFlow.liveIncompleteTimings).stream()
.collect(Collectors.groupingBy(t -> t.getKind().name(), TreeMap::new,
Collectors.mapping(t -> new CountAndDuration(nanos - t.getStartNanos()),
Collectors.reducing(CountAndDuration::new))));
sortedIncompleteTimings.forEach((k, optional) ->
optional.ifPresent(cd ->
pw.println(" " + k + "\t" + cd.count + "\t" + cd.duration / 1000 / 1000 + "ms")));
pw.println("Approximate graph size: " + cpsFlow.approximateNodeCount());
cpsFlow.getThreadDump().print(pw);
pw.println();
}
}
Expand All @@ -123,6 +167,19 @@ public CpsThreadDump getThreadDump() {
});
}

private static class CountAndDuration {
private final int count;
private final long duration;
CountAndDuration(long duration) {
this.count = 1;
this.duration = duration;
}
CountAndDuration(CountAndDuration a, CountAndDuration b) {
this.count = a.count + b.count;
this.duration = a.duration + b.duration;
}
}

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ public Void call() {

@Override
protected Runnable wrap(final Runnable r) {
var timing = cpsThreadGroup.getExecution().time(CpsFlowExecution.TimingKind.runQueue);
return () -> {
timing.close();
ThreadContext context = setUp();
try {
r.run();
Expand Down Expand Up @@ -130,7 +132,9 @@ private void reportProblem(Throwable t) {

@Override
protected <V> Callable<V> wrap(final Callable<V> r) {
var timing = cpsThreadGroup.getExecution().time(CpsFlowExecution.TimingKind.runQueue);
return () -> {
timing.close();
ThreadContext context = setUp();
try {
return r.call();
Expand Down

0 comments on commit 3519a32

Please sign in to comment.