Skip to content

Commit

Permalink
Make \BugReport#handleCrash()\ re-entrant to avoid deadlocks.
Browse files Browse the repository at this point in the history
Eliminating a synchronized block using a singleton lock allows this method to
be invoked from multiple threads concurrently, eliminating a deadlock when
`BlazeModule#blazeShutdownOnCrash(DetailedExitCode)` also invokes
`BugReport#handleCrash()`. This deadlock has been seen in practice when
`BuildEventServiceModule#blazeShutdownOnCrash(DetailedExitCode)` fails to
cleanup its `BuildEventArtifactUploaderFactory`.

PiperOrigin-RevId: 548663925
Change-Id: Icf35b3be429d5708acb83dc052e51aecf50e466c
  • Loading branch information
michaeledgar authored and copybara-github committed Jul 17, 2023
1 parent d39cf07 commit 7311f76
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 45 deletions.
114 changes: 73 additions & 41 deletions src/main/java/com/google/devtools/build/lib/bugreport/BugReport.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.lang.management.ManagementFactory;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
Expand All @@ -61,13 +62,31 @@ public final class BugReport {

private static BlazeRuntimeInterface runtime = null;

/**
* The most recent throwable that reported a crash; used only for testing purposes where the JVM
* will not halt.
*/
@SuppressWarnings("StaticAssignmentOfThrowable")
@GuardedBy("LOCK")
@Nullable
private static Throwable lastCrashingThrowable = null;

/**
* A global lock object, used with {@code synchronized} blocks to provide thread-safe access to
* {@link BugReport#lastCrashingThrowable}.
*/
private static final Object LOCK = new Object();

/**
* Flag set by {@link #handleCrash(Crash, CrashContext)} using {@link
* AtomicBoolean#compareAndSet(boolean, boolean)} to determine which thread "wins" racing crashes.
*
* <p>All concurrent calls to {@link #handleCrash(Crash, CrashContext)} will be logged, but only
* the thread which sets this flag {@code true} will propagate the crash to exit data and to each
* {@code BlazeModule}.
*/
private static final AtomicBoolean alreadyHandlingCrash = new AtomicBoolean(false);

private static final boolean SHOULD_NOT_SEND_BUG_REPORT_BECAUSE_IN_TEST =
TestType.isInTest() && System.getenv("ENABLE_BUG_REPORT_LOGGING_IN_TEST") == null;

Expand Down Expand Up @@ -188,7 +207,7 @@ public static void sendBugReport(@FormatString String message, Object... args) {
* report} without additional arguments.
*/
public static void sendBugReport(Throwable exception) {
sendBugReport(exception, /*args=*/ ImmutableList.of());
sendBugReport(exception, /* args= */ ImmutableList.of());
}

/**
Expand All @@ -199,12 +218,12 @@ public static void sendBugReport(Throwable exception) {
* @param values Additional string values to clarify the exception.
*/
public static void sendBugReport(Throwable exception, List<String> args, String... values) {
sendBugReportInternal(exception, /*isFatal=*/ true, filterArgs(args), values);
sendBugReportInternal(exception, /* isFatal= */ true, filterArgs(args), values);
}

/** Logs the bug report, indicating it is not a crash. */
public static void sendNonFatalBugReport(Throwable exception) {
sendBugReportInternal(exception, /*isFatal=*/ false, /*args=*/ ImmutableList.of());
sendBugReportInternal(exception, /* isFatal= */ false, /* args= */ ImmutableList.of());
}

private static void sendBugReportInternal(
Expand Down Expand Up @@ -240,57 +259,60 @@ public static RuntimeException handleCrash(Throwable throwable, String... args)
* Otherwise, for {@link CrashContext#keepAlive}, returns {@code null}, in which case the caller
* is responsible for shutting down the server.
*/
@SuppressWarnings("StaticAssignmentOfThrowable")
@SuppressWarnings({"StaticAssignmentOfThrowable", "LogAndThrow"})
public static void handleCrash(Crash crash, CrashContext ctx) {
int numericExitCode = crash.getDetailedExitCode().getExitCode().getNumericExitCode();
Throwable throwable = crash.getThrowable();
if (runtime != null) {
runtime.fillInCrashContext(ctx);
}
try {
synchronized (LOCK) {
logger.atSevere().withCause(throwable).log("Handling crash with %s", ctx);

if (TestType.isInTest()) {
boolean isFirstCrash = alreadyHandlingCrash.compareAndSet(false, true);
logger.atSevere().withCause(throwable).log(
"Handling crash (isFirstCrash = %s) with CrashContext %s", isFirstCrash, ctx);
try {
if (TestType.isInTest()) {
synchronized (LOCK) {
lastCrashingThrowable = throwable;
}
}

String crashMsg;
String heapDumpPath;
// Might be a wrapped OOM - the detailed exit code reflects the root cause.
boolean isOom = crash.getDetailedExitCode().getExitCode().equals(ExitCode.OOM_ERROR);
if (isOom) {
crashMsg = constructOomExitMessage(ctx.getExtraOomInfo());
heapDumpPath = ctx.getHeapDumpPath();
if (heapDumpPath != null) {
crashMsg += " An attempt will be made to write a heap dump to " + heapDumpPath + ".";
}
} else {
crashMsg = getProductName() + " crashed due to an internal error.";
heapDumpPath = null;
String crashMsg;
String heapDumpPath;
// Might be a wrapped OOM - the detailed exit code reflects the root cause.
boolean isOom = crash.getDetailedExitCode().getExitCode().equals(ExitCode.OOM_ERROR);
if (isOom) {
crashMsg = constructOomExitMessage(ctx.getExtraOomInfo());
heapDumpPath = ctx.getHeapDumpPath();
if (heapDumpPath != null) {
crashMsg += " An attempt will be made to write a heap dump to " + heapDumpPath + ".";
}
crashMsg += " Printing stack trace:\n" + Throwables.getStackTraceAsString(throwable);
ctx.getEventHandler().handle(Event.fatal(crashMsg));

} else {
crashMsg = getProductName() + " crashed due to an internal error.";
heapDumpPath = null;
}
crashMsg += " Printing stack trace:\n" + Throwables.getStackTraceAsString(throwable);
ctx.getEventHandler().handle(Event.fatal(crashMsg));
if (isFirstCrash) {
try {
// Emit exit data before sending a bug report. Bug reports involve an RPC, and given that
// we are crashing, who knows if it will complete. It's more important that we write
// exit code and failure detail information so that the crash can be handled correctly.
emitExitData(crash, ctx, numericExitCode, heapDumpPath);
emitExitData(crash, ctx, heapDumpPath);
// Skip sending a bug report if the crash is an OOM - attempting an RPC while out of
// memory can cause issues. Also, don't try to send a bug report during a crash in a test,
// it will throw itself.
if (ctx.shouldSendBugReport() && !isOom && !TestType.isInTest()) {
sendBugReport(throwable, ctx.getArgs());
}
} finally {
if (ctx.shouldHaltJvm()) {
// Avoid shutdown deadlock issues: If an application shutdown hook crashes, it will
// trigger our Blaze crash handler (this method). Calling System#exit() here, would
// therefore induce a deadlock. This call would block on the shutdown sequence
// completing, but the shutdown sequence would in turn be blocked on this thread
// finishing. Instead, exit fast via halt().
Runtime.getRuntime().halt(numericExitCode);
try {
haltJvmIfRequested(crash, ctx);
} finally {
// Reset the global `alreadyHandlingCrash` flag while testing, so subsequent tests can
// exercise BugReport#handleCrash(). (Note that Runtime#halt() throws in tests.)
if (TestType.isInTest()) {
alreadyHandlingCrash.set(false);
}
}
}
}
Expand All @@ -309,9 +331,7 @@ public static void handleCrash(Crash crash, CrashContext ctx) {
System.err.println("Exception encountered during BugReport#handleCrash:");
t.printStackTrace(System.err);
} finally {
if (ctx.shouldHaltJvm()) {
Runtime.getRuntime().halt(numericExitCode);
}
haltJvmIfRequested(crash, ctx);
}
if (!ctx.shouldHaltJvm()) {
return;
Expand All @@ -320,14 +340,26 @@ public static void handleCrash(Crash crash, CrashContext ctx) {
throw new IllegalStateException("Should have halted", throwable);
}

private static void haltJvmIfRequested(Crash crash, CrashContext ctx) {
if (ctx.shouldHaltJvm()) {
// Avoid shutdown deadlock issues: If an application shutdown hook crashes, it will
// trigger our Blaze crash handler (this method). Calling System#exit() here, would
// therefore induce a deadlock. This call would block on the shutdown sequence
// completing, but the shutdown sequence would in turn be blocked on this thread
// finishing. Instead, exit fast via halt().
Runtime.getRuntime().halt(crash.getDetailedExitCode().getExitCode().getNumericExitCode());
}
}

/**
* Writes exit status files, dumps heap if requested, and calls {@link
* BlazeRuntimeInterface#cleanUpForCrash}.
*/
private static void emitExitData(
Crash crash, CrashContext ctx, int numericExitCode, @Nullable String heapDumpPath) {
private static void emitExitData(Crash crash, CrashContext ctx, @Nullable String heapDumpPath) {
// Writing the exit code status file is only necessary if we are halting. Otherwise, the
// caller is responsible for an orderly shutdown with the proper exit code.
DetailedExitCode detailedExitCode = crash.getDetailedExitCode();
int numericExitCode = detailedExitCode.getExitCode().getNumericExitCode();
if (ctx.shouldHaltJvm()) {
if (CustomExitCodePublisher.maybeWriteExitStatusFile(numericExitCode)) {
logger.atInfo().log("Wrote exit status file.");
Expand All @@ -337,7 +369,7 @@ private static void emitExitData(
}

if (CustomFailureDetailPublisher.maybeWriteFailureDetailFile(
crash.getDetailedExitCode().getFailureDetail())) {
detailedExitCode.getFailureDetail())) {
logger.atInfo().log("Wrote failure detail file.");
} else {
logger.atWarning().log("Did not write failure detail file; check stderr for errors.");
Expand All @@ -354,7 +386,7 @@ private static void emitExitData(
}

if (runtime != null) {
runtime.cleanUpForCrash(crash.getDetailedExitCode());
runtime.cleanUpForCrash(detailedExitCode);
logger.atInfo().log("Cleaned up runtime.");
} else {
logger.atInfo().log("No runtime to clean.");
Expand All @@ -372,7 +404,7 @@ private static void dumpHeap(String path) throws IOException {
ManagementFactory.getPlatformMBeanServer(),
"com.sun.management:type=HotSpotDiagnostic",
HotSpotDiagnosticMXBean.class);
mxBean.dumpHeap(path, /*live=*/ true);
mxBean.dumpHeap(path, /* live= */ true);
}

/**
Expand Down
2 changes: 2 additions & 0 deletions src/test/java/com/google/devtools/build/lib/bugreport/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ java_library(
"//src/main/java/com/google/devtools/build/lib/util:exit_code",
"//src/main/java/com/google/devtools/build/lib/util:logging",
"//src/main/protobuf:failure_details_java_proto",
"//src/test/java/com/google/devtools/build/lib/testutil:TestThread",
"//src/test/java/com/google/devtools/build/lib/testutil:TestUtils",
"//third_party:guava",
"//third_party:guava-testlib",
"//third_party:junit4",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

import static com.google.common.truth.Truth.assertThat;
import static com.google.common.util.concurrent.Futures.immediateFuture;
import static com.google.common.util.concurrent.Uninterruptibles.awaitUninterruptibly;
import static java.util.concurrent.TimeUnit.SECONDS;
import static org.junit.Assert.assertThrows;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.doAnswer;
Expand All @@ -34,6 +36,8 @@
import com.google.devtools.build.lib.server.FailureDetails;
import com.google.devtools.build.lib.server.FailureDetails.Crash.Code;
import com.google.devtools.build.lib.server.FailureDetails.FailureDetail;
import com.google.devtools.build.lib.testutil.TestThread;
import com.google.devtools.build.lib.testutil.TestUtils;
import com.google.devtools.build.lib.util.CrashFailureDetails;
import com.google.devtools.build.lib.util.CustomExitCodePublisher;
import com.google.devtools.build.lib.util.CustomFailureDetailPublisher;
Expand All @@ -48,6 +52,7 @@
import java.nio.file.Path;
import java.security.Permission;
import java.util.Arrays;
import java.util.concurrent.CountDownLatch;
import java.util.logging.Level;
import java.util.logging.LogRecord;
import java.util.logging.Logger;
Expand Down Expand Up @@ -135,17 +140,17 @@ Throwable createThrowable() {
private enum ExceptionType {
FATAL(
new RuntimeException("fatal exception"),
/*isFatal=*/ true,
/* isFatal= */ true,
Level.SEVERE,
"myProductName crashed with args: arg foo"),
NONFATAL(
new IllegalStateException("bug report"),
/*isFatal=*/ false,
/* isFatal= */ false,
Level.WARNING,
"myProductName had a non fatal error with args: arg foo"),
OOM(
new OutOfMemoryError("Java heap space"),
/*isFatal=*/ true,
/* isFatal= */ true,
Level.SEVERE,
"myProductName OOMError: arg foo");

Expand Down Expand Up @@ -219,6 +224,67 @@ public void logException(@TestParameter ExceptionType exceptionType) {
assertThat(got.getLevel()).isEqualTo(exceptionType.level);
}

/**
* Regression test verifying that {@link BugReport#handleCrash(Crash, CrashContext)} is re-entrant
* and can be called from multiple threads even if the first caller is blocking.
*/
@Test
public void handleCrash_reentrant(@TestParameter CrashType crashType)
throws InterruptedException {
////// Arrange:
// A separate thread will block until `firstCrashWaitsLatch` is ready, then crash using the
// `crashType` test parameter.
CountDownLatch firstCrashStartedReportingLatch = new CountDownLatch(1);
CountDownLatch firstCrashWaitsLatch = new CountDownLatch(1);
doAnswer(
(inv) -> {
firstCrashStartedReportingLatch.countDown();
awaitUninterruptibly(firstCrashWaitsLatch);
return null;
})
.when(mockRuntime)
.cleanUpForCrash(any());

Throwable t = crashType.createThrowable();
TestThread blockingCrashThread =
new TestThread(
() -> {
SecurityException e =
assertThrows(SecurityException.class, () -> BugReport.handleCrash(t));
if (e instanceof ExitException) {
int code = ((ExitException) e).code;
assertThat(code).isEqualTo(crashType.expectedExitCode.getNumericExitCode());
}
});
blockingCrashThread.start();
// Wait for the TestThread to start, call BugReport#handleCrash(), and call
// BlazeRuntimeInterface#cleanUpForCrash().
assertThat(firstCrashStartedReportingLatch.await(TestUtils.WAIT_TIMEOUT_SECONDS, SECONDS))
.isTrue();
assertThat(blockingCrashThread.isAlive()).isTrue();

////// Act:
// Crash on the main test thread while the first crash is still blocking forever.
Throwable secondThreadCrash = new UnsupportedOperationException("Second thread crashing.");
SecurityException secondThreadSecurityException =
assertThrows(SecurityException.class, () -> BugReport.handleCrash(secondThreadCrash));

////// Assert:
if (secondThreadSecurityException instanceof ExitException) {
int code = ((ExitException) secondThreadSecurityException).code;
assertThat(code).isEqualTo(ExitCode.BLAZE_INTERNAL_ERROR.getNumericExitCode());
}
// Verify the latest exception stored is the main-thread crashing throwable.
assertThat(BugReport.getAndResetLastCrashingThrowableIfInTest())
.isSameInstanceAs(secondThreadCrash);

// Unblock the first crashing thread and verify that it completes. Asserting its state will
// ensure the test fails if the wrong exit code is returned to the first call to handleCrash().
firstCrashWaitsLatch.countDown();
blockingCrashThread.joinAndAssertState(TestUtils.WAIT_TIMEOUT_MILLISECONDS);
verify(mockRuntime).cleanUpForCrash(any());
}

@Test
public void convenienceMethod(@TestParameter CrashType crashType) throws Exception {
Throwable t = crashType.createThrowable();
Expand Down Expand Up @@ -401,4 +467,3 @@ public void checkExit(int code) {
public void checkPermission(Permission p) {} // Allow everything else.
}
}

0 comments on commit 7311f76

Please sign in to comment.