Skip to content

Commit

Permalink
remote_worker: Serialize fork() calls. Fixes #3356
Browse files Browse the repository at this point in the history
Spawning processes concurrently from multiple threads doesn't work
reliably on Linux (see bug for details).

This change introduces simply puts a synchronized block around fork().

I have tested this change by repeatedly running a build of bazel for 2
hours on a 64 core machine with up to 200 concurrent actions. There
wasn't a single failure.

PiperOrigin-RevId: 164450559
  • Loading branch information
buchgr committed Aug 7, 2017
1 parent 904563c commit 91fb38e
Showing 1 changed file with 34 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import com.google.devtools.build.lib.shell.Command;
import com.google.devtools.build.lib.shell.CommandException;
import com.google.devtools.build.lib.shell.CommandResult;
import com.google.devtools.build.lib.shell.FutureCommandResult;
import com.google.devtools.build.lib.shell.TimeoutKillableObserver;
import com.google.devtools.build.lib.vfs.FileSystemUtils;
import com.google.devtools.build.lib.vfs.Path;
Expand All @@ -49,6 +50,7 @@
import io.grpc.StatusException;
import io.grpc.protobuf.StatusProto;
import io.grpc.stub.StreamObserver;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
Expand All @@ -71,6 +73,8 @@
final class ExecutionServer extends ExecutionImplBase {
private static final Logger logger = Logger.getLogger(ExecutionServer.class.getName());

private final Object lock = new Object();

// The name of the container image entry in the Platform proto
// (see third_party/googleapis/devtools/remoteexecution/*/remote_execution.proto and
// experimental_remote_platform_override in
Expand Down Expand Up @@ -200,41 +204,41 @@ private ActionResult execute(Action action, Path execRoot)
execRoot.getPathString());
long startTime = System.currentTimeMillis();
CommandResult cmdResult = null;
// Linux does not provide a safe API for a multi-threaded program to fork a subprocess. Consider
// the case where two threads both write an executable file and then try to execute it. It can
// happen that the first thread writes its executable file, with the file descriptor still
// being open when the second thread forks, with the fork inheriting a copy of the file
// descriptor. Then the first thread closes the original file descriptor, and proceeds to
// execute the file. At that point Linux sees an open file descriptor to the file and returns
// ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec duality, with
// fork always inheriting a copy of the file descriptor table; if there was a way to fork
// without copying the entire file descriptor table (e.g., only copy specific entries), we could
// avoid this race.
//
// I was able to reproduce this problem reliably by running significantly more threads than
// there are CPU cores on my workstation - the more threads the more likely it happens.
//
// As a workaround, we retry up to two times before we let the exception propagate.
int attempt = 0;
while (true) {

FutureCommandResult futureCmdResult = null;
synchronized (lock) {
// Linux does not provide a safe API for a multi-threaded program to fork a subprocess.
// Consider the case where two threads both write an executable file and then try to execute
// it. It can happen that the first thread writes its executable file, with the file
// descriptor still being open when the second thread forks, with the fork inheriting a copy
// of the file descriptor. Then the first thread closes the original file descriptor, and
// proceeds to execute the file. At that point Linux sees an open file descriptor to the file
// and returns ETXTBSY (Text file busy) as an error. This race is inherent in the fork / exec
// duality, with fork always inheriting a copy of the file descriptor table; if there was a
// way to fork without copying the entire file descriptor table (e.g., only copy specific
// entries), we could avoid this race.
//
// I was able to reproduce this problem reliably by running significantly more threads than
// there are CPU cores on my workstation - the more threads the more likely it happens.
//
// As a workaround, we put a synchronized block around the fork.
try {
futureCmdResult = cmd
.executeAsynchronously(new ByteArrayInputStream(new byte[0]), Command.NO_OBSERVER,
stdoutBuffer, stderrBuffer, true, false);
} catch (CommandException e) {
Throwables.throwIfInstanceOf(e.getCause(), IOException.class);
}
}

if (futureCmdResult != null) {
try {
cmdResult =
cmd.execute(Command.NO_INPUT, Command.NO_OBSERVER, stdoutBuffer, stderrBuffer, true);
cmdResult = futureCmdResult.get();
} catch (AbnormalTerminationException e) {
cmdResult = e.getResult();
} catch (CommandException e) {
// As of this writing, the cause can only be an IOException from the underlying library.
IOException cause = (IOException) e.getCause();
if ((attempt++ < 3) && cause.getMessage().endsWith("Text file busy")) {
// We wait a bit to give the other forks some time to close their open file descriptors.
Thread.sleep(10);
continue;
} else {
throw cause;
}
}
break;
}

long timeoutMillis =
action.hasTimeout()
? Durations.toMillis(action.getTimeout())
Expand Down

0 comments on commit 91fb38e

Please sign in to comment.