Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix a deadlock in syscallbuf unmapping after vfork #3826

Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1493,6 +1493,8 @@ set(TESTS_WITH_PROGRAM
clone_interruption
# Disabled because it fails
# clone_share_vm
clone_syscallbuf_cleanup_blocked
clone_syscallbuf_cleanup_cpu
clone_vfork
concurrent_signals
conditional_breakpoint_calls
Expand Down
8 changes: 8 additions & 0 deletions src/AddressSpace.h
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,14 @@ class AddressSpace : public HasTaskSet {
// Returns true if the range is completely covered by private mappings
bool range_is_private_mapping(const MemoryRange& range) const;

/**
* When two processes share an address space (e.g. with vfork(2) or
* clone(2) CLONE_VM), and one process calls execve(2), we need to unmap
* that process's syscallbuf. This list is checked the next time a task
* in that address space runs to perform the unmapping
*/
std::vector<MemoryRange> regions_pending_unmap;

private:
struct Breakpoint;
typedef std::map<remote_code_ptr, Breakpoint> BreakpointMap;
Expand Down
1 change: 1 addition & 0 deletions src/RecordSession.cc
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,7 @@ void RecordSession::task_continue(const StepState& step_state) {
}
}
}
t->unmap_dead_syscallbufs_if_required();
t->resume_execution(resume, RESUME_NONBLOCKING, ticks_request);
}

Expand Down
47 changes: 25 additions & 22 deletions src/Task.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1057,29 +1057,17 @@ static bool is_long_mode_segment(uint32_t segment) {
#endif

void Task::post_exec(const string& exe_file) {
Task* stopped_task_in_address_space = nullptr;
bool other_task_in_address_space = false;
for (Task* t : as->task_set()) {
if (t != this) {
other_task_in_address_space = true;
if (t->is_stopped_) {
stopped_task_in_address_space = t;
break;
}
}
// If the address space of this process which just exec'd is shared with another process
// (via vfork(2) or CLONE_VM perhaps), we will be leaving behind the syscallbuf mappings
// for this pid in the shared address space. Make a note of this, so that the next time
// we run a task in tihs address space, we unmap these buffers. (n.b. we can't clean up
// those buffers *before* the exec completes, because it might fail in which case we
// souldn't have cleaned them up.)
if (scratch_ptr) {
as->regions_pending_unmap.push_back(MemoryRange(scratch_ptr, scratch_size));
}
if (stopped_task_in_address_space) {
LOG(warn) << "Unmapping buffers using tid " << stopped_task_in_address_space->tid;
AutoRemoteSyscalls remote(stopped_task_in_address_space);
unmap_buffers_for(remote, this, syscallbuf_child);
} else if (other_task_in_address_space) {
// We should clean up our syscallbuf/scratch but that's too hard since we
// have no stopped task to use for that :-(.
// (We can't clean up those buffers *before* the exec completes, because it
// might fail in which case we shouldn't have cleaned them up.)
// Just let the buffers leak. The AddressSpace will clean up our local
// shared buffer when it's destroyed.
LOG(warn) << "Intentionally leaking syscallbuf after exec for task " << tid;
if (!syscallbuf_child.is_null()) {
as->regions_pending_unmap.push_back(MemoryRange(syscallbuf_child, syscallbuf_size));
}

session().post_exec();
Expand Down Expand Up @@ -1130,6 +1118,21 @@ void Task::post_exec_syscall(const std::string& original_exe_file) {

bool Task::execed() const { return tg->execed; }

void Task::unmap_dead_syscallbufs_if_required() {
if (!as->regions_pending_unmap.empty()) {
LOG(warn) << "Using " << tid << " to unmap syscallbuf regions for "
<< "previously exec'd processes";
AutoRemoteSyscalls remote(this);
std::vector<MemoryRange> regions_pending_unmap;
std::swap(regions_pending_unmap, as->regions_pending_unmap);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could be a move, but it's fine.

for (auto region : regions_pending_unmap) {
if (remote.infallible_munmap_syscall_if_alive(region.start(), region.size())) {
vm()->unmap(this, region.start(), region.size());
}
}
}
}

void Task::flush_inconsistent_state() { ticks = 0; }

string Task::read_c_str(remote_ptr<char> child_addr, bool *ok) {
Expand Down
7 changes: 7 additions & 0 deletions src/Task.h
Original file line number Diff line number Diff line change
Expand Up @@ -1142,6 +1142,13 @@ class Task {
// (used to avoid double recording on unexpected exit)
bool last_syscall_entry_recorded;

/*
* Called before the scheduler resumes a task to check if the task's address
* space has any leftover syscallbufs from dead processes which shared the
* address space
*/
void unmap_dead_syscallbufs_if_required();

protected:
Task(Session& session, pid_t tid, pid_t rec_tid, uint32_t serial,
SupportedArch a);
Expand Down
106 changes: 106 additions & 0 deletions src/test/clone_syscallbuf_cleanup_blocked.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "util.h"

static int pipefds[2];
static const char *CHILD_PROCESS_NAME = "clone_syscallbuf_cleanup_child";

void assert_syscallbuf_count(int expected) {
FILE *f = fopen("/proc/self/maps", "r");
test_assert(!!f);

int actual = 0;
for (;;) {
size_t len = 0;
char *line = NULL;
int ret = getline(&line, &len, f);
if (ret == -1) {
break;
}
if (strstr(line, "rr-shared-syscallbuf")) {
actual++;
}
free(line);
}
test_assert(expected == actual);
}

static int exec_proc(__attribute__((unused)) void* arg) {
// Close the reading end of the pipe in this process
close(pipefds[0]);

// Wait for the parent to be blocked in a read(2) syscall waiting on the pipe
char wchan_path[PATH_MAX];
snprintf(wchan_path, PATH_MAX, "/proc/%d/wchan", getppid());
char wchan[1024] = {0};
do {
FILE *f = fopen(wchan_path, "r");
fread(wchan, 1, 1024, f);
fclose(f);
} while (strncmp(wchan, "pipe_read", 1024) != 0);

sleep(1);

assert_syscallbuf_count(2);
// Now exec our child. The child will un-freeze the parent.
char wpipe_name[PATH_MAX];
snprintf(wpipe_name, sizeof(wpipe_name), "/proc/self/fd/%d", pipefds[1]);
execl("/proc/self/exe", CHILD_PROCESS_NAME, wpipe_name, NULL);

test_assert("Not reached" && 0);
return 0;
}

static void execd_child_proc(char *pipe_file) {
// Unblock the parent by writing a . to the shared pipe
pipefds[1] = open(pipe_file, O_WRONLY);
test_assert(pipefds[1] != -1);

char dummy[1] = { '.' };
int ret = write(pipefds[1], dummy, 1);
test_assert(ret == 1);

close(pipefds[1]);
}

int main(int argc, char **argv) {
// This is what get exec'd from exec_proc
if (argc == 2 && strcmp(argv[0], CHILD_PROCESS_NAME) == 0) {
execd_child_proc(argv[1]);
return 0;
}

int ret;

// Before forking, there should only be one syscallbuf
assert_syscallbuf_count(1);

ret = pipe2(pipefds, 0);
test_assert(ret != -1);

// Spawn a process which shares our address space, and will execve(2)
const size_t stack_size = 1 << 20;
void* exec_proc_stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
pid_t exec_proc_pid = clone(exec_proc, exec_proc_stack + stack_size, CLONE_VM | SIGCHLD,
NULL, NULL, NULL, NULL);

// This proces needs the read end of the pipe, close the write end
close(pipefds[1]);

// Block ourselves attempting to read from the pipe.
char dummy[1] = { 0 };
ret = read(pipefds[0], dummy, 1);
// When we are woken up, we should have read a byte
test_assert(ret == 1);
test_assert(dummy[0] == '.');

// This means the child exec'd so we should have cleaned up the syscallbuf
assert_syscallbuf_count(1);

// Reap the exec'd child
test_assert(exec_proc_pid == waitpid(exec_proc_pid, NULL, 0));
atomic_puts("EXIT-SUCCESS");
return 0;
}

4 changes: 4 additions & 0 deletions src/test/clone_syscallbuf_cleanup_blocked.run
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source `dirname $0`/util.sh
skip_if_no_syscall_buf
record $TESTNAME
replay
95 changes: 95 additions & 0 deletions src/test/clone_syscallbuf_cleanup_cpu.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */

#include "util.h"

static const size_t SHARED_MEMFD_SIZE = 4096;
static int shared_memfd;
static volatile char *shared_memfd_mapping;
static const char *CHILD_PROCESS_NAME = "clone_syscallbuf_cleanup_child";

void assert_syscallbuf_count(int expected) {
FILE *f = fopen("/proc/self/maps", "r");
test_assert(!!f);

int actual = 0;
for (;;) {
size_t len = 0;
char *line = NULL;
int ret = getline(&line, &len, f);
if (ret == -1) {
break;
}
if (strstr(line, "rr-shared-syscallbuf")) {
actual++;
}
free(line);
}
test_assert(expected == actual);
}

static int exec_proc(__attribute__((unused)) void* arg) {
assert_syscallbuf_count(2);

char memfd_name[PATH_MAX];
snprintf(memfd_name, sizeof(memfd_name), "/proc/self/fd/%d", shared_memfd);
execl("/proc/self/exe", CHILD_PROCESS_NAME, memfd_name, NULL);

test_assert("Not reached" && 0);
return 0;
}

static void execd_child_proc(char *shared_memfd_mapping_path) {
// Need to re-map the shared memory we're going to use to poke the parent process
shared_memfd = open(shared_memfd_mapping_path, O_RDWR);
test_assert(shared_memfd != -1);
shared_memfd_mapping = mmap(NULL, SHARED_MEMFD_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
shared_memfd, 0);
test_assert(shared_memfd_mapping != MAP_FAILED);

// And do said poking
shared_memfd_mapping[0] = 1;
}


int main(int argc, char **argv) {
// This is what get exec'd from exec_proc
if (argc == 2 && strcmp(argv[0], CHILD_PROCESS_NAME) == 0) {
execd_child_proc(argv[1]);
return 0;
}

int ret;

// Before forking, there should only be one syscallbuf
assert_syscallbuf_count(1);

// mmap some memory that can be shared across execve(2)
shared_memfd = memfd_create("shared_page", 0);
test_assert(shared_memfd != -1);
ret = ftruncate(shared_memfd, SHARED_MEMFD_SIZE);
test_assert(ret != -1);
shared_memfd_mapping = mmap(NULL, SHARED_MEMFD_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED,
shared_memfd, 0);
test_assert(shared_memfd_mapping != MAP_FAILED);
shared_memfd_mapping[0] = 0;

// Spawn a process which shares our address space, and will execve(2)
const size_t stack_size = 1 << 20;
void* exec_proc_stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
test_assert(exec_proc_stack != MAP_FAILED);
pid_t exec_proc_pid = clone(exec_proc, exec_proc_stack + stack_size, CLONE_VM | SIGCHLD,
NULL, NULL, NULL, NULL);
test_assert(exec_proc_pid != -1);

// Now spin waiting for the exec'd process to set a bit in the shard mapping
while (shared_memfd_mapping[0] != 1) { ; }

// This means the child exec'd so we should have cleaned up the syscallbuf
assert_syscallbuf_count(1);

// Reap the exec'd child
test_assert(exec_proc_pid == waitpid(exec_proc_pid, NULL, 0));
atomic_puts("EXIT-SUCCESS");
return 0;
}
4 changes: 4 additions & 0 deletions src/test/clone_syscallbuf_cleanup_cpu.run
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source `dirname $0`/util.sh
skip_if_no_syscall_buf
record $TESTNAME
replay