Skip to content

Commit

Permalink
fix(profiling): remove slow getpid call from memalloc path (#11848)
Browse files Browse the repository at this point in the history
memalloc uses getpid to detect whether the process has forked, so that
we can unlock the memalloc lock in the child process (if it isn't
already locked). Unfortunately the getpid call is quite slow. From the
man page: "calls to getpid() always invoke the actual system call,
rather than returning a cached value." Furthermore, we _always_ attempt
to take the lock for allocations, even if we aren't going to sample
them. So this is basically adding a syscall to every allocation.

Move this logic out of the allocation path. Switch to using
pthread_atfork handlers to ensure that the lock is held prior to
forking, and unlock it in the parent and child after forking. This
(maybe) has the added benefit of making sure the data structures are in
a consistent state in the child process after forking. Unclear if that's
an issue prior to this change, though. I may be missing some code that
resets the profiler on fork anyway?
  • Loading branch information
nsrip-dd authored Jan 2, 2025
1 parent 00cd9fd commit 6bfe77e
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 13 deletions.
25 changes: 25 additions & 0 deletions ddtrace/profiling/collector/_memalloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,28 @@ static alloc_tracker_t* global_alloc_tracker;
static void
memalloc_init(void);

// pthread_atfork "prepare" handler for the allocation profiler.
//
// Takes g_memalloc_lock with a blocking acquire right before fork() so the
// profiler's data structures are in a consistent state when the child's copy
// of the address space is created. The lock is released again by the
// post-fork handlers in both the parent and the child.
//
// The remaining memalloc code paths only ever trylock, so holding the mutex
// here cannot deadlock if another fork handler happens to allocate.
static void
memalloc_prefork(void)
{
    memlock_lock(&g_memalloc_lock);
}

// pthread_atfork "parent" handler for the allocation profiler: releases
// g_memalloc_lock, which memalloc_prefork acquired just before the fork.
static void
memalloc_postfork_parent(void)
{
    memlock_unlock(&g_memalloc_lock);
}

// pthread_atfork "child" handler for the allocation profiler: releases the
// child's copy of g_memalloc_lock, which was taken by memalloc_prefork in the
// parent immediately before the fork.
//
// NOTE(review): this relies on the child being allowed to unlock a mutex that
// was locked before fork() -- confirm this holds for the mutex type that
// memlock_init creates on every supported platform.
static void
memalloc_postfork_child(void)
{
    memlock_unlock(&g_memalloc_lock);
}

#ifdef _MSC_VER
#pragma section(".CRT$XCU", read)
__declspec(allocate(".CRT$XCU")) void (*memalloc_init_func)(void) = memalloc_init;
Expand All @@ -81,6 +103,9 @@ memalloc_init()
}
}
memlock_init(&g_memalloc_lock, crash_on_mutex_pass);
#ifndef _WIN32
pthread_atfork(memalloc_prefork, memalloc_postfork_parent, memalloc_postfork_child);
#endif
}

static void
Expand Down
22 changes: 22 additions & 0 deletions ddtrace/profiling/collector/_memalloc_heap.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,25 @@ static heap_tracker_t global_heap_tracker;
static void
memheap_init(void);

// pthread_atfork "prepare" handler for the heap profiler: blocks until
// g_memheap_lock is held so the fork happens with the lock owned by the
// forking thread. The rationale is the same as for memalloc_prefork in
// _memalloc.c; see the comment there.
static void
memheap_prefork(void)
{
    memlock_lock(&g_memheap_lock);
}

// pthread_atfork "parent" handler for the heap profiler: releases
// g_memheap_lock, which memheap_prefork acquired just before the fork.
static void
memheap_postfork_parent(void)
{
    memlock_unlock(&g_memheap_lock);
}

// pthread_atfork "child" handler for the heap profiler: releases the child's
// copy of g_memheap_lock, which was taken by memheap_prefork in the parent
// immediately before the fork.
//
// NOTE(review): this relies on the child being allowed to unlock a mutex that
// was locked before fork() -- confirm this holds for the mutex type that
// memlock_init creates on every supported platform.
static void
memheap_postfork_child(void)
{
    memlock_unlock(&g_memheap_lock);
}

#ifdef _MSC_VER
#pragma section(".CRT$XCU", read)
__declspec(allocate(".CRT$XCU")) void (*memheap_init_func)(void) = memheap_init;
Expand All @@ -60,6 +79,9 @@ memheap_init()
}
}
memlock_init(&g_memheap_lock, crash_on_mutex_pass);
#ifndef _WIN32
pthread_atfork(memheap_prefork, memheap_postfork_parent, memheap_postfork_child);
#endif
}

static uint32_t
Expand Down
26 changes: 13 additions & 13 deletions ddtrace/profiling/collector/_memalloc_reentrant.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,19 +125,6 @@ memlock_trylock(memlock_t* lock)
if (!lock)
return false;

#ifdef __linux__
// On Linux, we need to make sure we didn't just fork
// pthreads will guarantee the lock is consistent, but we at least need to clear it
static pid_t my_pid = 0;
if (my_pid == 0) {
my_pid = getpid();
} else if (my_pid != getpid()) {
// We've forked, so we need to free the lock
memlock_unlock(lock);
my_pid = getpid();
}
#endif

#ifdef _WIN32
bool result = WAIT_OBJECT_0 == WaitForSingleObject(lock->mutex, 0); // 0ms timeout -> no wait
#else
Expand All @@ -153,6 +140,19 @@ memlock_trylock(memlock_t* lock)
return result;
}

// Blocking acquire of a memlock.
//
// Unlike memlock_trylock, this waits for as long as it takes to obtain the
// underlying mutex (an INFINITE wait on Windows, pthread_mutex_lock
// elsewhere). A NULL lock is silently ignored. The result of the underlying
// wait/lock call is not inspected, so callers get no failure indication.
static inline void
memlock_lock(memlock_t* lock)
{
    if (lock) {
#ifdef _WIN32
        WaitForSingleObject(lock->mutex, INFINITE);
#else
        pthread_mutex_lock(&lock->mutex);
#endif
    }
}

// Cleanup function
static inline bool
memlock_destroy(memlock_t* lock)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
fixes:
- |
profiling: Removed a system call from the memory allocation profiler, used to detect forks,
which ran on every allocation and resulted in a significant slowdown.

0 comments on commit 6bfe77e

Please sign in to comment.