Skip to content

Commit

Permalink
perf: introduce stack cache
Browse files Browse the repository at this point in the history
We hash stacks so that we can cache them when emitting data in the
binary format.
  • Loading branch information
P403n1x87 committed Apr 9, 2024
1 parent 4559915 commit be46795
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 30 deletions.
3 changes: 2 additions & 1 deletion src/cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ queue_item__destroy(queue_item_t * self, void (*deallocator)(value_t)) {
if (!isvalid(self))
return;

deallocator(self->value);
if (deallocator)
deallocator(self->value);

free(self);
}
Expand Down
4 changes: 2 additions & 2 deletions src/events.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@
} \
}

#define emit_stack(format, pid, iid, tid, ...) \
#define emit_stack(hash, format, pid, iid, tid, ...) \
{ \
if (pargs.binary) { \
mojo_stack(pid, iid, tid); \
mojo_stack(hash, pid, iid, tid); \
} else { \
fprintfp(pargs.output_file, format, pid, iid, tid, __VA_ARGS__); \
} \
Expand Down
20 changes: 14 additions & 6 deletions src/mojo.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include "cache.h"
#include "platform.h"

#define MOJO_VERSION 3
#define MOJO_VERSION 4

enum {
MOJO_RESERVED,
Expand All @@ -44,7 +44,7 @@ enum {
MOJO_METRIC_TIME,
MOJO_METRIC_MEMORY,
MOJO_STRING,
MOJO_STRING_REF,
MOJO_STACK_REF,
MOJO_MAX,
};

Expand Down Expand Up @@ -119,10 +119,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
mojo_string(label); \
mojo_fstring(__VA_ARGS__);

#define mojo_stack(pid, iid, tid) \
mojo_event(MOJO_STACK); \
mojo_integer(pid, 0); \
mojo_integer(iid, 0); \
// Emit a MOJO_STACK event: the stack's hash key (as a ref), then the
// process id and interpreter id as integers, and the thread id as a
// formatted string. The key lets later samples refer back to this stack
// via mojo_stack_ref instead of re-emitting it.
// NOTE(review): expands to multiple bare statements (no do/while(0)
// wrapper, matching the other mojo_* macros in this header) — must not
// be used as the body of an unbraced if/else.
#define mojo_stack(key, pid, iid, tid) \
mojo_event(MOJO_STACK); \
mojo_ref(key); \
mojo_integer(pid, 0); \
mojo_integer(iid, 0); \
mojo_fstring(FORMAT_TID, tid);

#define mojo_frame(frame) \
Expand Down Expand Up @@ -160,4 +161,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
mojo_event(MOJO_STRING_REF); \
mojo_ref(key);

// Emit a MOJO_STACK_REF event: references a previously-emitted stack by
// its hash key (see the stack cache lookup in py_thread__emit_sample),
// followed by the process id, interpreter id and formatted thread id.
// NOTE(review): expands to multiple bare statements (no do/while(0)
// wrapper, matching the other mojo_* macros in this header) — must not
// be used as the body of an unbraced if/else.
#define mojo_stack_ref(key, pid, iid, tid) \
mojo_event(MOJO_STACK_REF); \
mojo_ref(key); \
mojo_integer(pid, 0); \
mojo_integer(iid, 0); \
mojo_fstring(FORMAT_TID, tid);

#endif
15 changes: 14 additions & 1 deletion src/py_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
#include "py_thread.h"


#define MAX_STACK_CACHE_SIZE (1 << 16) // 64K


// ---- PRIVATE ---------------------------------------------------------------

#define py_proc__memcpy(self, raddr, size, dest) copy_memory(self->proc_ref, raddr, size, dest)
Expand Down Expand Up @@ -708,6 +711,15 @@ py_proc_new(int child) {

py_proc->frames_heap = py_proc->frames = NULL_MEM_BLOCK;

py_proc->stack_cache = lru_cache_new(MAX_STACK_CACHE_SIZE, NULL);
if (!isvalid(py_proc->stack_cache)) {
log_e("Failed to allocate stack cache");
goto error;
}
#ifdef DEBUG
py_proc->stack_cache->name = "stack cache";
#endif

py_proc->frame_cache = lru_cache_new(MAX_FRAME_CACHE_SIZE, (void (*)(value_t)) frame__destroy);
if (!isvalid(py_proc->frame_cache)) {
log_e("Failed to allocate frame cache");
Expand Down Expand Up @@ -1175,7 +1187,7 @@ _py_proc__sample_interpreter(py_proc_t * self, PyInterpreterState * is, ctime_t
}
}

py_thread__emit_collapsed_stack(
py_thread__emit_sample(
&py_thread,
interp_id,
time_delta,
Expand Down Expand Up @@ -1327,6 +1339,7 @@ py_proc__destroy(py_proc_t * self) {

lru_cache__destroy(self->string_cache);
lru_cache__destroy(self->frame_cache);
lru_cache__destroy(self->stack_cache);

free(self);
}
1 change: 1 addition & 0 deletions src/py_proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ typedef struct {

void * is_raddr;

lru_cache_t * stack_cache;
lru_cache_t * frame_cache;
lru_cache_t * string_cache;

Expand Down
56 changes: 38 additions & 18 deletions src/py_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,6 @@ _py_thread__unwind_iframe_stack(py_thread_t * self, void * iframe_raddr) {
break;
}
}

invalid = fail(_py_thread__resolve_py_stack(self)) || invalid;

return invalid;
}
Expand Down Expand Up @@ -893,7 +891,7 @@ py_thread__next(py_thread_t * self) {

// ----------------------------------------------------------------------------
void
py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
py_thread__emit_sample(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
if (!pargs.full && pargs.memory && mem_delta == 0)
return;

Expand Down Expand Up @@ -922,18 +920,8 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
}
}

// Group entries by thread.
emit_stack(
pargs.head_format, self->proc->pid, interp_id, self->tid,
// These are relevant only in `where` mode
is_idle ? "💤" : "🚀",
self->proc->child ? "🧒" : ""
);

int error = FALSE;

#ifdef NATIVE

// We sample the kernel frame stack BEFORE interrupting because otherwise
// we would see the ptrace syscall call stack, which is not very interesting.
// The downside is that the kernel stack might not be in sync with the other
Expand All @@ -953,6 +941,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t

V_DESC(self->proc->py_v);

stack_hash_t stack_hash = 0;
if (isvalid(self->top_frame)) {
if (V_MIN(3, 11)) {
if (fail(_py_thread__unwind_cframe_stack(self))) {
Expand All @@ -966,11 +955,41 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
error = TRUE;
}
}

if (fail(_py_thread__resolve_py_stack(self))) {
emit_invalid_frame();
error = TRUE;

stack_hash = stack_py_hash();
#ifdef NATIVE
stack_hash ^= stack_native_hash();
if (pargs.kernel) {
stack_hash ^= stack_kernel_hash();
}
#endif

if (pargs.binary) {
value_t seen_stack = lru_cache__maybe_hit(self->proc->stack_cache, stack_hash);
if (seen_stack) {
mojo_stack_ref(stack_hash, self->proc->pid, interp_id, self->tid);
goto finish_sample;
} else {
lru_cache__store(self->proc->stack_cache, stack_hash, (value_t)TRUE);
}
}
}

// Group entries by thread.
emit_stack(
stack_hash,
pargs.head_format, self->proc->pid, interp_id, self->tid,
// These are relevant only in `where` mode
is_idle ? "💤" : "🚀",
self->proc->child ? "🧒" : ""
);

if (stack_hash == 0)
// We have no stack to emit.
goto finish_sample;

if (!error && fail(_py_thread__resolve_py_stack(self))) {
emit_invalid_frame();
}

#ifdef NATIVE
Expand Down Expand Up @@ -1036,6 +1055,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
}
#endif

finish_sample:
if (pargs.gc && py_proc__is_gc_collecting(self->proc) == TRUE) {
emit_gc();
stats_gc_time(time_delta);
Expand All @@ -1060,7 +1080,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
stats_count_sample();
if (error) stats_count_error();
stats_check_duration(stopwatch_duration());
} /* py_thread__emit_collapsed_stack */
} /* py_thread__emit_sample */


// ----------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/py_thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ py_thread__next(py_thread_t *);
* @param ssize_t the memory delta.
*/
void
py_thread__emit_collapsed_stack(py_thread_t *, int64_t, ctime_t, ssize_t);
py_thread__emit_sample(py_thread_t *, int64_t, ctime_t, ssize_t);


/**
Expand Down
61 changes: 60 additions & 1 deletion src/stack.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,14 @@ typedef struct {
#endif
} stack_dt;

typedef key_dt stack_hash_t;

static stack_dt * _stack;

// Rotate-left by one bit, used to build order-sensitive stack hashes.
// The argument is fully parenthesized so that compound expressions such
// as ROTL(a ^ b) expand correctly ('^' binds looser than '<<'); use with
// unsigned operands only, since right-shifting a negative value is
// implementation-defined.
#define ROTL(x) (((x) << 1) | ((x) >> (sizeof(x) * CHAR_BIT - 1)))


// ----------------------------------------------------------------------------
static inline int
stack_allocate(size_t size) {
if (isvalid(_stack))
Expand All @@ -73,6 +79,8 @@ stack_allocate(size_t size) {
SUCCESS;
}


// ----------------------------------------------------------------------------
static inline void
stack_deallocate(void) {
if (!isvalid(_stack))
Expand All @@ -89,7 +97,7 @@ stack_deallocate(void) {
}



// ----------------------------------------------------------------------------
static inline int
stack_has_cycle(void) {
if (_stack->pointer < 2)
Expand All @@ -110,6 +118,8 @@ stack_has_cycle(void) {
return FALSE;
}


// ----------------------------------------------------------------------------
static inline void
stack_py_push(void * origin, void * code, int lasti) {
_stack->py_base[_stack->pointer++] = (py_frame_t) {
Expand All @@ -119,6 +129,55 @@ stack_py_push(void * origin, void * code, int lasti) {
};
}


// ----------------------------------------------------------------------------
// Fold the keys of all Python frames currently on the sampled stack into
// a single order-sensitive hash (rotate-left then XOR each frame key).
static inline stack_hash_t
stack_py_hash(void) {
  stack_hash_t hash = 0;
  py_frame_t * frame = _stack->py_base;
  py_frame_t * const end = _stack->py_base + _stack->pointer;

  while (frame < end) {
    hash = ROTL(hash) ^ py_frame_key(frame->code, frame->lasti);
    frame++;
  }

  return hash;
}


#ifdef NATIVE
// ----------------------------------------------------------------------------
// Fold the keys of the sampled native frames into a single
// order-sensitive hash (rotate-left then XOR each frame key).
static inline stack_hash_t
stack_native_hash(void) {
  stack_hash_t hash = 0;

  for (ssize_t n = 0; n < _stack->native_pointer; n++) {
    hash = ROTL(hash) ^ _stack->native_base[n]->key;
  }

  return hash;
}


// ----------------------------------------------------------------------------
static inline stack_hash_t
stack_kernel_hash(void) {
stack_hash_t hash = 0;

for (ssize_t i = 0; i < _stack->kernel_pointer; i++) {
key_dt frame = (key_dt)_stack->kernel_base[i];
hash = ROTL(hash) ^ frame;
}

return hash;
}

#endif


// ----------------------------------------------------------------------------


#define stack_pointer() (_stack->pointer)
#define stack_push(frame) {_stack->base[_stack->pointer++] = frame;}
#define stack_set(i, frame) {_stack->base[i] = frame;}
Expand Down

0 comments on commit be46795

Please sign in to comment.