Skip to content

Commit

Permalink
perf: introduce stack cache
Browse files Browse the repository at this point in the history
We hash stacks so that we can cache them when emitting data in the
binary format.
  • Loading branch information
P403n1x87 committed Apr 9, 2024
1 parent 4559915 commit be46795
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 30 deletions.
3 changes: 2 additions & 1 deletion src/cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ queue_item__destroy(queue_item_t * self, void (*deallocator)(value_t)) {
if (!isvalid(self))
return;

deallocator(self->value);
if (deallocator)
deallocator(self->value);

free(self);
}
Expand Down
4 changes: 2 additions & 2 deletions src/events.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,10 @@
} \
}

#define emit_stack(format, pid, iid, tid, ...) \
#define emit_stack(hash, format, pid, iid, tid, ...) \
{ \
if (pargs.binary) { \
mojo_stack(pid, iid, tid); \
mojo_stack(hash, pid, iid, tid); \
} else { \
fprintfp(pargs.output_file, format, pid, iid, tid, __VA_ARGS__); \
} \
Expand Down
20 changes: 14 additions & 6 deletions src/mojo.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include "cache.h"
#include "platform.h"

#define MOJO_VERSION 3
#define MOJO_VERSION 4

enum {
MOJO_RESERVED,
Expand All @@ -44,7 +44,7 @@ enum {
MOJO_METRIC_TIME,
MOJO_METRIC_MEMORY,
MOJO_STRING,
MOJO_STRING_REF,
MOJO_STACK_REF,
MOJO_MAX,
};

Expand Down Expand Up @@ -119,10 +119,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
mojo_string(label); \
mojo_fstring(__VA_ARGS__);

#define mojo_stack(pid, iid, tid) \
mojo_event(MOJO_STACK); \
mojo_integer(pid, 0); \
mojo_integer(iid, 0); \
// Emit a MOJO_STACK event: the stack's hash key (as a ref), then the
// process id and interpreter id as integers, and the thread id as a
// formatted string. The key lets later samples refer back to this stack
// via mojo_stack_ref instead of re-emitting it.
// NOTE(review): expands to multiple bare statements (no do/while(0)
// wrapper, matching the other mojo_* macros in this header) — must not
// be used as the body of an unbraced if/else.
#define mojo_stack(key, pid, iid, tid) \
mojo_event(MOJO_STACK); \
mojo_ref(key); \
mojo_integer(pid, 0); \
mojo_integer(iid, 0); \
mojo_fstring(FORMAT_TID, tid);

#define mojo_frame(frame) \
Expand Down Expand Up @@ -160,4 +161,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
mojo_event(MOJO_STRING_REF); \
mojo_ref(key);

// Emit a MOJO_STACK_REF event: references a previously-emitted stack by
// its hash key (see the stack cache lookup in py_thread__emit_sample),
// followed by the process id, interpreter id and formatted thread id.
// NOTE(review): expands to multiple bare statements (no do/while(0)
// wrapper, matching the other mojo_* macros in this header) — must not
// be used as the body of an unbraced if/else.
#define mojo_stack_ref(key, pid, iid, tid) \
mojo_event(MOJO_STACK_REF); \
mojo_ref(key); \
mojo_integer(pid, 0); \
mojo_integer(iid, 0); \
mojo_fstring(FORMAT_TID, tid);

#endif
15 changes: 14 additions & 1 deletion src/py_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
#include "py_thread.h"


#define MAX_STACK_CACHE_SIZE (1 << 16) // 64K


// ---- PRIVATE ---------------------------------------------------------------

#define py_proc__memcpy(self, raddr, size, dest) copy_memory(self->proc_ref, raddr, size, dest)
Expand Down Expand Up @@ -708,6 +711,15 @@ py_proc_new(int child) {

py_proc->frames_heap = py_proc->frames = NULL_MEM_BLOCK;

py_proc->stack_cache = lru_cache_new(MAX_STACK_CACHE_SIZE, NULL);
if (!isvalid(py_proc->stack_cache)) {
log_e("Failed to allocate stack cache");
goto error;
}
#ifdef DEBUG
py_proc->stack_cache->name = "stack cache";
#endif

py_proc->frame_cache = lru_cache_new(MAX_FRAME_CACHE_SIZE, (void (*)(value_t)) frame__destroy);
if (!isvalid(py_proc->frame_cache)) {
log_e("Failed to allocate frame cache");
Expand Down Expand Up @@ -1175,7 +1187,7 @@ _py_proc__sample_interpreter(py_proc_t * self, PyInterpreterState * is, ctime_t
}
}

py_thread__emit_collapsed_stack(
py_thread__emit_sample(
&py_thread,
interp_id,
time_delta,
Expand Down Expand Up @@ -1327,6 +1339,7 @@ py_proc__destroy(py_proc_t * self) {

lru_cache__destroy(self->string_cache);
lru_cache__destroy(self->frame_cache);
lru_cache__destroy(self->stack_cache);

free(self);
}
1 change: 1 addition & 0 deletions src/py_proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ typedef struct {

void * is_raddr;

lru_cache_t * stack_cache;
lru_cache_t * frame_cache;
lru_cache_t * string_cache;

Expand Down
56 changes: 38 additions & 18 deletions src/py_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -444,8 +444,6 @@ _py_thread__unwind_iframe_stack(py_thread_t * self, void * iframe_raddr) {
break;
}
}

invalid = fail(_py_thread__resolve_py_stack(self)) || invalid;

return invalid;
}
Expand Down Expand Up @@ -893,7 +891,7 @@ py_thread__next(py_thread_t * self) {

// ----------------------------------------------------------------------------
void
py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
py_thread__emit_sample(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
if (!pargs.full && pargs.memory && mem_delta == 0)
return;

Expand Down Expand Up @@ -922,18 +920,8 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
}
}

// Group entries by thread.
emit_stack(
pargs.head_format, self->proc->pid, interp_id, self->tid,
// These are relevant only in `where` mode
is_idle ? "💤" : "🚀",
self->proc->child ? "🧒" : ""
);

int error = FALSE;

#ifdef NATIVE

// We sample the kernel frame stack BEFORE interrupting because otherwise
// we would see the ptrace syscall call stack, which is not very interesting.
// The downside is that the kernel stack might not be in sync with the other
Expand All @@ -953,6 +941,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t

V_DESC(self->proc->py_v);

stack_hash_t stack_hash = 0;
if (isvalid(self->top_frame)) {
if (V_MIN(3, 11)) {
if (fail(_py_thread__unwind_cframe_stack(self))) {
Expand All @@ -966,11 +955,41 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
error = TRUE;
}
}

if (fail(_py_thread__resolve_py_stack(self))) {
emit_invalid_frame();
error = TRUE;

stack_hash = stack_py_hash();
#ifdef NATIVE
stack_hash ^= stack_native_hash();
if (pargs.kernel) {
stack_hash ^= stack_kernel_hash();
}
#endif

if (pargs.binary) {
value_t seen_stack = lru_cache__maybe_hit(self->proc->stack_cache, stack_hash);
if (seen_stack) {
mojo_stack_ref(stack_hash, self->proc->pid, interp_id, self->tid);
goto finish_sample;
} else {
lru_cache__store(self->proc->stack_cache, stack_hash, (value_t)TRUE);
}
}
}

// Group entries by thread.
emit_stack(
stack_hash,
pargs.head_format, self->proc->pid, interp_id, self->tid,
// These are relevant only in `where` mode
is_idle ? "💤" : "🚀",
self->proc->child ? "🧒" : ""
);

if (stack_hash == 0)
// We have no stack to emit.
goto finish_sample;

if (!error && fail(_py_thread__resolve_py_stack(self))) {
emit_invalid_frame();
}

#ifdef NATIVE
Expand Down Expand Up @@ -1036,6 +1055,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
}
#endif

finish_sample:
if (pargs.gc && py_proc__is_gc_collecting(self->proc) == TRUE) {
emit_gc();
stats_gc_time(time_delta);
Expand All @@ -1060,7 +1080,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
stats_count_sample();
if (error) stats_count_error();
stats_check_duration(stopwatch_duration());
} /* py_thread__emit_collapsed_stack */
} /* py_thread__emit_sample */


// ----------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion src/py_thread.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ py_thread__next(py_thread_t *);
* @param ssize_t the memory delta.
*/
void
py_thread__emit_collapsed_stack(py_thread_t *, int64_t, ctime_t, ssize_t);
py_thread__emit_sample(py_thread_t *, int64_t, ctime_t, ssize_t);


/**
Expand Down
61 changes: 60 additions & 1 deletion src/stack.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,14 @@ typedef struct {
#endif
} stack_dt;

typedef key_dt stack_hash_t;

static stack_dt * _stack;

// Rotate-left by one bit, used to build order-sensitive stack hashes.
// The argument is fully parenthesized so that compound expressions such
// as ROTL(a ^ b) expand correctly ('^' binds looser than '<<'); use with
// unsigned operands only, since right-shifting a negative value is
// implementation-defined.
#define ROTL(x) (((x) << 1) | ((x) >> (sizeof(x) * CHAR_BIT - 1)))


// ----------------------------------------------------------------------------
static inline int
stack_allocate(size_t size) {
if (isvalid(_stack))
Expand All @@ -73,6 +79,8 @@ stack_allocate(size_t size) {
SUCCESS;
}


// ----------------------------------------------------------------------------
static inline void
stack_deallocate(void) {
if (!isvalid(_stack))
Expand All @@ -89,7 +97,7 @@ stack_deallocate(void) {
}



// ----------------------------------------------------------------------------
static inline int
stack_has_cycle(void) {
if (_stack->pointer < 2)
Expand All @@ -110,6 +118,8 @@ stack_has_cycle(void) {
return FALSE;
}


// ----------------------------------------------------------------------------
static inline void
stack_py_push(void * origin, void * code, int lasti) {
_stack->py_base[_stack->pointer++] = (py_frame_t) {
Expand All @@ -119,6 +129,55 @@ stack_py_push(void * origin, void * code, int lasti) {
};
}


// ----------------------------------------------------------------------------
// Fold the keys of all Python frames currently on the sampled stack into
// a single order-sensitive hash (rotate-left then XOR each frame key).
static inline stack_hash_t
stack_py_hash(void) {
  stack_hash_t hash = 0;
  py_frame_t * frame = _stack->py_base;
  py_frame_t * const end = _stack->py_base + _stack->pointer;

  while (frame < end) {
    hash = ROTL(hash) ^ py_frame_key(frame->code, frame->lasti);
    frame++;
  }

  return hash;
}


#ifdef NATIVE
// ----------------------------------------------------------------------------
// Fold the keys of the sampled native frames into a single
// order-sensitive hash (rotate-left then XOR each frame key).
static inline stack_hash_t
stack_native_hash(void) {
  stack_hash_t hash = 0;

  for (ssize_t n = 0; n < _stack->native_pointer; n++) {
    hash = ROTL(hash) ^ _stack->native_base[n]->key;
  }

  return hash;
}


// ----------------------------------------------------------------------------
static inline stack_hash_t
stack_kernel_hash(void) {
stack_hash_t hash = 0;

for (ssize_t i = 0; i < _stack->kernel_pointer; i++) {
key_dt frame = (key_dt)_stack->kernel_base[i];
hash = ROTL(hash) ^ frame;
}

return hash;
}

#endif


// ----------------------------------------------------------------------------


#define stack_pointer() (_stack->pointer)
#define stack_push(frame) {_stack->base[_stack->pointer++] = frame;}
#define stack_set(i, frame) {_stack->base[i] = frame;}
Expand Down

0 comments on commit be46795

Please sign in to comment.