From 263d51d4e349a15852ba687dc96f2507118d9e80 Mon Sep 17 00:00:00 2001
From: "Gabriele N. Tornetta"
Date: Tue, 9 Apr 2024 20:16:02 +0100
Subject: [PATCH] perf: introduce stack cache

We hash stacks so that we can cache them when emitting data in the
binary format. When a stack has been seen before, we emit a reference
to its hash (MOJO_STACK_REF) instead of the full sequence of frames.
This requires bumping the MOJO format version to 4.
---
 src/cache.c     |  3 ++-
 src/events.h    |  4 ++--
 src/mojo.h      | 19 ++++++++++++++-----
 src/py_proc.c   | 15 ++++++++++++++-
 src/py_proc.h   |  1 +
 src/py_thread.c | 56 ++++++++++++++++++++++++++++++++------------------
 src/py_thread.h |  2 +-
 src/stack.h     | 62 +++++++++++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 133 insertions(+), 29 deletions(-)

diff --git a/src/cache.c b/src/cache.c
index 09c0469b..6ed1f59a 100644
--- a/src/cache.c
+++ b/src/cache.c
@@ -48,7 +48,8 @@ queue_item__destroy(queue_item_t * self, void (*deallocator)(value_t)) {
   if (!isvalid(self))
     return;
 
-  deallocator(self->value);
+  if (deallocator)
+    deallocator(self->value);
 
   free(self);
 }

diff --git a/src/events.h b/src/events.h
index 4ef9f930..5ae75d1d 100644
--- a/src/events.h
+++ b/src/events.h
@@ -75,10 +75,10 @@
     } \
   }
 
-#define emit_stack(format, pid, iid, tid, ...) \
+#define emit_stack(hash, format, pid, iid, tid, ...) \
   { \
     if (pargs.binary) { \
-      mojo_stack(pid, iid, tid); \
+      mojo_stack(hash, pid, iid, tid); \
     } else { \
       fprintfp(pargs.output_file, format, pid, iid, tid, __VA_ARGS__); \
     } \

diff --git a/src/mojo.h b/src/mojo.h
index d4fc0626..3cf8133a 100644
--- a/src/mojo.h
+++ b/src/mojo.h
@@ -29,7 +29,7 @@
 #include "cache.h"
 #include "platform.h"
 
-#define MOJO_VERSION 3
+#define MOJO_VERSION 4
 
 enum {
   MOJO_RESERVED,
@@ -44,7 +44,8 @@ enum {
   MOJO_METRIC_TIME,
   MOJO_METRIC_MEMORY,
   MOJO_STRING,
   MOJO_STRING_REF,
+  MOJO_STACK_REF,
   MOJO_MAX,
 };
 
@@ -119,10 +120,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
   mojo_string(label); \
   mojo_fstring(__VA_ARGS__);
 
-#define mojo_stack(pid, iid, tid) \
-  mojo_event(MOJO_STACK); \
-  mojo_integer(pid, 0); \
-  mojo_integer(iid, 0); \
+#define mojo_stack(key, pid, iid, tid) \
+  mojo_event(MOJO_STACK); \
+  mojo_ref(key); \
+  mojo_integer(pid, 0); \
+  mojo_integer(iid, 0); \
   mojo_fstring(FORMAT_TID, tid);
 
 #define mojo_frame(frame) \
@@ -160,4 +162,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
   mojo_event(MOJO_STRING_REF); \
   mojo_ref(key);
 
+#define mojo_stack_ref(key, pid, iid, tid) \
+  mojo_event(MOJO_STACK_REF); \
+  mojo_ref(key); \
+  mojo_integer(pid, 0); \
+  mojo_integer(iid, 0); \
+  mojo_fstring(FORMAT_TID, tid);
+
 #endif

diff --git a/src/py_proc.c b/src/py_proc.c
index 9951dd6d..b1a4dee2 100644
--- a/src/py_proc.c
+++ b/src/py_proc.c
@@ -54,6 +54,9 @@
 
 #include "py_thread.h"
 
+#define MAX_STACK_CACHE_SIZE (1 << 16) // 64K
+
+
 // ---- PRIVATE ---------------------------------------------------------------
 
 #define py_proc__memcpy(self, raddr, size, dest) copy_memory(self->proc_ref, raddr, size, dest)
@@ -708,6 +711,15 @@ py_proc_new(int child) {
 
   py_proc->frames_heap = py_proc->frames = NULL_MEM_BLOCK;
 
+  py_proc->stack_cache = lru_cache_new(MAX_STACK_CACHE_SIZE, NULL);
+  if (!isvalid(py_proc->stack_cache)) {
+    log_e("Failed to allocate stack cache");
+    goto error;
+  }
+  #ifdef DEBUG
+  py_proc->stack_cache->name = "stack cache";
+  #endif
+
   py_proc->frame_cache = lru_cache_new(MAX_FRAME_CACHE_SIZE, (void (*)(value_t)) frame__destroy);
   if (!isvalid(py_proc->frame_cache)) {
     log_e("Failed to allocate frame cache");
@@ -1175,7 +1187,7 @@ _py_proc__sample_interpreter(py_proc_t * self, PyInterpreterState * is, ctime_t
     }
   }
 
-  py_thread__emit_collapsed_stack(
+  py_thread__emit_sample(
     &py_thread,
     interp_id,
     time_delta,
@@ -1327,6 +1339,7 @@ py_proc__destroy(py_proc_t * self) {
 
   lru_cache__destroy(self->string_cache);
   lru_cache__destroy(self->frame_cache);
+  lru_cache__destroy(self->stack_cache);
 
   free(self);
 }

diff --git a/src/py_proc.h b/src/py_proc.h
index 56afdd75..01bc1917 100644
--- a/src/py_proc.h
+++ b/src/py_proc.h
@@ -75,6 +75,7 @@ typedef struct {
 
   void * is_raddr;
 
+  lru_cache_t * stack_cache;
   lru_cache_t * frame_cache;
   lru_cache_t * string_cache;
 

diff --git a/src/py_thread.c b/src/py_thread.c
index 196e997b..ff9d064c 100644
--- a/src/py_thread.c
+++ b/src/py_thread.c
@@ -444,8 +444,6 @@ _py_thread__unwind_iframe_stack(py_thread_t * self, void * iframe_raddr) {
       break;
     }
   }
-
-  invalid = fail(_py_thread__resolve_py_stack(self)) || invalid;
 
   return invalid;
 }
@@ -893,7 +891,7 @@ py_thread__next(py_thread_t * self) {
 
 // ----------------------------------------------------------------------------
 void
-py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
+py_thread__emit_sample(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
   if (!pargs.full && pargs.memory && mem_delta == 0)
     return;
 
@@ -922,18 +920,8 @@
     }
   }
 
-  // Group entries by thread.
-  emit_stack(
-    pargs.head_format, self->proc->pid, interp_id, self->tid,
-    // These are relevant only in `where` mode
-    is_idle ? "💤" : "🚀",
-    self->proc->child ? "🧒" : ""
-  );
-
   int error = FALSE;
-
   #ifdef NATIVE
-
   // We sample the kernel frame stack BEFORE interrupting because otherwise
   // we would see the ptrace syscall call stack, which is not very interesting.
   // The downside is that the kernel stack might not be in sync with the other
@@ -953,6 +941,7 @@
 
   V_DESC(self->proc->py_v);
 
+  stack_hash_t stack_hash = 0;
   if (isvalid(self->top_frame)) {
     if (V_MIN(3, 11)) {
       if (fail(_py_thread__unwind_cframe_stack(self))) {
@@ -966,11 +955,41 @@
         error = TRUE;
       }
     }
-
-    if (fail(_py_thread__resolve_py_stack(self))) {
-      emit_invalid_frame();
-      error = TRUE;
+
+    stack_hash = stack_py_hash();
+    #ifdef NATIVE
+    stack_hash ^= stack_native_hash();
+    if (pargs.kernel) {
+      stack_hash ^= stack_kernel_hash();
     }
+    #endif
+
+    if (pargs.binary) {
+      value_t seen_stack = lru_cache__maybe_hit(self->proc->stack_cache, stack_hash);
+      if (seen_stack) {
+        mojo_stack_ref(stack_hash, self->proc->pid, interp_id, self->tid);
+        goto finish_sample;
+      } else {
+        lru_cache__store(self->proc->stack_cache, stack_hash, (value_t)TRUE);
+      }
+    }
+  }
+
+  // Group entries by thread.
+  emit_stack(
+    stack_hash,
+    pargs.head_format, self->proc->pid, interp_id, self->tid,
+    // These are relevant only in `where` mode
+    is_idle ? "💤" : "🚀",
+    self->proc->child ? "🧒" : ""
+  );
+
+  if (stack_hash == 0)
+    // We have no stack to emit.
+    goto finish_sample;
+
+  if (!error && fail(_py_thread__resolve_py_stack(self))) {
+    emit_invalid_frame();
   }
 
   #ifdef NATIVE
@@ -1036,6 +1055,7 @@
     }
   #endif
 
+finish_sample:
   if (pargs.gc && py_proc__is_gc_collecting(self->proc) == TRUE) {
     emit_gc();
     stats_gc_time(time_delta);
@@ -1060,7 +1080,7 @@
   stats_count_sample();
   if (error) stats_count_error();
   stats_check_duration(stopwatch_duration());
-} /* py_thread__emit_collapsed_stack */
+} /* py_thread__emit_sample */
 
 
 // ----------------------------------------------------------------------------

diff --git a/src/py_thread.h b/src/py_thread.h
index 11c23690..4832d6af 100644
--- a/src/py_thread.h
+++ b/src/py_thread.h
@@ -97,7 +97,7 @@ py_thread__next(py_thread_t *);
  * @param ssize_t the memory delta.
  */
 void
-py_thread__emit_collapsed_stack(py_thread_t *, int64_t, ctime_t, ssize_t);
+py_thread__emit_sample(py_thread_t *, int64_t, ctime_t, ssize_t);
 
 
 /**

diff --git a/src/stack.h b/src/stack.h
index 036dbf9d..cea2443e 100644
--- a/src/stack.h
+++ b/src/stack.h
@@ -23,6 +23,7 @@
 #ifndef STACK_H
 #define STACK_H
 
+#include <limits.h>
 #include <stdint.h>
 #include <stdlib.h>
 
@@ -51,8 +52,14 @@ typedef struct {
   #endif
 } stack_dt;
 
+typedef key_dt stack_hash_t;
+
 static stack_dt * _stack;
 
+#define ROTL(x) ((x << 1) | (x >> (sizeof(x) * CHAR_BIT - 1)))
+
+
+// ----------------------------------------------------------------------------
 static inline int
 stack_allocate(size_t size) {
   if (isvalid(_stack))
@@ -73,6 +80,8 @@ stack_allocate(size_t size) {
   SUCCESS;
 }
 
+
+// ----------------------------------------------------------------------------
 static inline void
 stack_deallocate(void) {
   if (!isvalid(_stack))
@@ -89,7 +98,7 @@ stack_deallocate(void) {
 }
 
 
-
+// ----------------------------------------------------------------------------
 static inline int
 stack_has_cycle(void) {
   if (_stack->pointer < 2)
@@ -110,6 +119,8 @@ stack_has_cycle(void) {
   return FALSE;
 }
 
+
+// ----------------------------------------------------------------------------
 static inline void
 stack_py_push(void * origin, void * code, int lasti) {
   _stack->py_base[_stack->pointer++] = (py_frame_t) {
@@ -119,6 +130,55 @@ stack_py_push(void * origin, void * code, int lasti) {
   };
 }
 
+
+// ----------------------------------------------------------------------------
+static inline stack_hash_t
+stack_py_hash(void) {
+  stack_hash_t hash = 0;
+
+  for (ssize_t i = 0; i < _stack->pointer; i++) {
+    py_frame_t * frame = _stack->py_base + i;
+    hash = ROTL(hash) ^ py_frame_key(frame->code, frame->lasti);
+  }
+
+  return hash;
+}
+
+
+#ifdef NATIVE
+// ----------------------------------------------------------------------------
+static inline stack_hash_t
+stack_native_hash(void) {
+  stack_hash_t hash = 0;
+
+  for (ssize_t i = 0; i < _stack->native_pointer; i++) {
+    frame_t * frame = _stack->native_base[i];
+    hash = ROTL(hash) ^ frame->key;
+  }
+
+  return hash;
+}
+
+
+// ----------------------------------------------------------------------------
+static inline stack_hash_t
+stack_kernel_hash(void) {
+  stack_hash_t hash = 0;
+
+  for (ssize_t i = 0; i < _stack->kernel_pointer; i++) {
+    key_dt frame = (key_dt) _stack->kernel_base[i];
+    hash = ROTL(hash) ^ frame;
+  }
+
+  return hash;
+}
+
+#endif
+
+
+// ----------------------------------------------------------------------------
+
+
 #define stack_pointer() (_stack->pointer)
 #define stack_push(frame) {_stack->base[_stack->pointer++] = frame;}
 #define stack_set(i, frame) {_stack->base[i] = frame;}
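
Note on the hashing scheme: a stack hash is computed by folding the per-frame
keys with a rotate-left-and-XOR pass (the ROTL macro added to src/stack.h);
the Python, native and kernel components are then combined with a plain XOR.
Unlike a pure XOR fold, the rotation makes the hash order-sensitive, so two
stacks containing the same frames in a different order hash differently. The
standalone sketch below illustrates the scheme in isolation; the frame keys
here are made-up values, and hash_stack is a hypothetical helper, not the
py_frame_key-based derivation Austin actually uses:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t stack_hash_t;

/* Rotate left by one bit, same shape as the ROTL in src/stack.h. */
#define ROTL(x) ((x << 1) | (x >> (sizeof(x) * CHAR_BIT - 1)))

/* Fold a sequence of frame keys into a single stack hash. */
static stack_hash_t
hash_stack(const stack_hash_t * keys, size_t count) {
  stack_hash_t hash = 0;

  for (size_t i = 0; i < count; i++)
    hash = ROTL(hash) ^ keys[i];

  return hash;
}

int
main(void) {
  /* Hypothetical frame keys, for illustration only. */
  stack_hash_t ab[] = {0xAAAA, 0xBBBB};
  stack_hash_t ba[] = {0xBBBB, 0xAAAA};

  /* Same frames in a different order produce different hashes, so the
     two stacks get distinct entries in the LRU stack cache. */
  printf("%llx\n", (unsigned long long) hash_stack(ab, 2));
  printf("%llx\n", (unsigned long long) hash_stack(ba, 2));

  return 0;
}

The hash doubles as the cache key: on a hit, the emitter writes a compact
MOJO_STACK_REF event referencing the hash instead of re-emitting every frame,
which is what makes binary-mode output cheaper for hot, repeated stacks.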