From 263d51d4e349a15852ba687dc96f2507118d9e80 Mon Sep 17 00:00:00 2001
From: "Gabriele N. Tornetta"
Date: Tue, 9 Apr 2024 20:16:02 +0100
Subject: [PATCH] perf: introduce stack cache

We hash stacks so that we can cache them when emitting data in the
binary format. When a stack has been seen before, we emit a reference
to its hash (MOJO_STACK_REF) instead of the full sequence of frames.
This requires bumping the MOJO format version to 4.
---
 src/cache.c     |  3 ++-
 src/events.h    |  4 ++--
 src/mojo.h      | 19 ++++++++++++++-----
 src/py_proc.c   | 15 ++++++++++++++-
 src/py_proc.h   |  1 +
 src/py_thread.c | 56 ++++++++++++++++++++++++++++++++------------------
 src/py_thread.h |  2 +-
 src/stack.h     | 62 +++++++++++++++++++++++++++++++++++++++++++++++++-
 8 files changed, 133 insertions(+), 29 deletions(-)

diff --git a/src/cache.c b/src/cache.c
index 09c0469b..6ed1f59a 100644
--- a/src/cache.c
+++ b/src/cache.c
@@ -48,7 +48,8 @@ queue_item__destroy(queue_item_t * self, void (*deallocator)(value_t)) {
   if (!isvalid(self))
     return;
 
-  deallocator(self->value);
+  if (deallocator)
+    deallocator(self->value);
 
   free(self);
 }

diff --git a/src/events.h b/src/events.h
index 4ef9f930..5ae75d1d 100644
--- a/src/events.h
+++ b/src/events.h
@@ -75,10 +75,10 @@
     } \
   }
 
-#define emit_stack(format, pid, iid, tid, ...) \
+#define emit_stack(hash, format, pid, iid, tid, ...) \
   { \
     if (pargs.binary) { \
-      mojo_stack(pid, iid, tid); \
+      mojo_stack(hash, pid, iid, tid); \
     } else { \
       fprintfp(pargs.output_file, format, pid, iid, tid, __VA_ARGS__); \
     } \

diff --git a/src/mojo.h b/src/mojo.h
index d4fc0626..3cf8133a 100644
--- a/src/mojo.h
+++ b/src/mojo.h
@@ -29,7 +29,7 @@
 #include "cache.h"
 #include "platform.h"
 
-#define MOJO_VERSION 3
+#define MOJO_VERSION 4
 
 enum {
   MOJO_RESERVED,
@@ -44,7 +44,8 @@ enum {
   MOJO_METRIC_TIME,
   MOJO_METRIC_MEMORY,
   MOJO_STRING,
   MOJO_STRING_REF,
+  MOJO_STACK_REF,
   MOJO_MAX,
 };
 
@@ -119,10 +120,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
   mojo_string(label); \
   mojo_fstring(__VA_ARGS__);
 
-#define mojo_stack(pid, iid, tid) \
-  mojo_event(MOJO_STACK); \
-  mojo_integer(pid, 0); \
-  mojo_integer(iid, 0); \
+#define mojo_stack(key, pid, iid, tid) \
+  mojo_event(MOJO_STACK); \
+  mojo_ref(key); \
+  mojo_integer(pid, 0); \
+  mojo_integer(iid, 0); \
   mojo_fstring(FORMAT_TID, tid);
 
 #define mojo_frame(frame) \
@@ -160,4 +162,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
   mojo_event(MOJO_STRING_REF); \
   mojo_ref(key);
 
+#define mojo_stack_ref(key, pid, iid, tid) \
+  mojo_event(MOJO_STACK_REF); \
+  mojo_ref(key); \
+  mojo_integer(pid, 0); \
+  mojo_integer(iid, 0); \
+  mojo_fstring(FORMAT_TID, tid);
+
 #endif

diff --git a/src/py_proc.c b/src/py_proc.c
index 9951dd6d..b1a4dee2 100644
--- a/src/py_proc.c
+++ b/src/py_proc.c
@@ -54,6 +54,9 @@
 
 #include "py_thread.h"
 
+#define MAX_STACK_CACHE_SIZE (1 << 16) // 64K
+
+
 // ---- PRIVATE ---------------------------------------------------------------
 
 #define py_proc__memcpy(self, raddr, size, dest) copy_memory(self->proc_ref, raddr, size, dest)
@@ -708,6 +711,15 @@ py_proc_new(int child) {
 
   py_proc->frames_heap = py_proc->frames = NULL_MEM_BLOCK;
 
+  py_proc->stack_cache = lru_cache_new(MAX_STACK_CACHE_SIZE, NULL);
+  if (!isvalid(py_proc->stack_cache)) {
+    log_e("Failed to allocate stack cache");
+    goto error;
+  }
+  #ifdef DEBUG
+  py_proc->stack_cache->name = "stack cache";
+  #endif
+
   py_proc->frame_cache = lru_cache_new(MAX_FRAME_CACHE_SIZE, (void (*)(value_t)) frame__destroy);
   if (!isvalid(py_proc->frame_cache)) {
     log_e("Failed to allocate frame cache");
@@ -1175,7 +1187,7 @@ _py_proc__sample_interpreter(py_proc_t * self, PyInterpreterState * is, ctime_t
     }
   }
 
-  py_thread__emit_collapsed_stack(
+  py_thread__emit_sample(
     &py_thread,
     interp_id,
     time_delta,
@@ -1327,6 +1339,7 @@ py_proc__destroy(py_proc_t * self) {
 
   lru_cache__destroy(self->string_cache);
   lru_cache__destroy(self->frame_cache);
+  lru_cache__destroy(self->stack_cache);
 
   free(self);
 }

diff --git a/src/py_proc.h b/src/py_proc.h
index 56afdd75..01bc1917 100644
--- a/src/py_proc.h
+++ b/src/py_proc.h
@@ -75,6 +75,7 @@ typedef struct {
 
   void * is_raddr;
 
+  lru_cache_t * stack_cache;
   lru_cache_t * frame_cache;
   lru_cache_t * string_cache;
 

diff --git a/src/py_thread.c b/src/py_thread.c
index 196e997b..ff9d064c 100644
--- a/src/py_thread.c
+++ b/src/py_thread.c
@@ -444,8 +444,6 @@ _py_thread__unwind_iframe_stack(py_thread_t * self, void * iframe_raddr) {
       break;
     }
   }
-
-  invalid = fail(_py_thread__resolve_py_stack(self)) || invalid;
 
   return invalid;
 }
@@ -893,7 +891,7 @@ py_thread__next(py_thread_t * self) {
 
 // ----------------------------------------------------------------------------
 void
-py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
+py_thread__emit_sample(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
   if (!pargs.full && pargs.memory && mem_delta == 0)
     return;
 
@@ -922,18 +920,8 @@
     }
   }
 
-  // Group entries by thread.
-  emit_stack(
-    pargs.head_format, self->proc->pid, interp_id, self->tid,
-    // These are relevant only in `where` mode
-    is_idle ? "💤" : "🚀",
-    self->proc->child ? "🧒" : ""
-  );
-
   int error = FALSE;
-
   #ifdef NATIVE
-
   // We sample the kernel frame stack BEFORE interrupting because otherwise
   // we would see the ptrace syscall call stack, which is not very interesting.
   // The downside is that the kernel stack might not be in sync with the other
@@ -953,6 +941,7 @@
 
   V_DESC(self->proc->py_v);
 
+  stack_hash_t stack_hash = 0;
   if (isvalid(self->top_frame)) {
     if (V_MIN(3, 11)) {
       if (fail(_py_thread__unwind_cframe_stack(self))) {
@@ -966,11 +955,41 @@
         error = TRUE;
       }
     }
-
-    if (fail(_py_thread__resolve_py_stack(self))) {
-      emit_invalid_frame();
-      error = TRUE;
+
+    stack_hash = stack_py_hash();
+    #ifdef NATIVE
+    stack_hash ^= stack_native_hash();
+    if (pargs.kernel) {
+      stack_hash ^= stack_kernel_hash();
     }
+    #endif
+
+    if (pargs.binary) {
+      value_t seen_stack = lru_cache__maybe_hit(self->proc->stack_cache, stack_hash);
+      if (seen_stack) {
+        mojo_stack_ref(stack_hash, self->proc->pid, interp_id, self->tid);
+        goto finish_sample;
+      } else {
+        lru_cache__store(self->proc->stack_cache, stack_hash, (value_t)TRUE);
+      }
+    }
+  }
+
+  // Group entries by thread.
+  emit_stack(
+    stack_hash,
+    pargs.head_format, self->proc->pid, interp_id, self->tid,
+    // These are relevant only in `where` mode
+    is_idle ? "💤" : "🚀",
+    self->proc->child ? "🧒" : ""
+  );
+
+  if (stack_hash == 0)
+    // We have no stack to emit.
+    goto finish_sample;
+
+  if (!error && fail(_py_thread__resolve_py_stack(self))) {
+    emit_invalid_frame();
   }
 
   #ifdef NATIVE
@@ -1036,6 +1055,7 @@
     }
   #endif
 
+finish_sample:
   if (pargs.gc && py_proc__is_gc_collecting(self->proc) == TRUE) {
     emit_gc();
     stats_gc_time(time_delta);
@@ -1060,7 +1080,7 @@
   stats_count_sample();
   if (error) stats_count_error();
   stats_check_duration(stopwatch_duration());
-} /* py_thread__emit_collapsed_stack */
+} /* py_thread__emit_sample */
 
 
 // ----------------------------------------------------------------------------

diff --git a/src/py_thread.h b/src/py_thread.h
index 11c23690..4832d6af 100644
--- a/src/py_thread.h
+++ b/src/py_thread.h
@@ -97,7 +97,7 @@ py_thread__next(py_thread_t *);
  * @param ssize_t the memory delta.
  */
 void
-py_thread__emit_collapsed_stack(py_thread_t *, int64_t, ctime_t, ssize_t);
+py_thread__emit_sample(py_thread_t *, int64_t, ctime_t, ssize_t);
 
 
 /**

diff --git a/src/stack.h b/src/stack.h
index 036dbf9d..cea2443e 100644
--- a/src/stack.h
+++ b/src/stack.h
@@ -23,6 +23,7 @@
 #ifndef STACK_H
 #define STACK_H
 
+#include <limits.h>
 #include <stdint.h>
 #include <stdlib.h>
 
@@ -51,8 +52,14 @@ typedef struct {
   #endif
 } stack_dt;
 
+typedef key_dt stack_hash_t;
+
 static stack_dt * _stack;
 
+#define ROTL(x) ((x << 1) | (x >> (sizeof(x) * CHAR_BIT - 1)))
+
+
+// ----------------------------------------------------------------------------
 static inline int
 stack_allocate(size_t size) {
   if (isvalid(_stack))
@@ -73,6 +80,8 @@ stack_allocate(size_t size) {
   SUCCESS;
 }
 
+
+// ----------------------------------------------------------------------------
 static inline void
 stack_deallocate(void) {
   if (!isvalid(_stack))
@@ -89,7 +98,7 @@ stack_deallocate(void) {
 }
 
 
-
+// ----------------------------------------------------------------------------
 static inline int
 stack_has_cycle(void) {
   if (_stack->pointer < 2)
@@ -110,6 +119,8 @@ stack_has_cycle(void) {
   return FALSE;
 }
 
+
+// ----------------------------------------------------------------------------
 static inline void
 stack_py_push(void * origin, void * code, int lasti) {
   _stack->py_base[_stack->pointer++] = (py_frame_t) {
@@ -119,6 +130,55 @@ stack_py_push(void * origin, void * code, int lasti) {
   };
 }
 
+
+// ----------------------------------------------------------------------------
+static inline stack_hash_t
+stack_py_hash(void) {
+  stack_hash_t hash = 0;
+
+  for (ssize_t i = 0; i < _stack->pointer; i++) {
+    py_frame_t * frame = _stack->py_base + i;
+    hash = ROTL(hash) ^ py_frame_key(frame->code, frame->lasti);
+  }
+
+  return hash;
+}
+
+
+#ifdef NATIVE
+// ----------------------------------------------------------------------------
+static inline stack_hash_t
+stack_native_hash(void) {
+  stack_hash_t hash = 0;
+
+  for (ssize_t i = 0; i < _stack->native_pointer; i++) {
+    frame_t * frame = _stack->native_base[i];
+    hash = ROTL(hash) ^ frame->key;
+  }
+
+  return hash;
+}
+
+
+// ----------------------------------------------------------------------------
+static inline stack_hash_t
+stack_kernel_hash(void) {
+  stack_hash_t hash = 0;
+
+  for (ssize_t i = 0; i < _stack->kernel_pointer; i++) {
+    key_dt frame = (key_dt) _stack->kernel_base[i];
+    hash = ROTL(hash) ^ frame;
+  }
+
+  return hash;
+}
+
+#endif
+
+
+// ----------------------------------------------------------------------------
+
+
 #define stack_pointer() (_stack->pointer)
 #define stack_push(frame) {_stack->base[_stack->pointer++] = frame;}
 #define stack_set(i, frame) {_stack->base[i] = frame;}
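
Note on the hashing scheme: a stack hash is computed by folding the per-frame
keys with a rotate-left-and-XOR pass (the ROTL macro added to src/stack.h);
the Python, native and kernel components are then combined with a plain XOR.
Unlike a pure XOR fold, the rotation makes the hash order-sensitive, so two
stacks containing the same frames in a different order hash differently. The
standalone sketch below illustrates the scheme in isolation; the frame keys
here are made-up values, and hash_stack is a hypothetical helper, not the
py_frame_key-based derivation Austin actually uses:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t stack_hash_t;

/* Rotate left by one bit, same shape as the ROTL in src/stack.h. */
#define ROTL(x) ((x << 1) | (x >> (sizeof(x) * CHAR_BIT - 1)))

/* Fold a sequence of frame keys into a single stack hash. */
static stack_hash_t
hash_stack(const stack_hash_t * keys, size_t count) {
  stack_hash_t hash = 0;

  for (size_t i = 0; i < count; i++)
    hash = ROTL(hash) ^ keys[i];

  return hash;
}

int
main(void) {
  /* Hypothetical frame keys, for illustration only. */
  stack_hash_t ab[] = {0xAAAA, 0xBBBB};
  stack_hash_t ba[] = {0xBBBB, 0xAAAA};

  /* Same frames in a different order produce different hashes, so the
     two stacks get distinct entries in the LRU stack cache. */
  printf("%llx\n", (unsigned long long) hash_stack(ab, 2));
  printf("%llx\n", (unsigned long long) hash_stack(ba, 2));

  return 0;
}

The hash doubles as the cache key: on a hit, the emitter writes a compact
MOJO_STACK_REF event referencing the hash instead of re-emitting every frame,
which is what makes binary-mode output cheaper for hot, repeated stacks.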