Skip to content

Commit be46795

Browse files
committed
perf: introduce stack cache
We hash stacks so that we can cache them when emitting data in the binary format.
1 parent 4559915 commit be46795

File tree

8 files changed

+132
-30
lines changed

8 files changed

+132
-30
lines changed

src/cache.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ queue_item__destroy(queue_item_t * self, void (*deallocator)(value_t)) {
4848
if (!isvalid(self))
4949
return;
5050

51-
deallocator(self->value);
51+
if (deallocator)
52+
deallocator(self->value);
5253

5354
free(self);
5455
}

src/events.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,10 @@
7575
} \
7676
}
7777

78-
#define emit_stack(format, pid, iid, tid, ...) \
78+
#define emit_stack(hash, format, pid, iid, tid, ...) \
7979
{ \
8080
if (pargs.binary) { \
81-
mojo_stack(pid, iid, tid); \
81+
mojo_stack(hash, pid, iid, tid); \
8282
} else { \
8383
fprintfp(pargs.output_file, format, pid, iid, tid, __VA_ARGS__); \
8484
} \

src/mojo.h

+14-6
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
#include "cache.h"
3030
#include "platform.h"
3131

32-
#define MOJO_VERSION 3
32+
#define MOJO_VERSION 4
3333

3434
enum {
3535
MOJO_RESERVED,
@@ -44,7 +44,7 @@ enum {
4444
MOJO_METRIC_TIME,
4545
MOJO_METRIC_MEMORY,
4646
MOJO_STRING,
47-
MOJO_STRING_REF,
47+
MOJO_STACK_REF,
4848
MOJO_MAX,
4949
};
5050

@@ -119,10 +119,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
119119
mojo_string(label); \
120120
mojo_fstring(__VA_ARGS__);
121121

122-
#define mojo_stack(pid, iid, tid) \
123-
mojo_event(MOJO_STACK); \
124-
mojo_integer(pid, 0); \
125-
mojo_integer(iid, 0); \
122+
#define mojo_stack(key, pid, iid, tid) \
123+
mojo_event(MOJO_STACK); \
124+
mojo_ref(key); \
125+
mojo_integer(pid, 0); \
126+
mojo_integer(iid, 0); \
126127
mojo_fstring(FORMAT_TID, tid);
127128

128129
#define mojo_frame(frame) \
@@ -160,4 +161,11 @@ static inline void mojo_integer(mojo_int_t integer, int sign) {
160161
mojo_event(MOJO_STRING_REF); \
161162
mojo_ref(key);
162163

164+
// Emit a reference to a previously emitted stack instead of the full stack.
// Writes, in order: a MOJO_STACK_REF event, the reference `key`, `pid` and
// `iid` via mojo_integer with sign argument 0, and `tid` formatted with
// FORMAT_TID.
//
// The expansion is several statements that are not wrapped in a block and it
// ends with a semicolon, so it must only be used where a statement sequence
// is valid (e.g. inside braces), never as the sole body of an unbraced
// if/else.
//
// NOTE(review): the enum hunk of this change appears to replace
// MOJO_STRING_REF with MOJO_STACK_REF, while mojo_string_ref still emits
// MOJO_STRING_REF -- confirm that both event codes remain defined.
#define mojo_stack_ref(key, pid, iid, tid) \
165+
mojo_event(MOJO_STACK_REF); \
166+
mojo_ref(key); \
167+
mojo_integer(pid, 0); \
168+
mojo_integer(iid, 0); \
169+
mojo_fstring(FORMAT_TID, tid);
170+
163171
#endif

src/py_proc.c

+14-1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@
5454
#include "py_thread.h"
5555

5656

57+
// Size argument handed to lru_cache_new() when the per-process stack cache is
// created in py_proc_new (1 << 16 == 65536).
#define MAX_STACK_CACHE_SIZE (1 << 16) // 64K
58+
59+
5760
// ---- PRIVATE ---------------------------------------------------------------
5861

5962
#define py_proc__memcpy(self, raddr, size, dest) copy_memory(self->proc_ref, raddr, size, dest)
@@ -708,6 +711,15 @@ py_proc_new(int child) {
708711

709712
py_proc->frames_heap = py_proc->frames = NULL_MEM_BLOCK;
710713

714+
py_proc->stack_cache = lru_cache_new(MAX_STACK_CACHE_SIZE, NULL);
715+
if (!isvalid(py_proc->stack_cache)) {
716+
log_e("Failed to allocate stack cache");
717+
goto error;
718+
}
719+
#ifdef DEBUG
720+
py_proc->stack_cache->name = "stack cache";
721+
#endif
722+
711723
py_proc->frame_cache = lru_cache_new(MAX_FRAME_CACHE_SIZE, (void (*)(value_t)) frame__destroy);
712724
if (!isvalid(py_proc->frame_cache)) {
713725
log_e("Failed to allocate frame cache");
@@ -1175,7 +1187,7 @@ _py_proc__sample_interpreter(py_proc_t * self, PyInterpreterState * is, ctime_t
11751187
}
11761188
}
11771189

1178-
py_thread__emit_collapsed_stack(
1190+
py_thread__emit_sample(
11791191
&py_thread,
11801192
interp_id,
11811193
time_delta,
@@ -1327,6 +1339,7 @@ py_proc__destroy(py_proc_t * self) {
13271339

13281340
lru_cache__destroy(self->string_cache);
13291341
lru_cache__destroy(self->frame_cache);
1342+
lru_cache__destroy(self->stack_cache);
13301343

13311344
free(self);
13321345
}

src/py_proc.h

+1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ typedef struct {
7575

7676
void * is_raddr;
7777

78+
lru_cache_t * stack_cache;
7879
lru_cache_t * frame_cache;
7980
lru_cache_t * string_cache;
8081

src/py_thread.c

+38-18
Original file line numberDiff line numberDiff line change
@@ -444,8 +444,6 @@ _py_thread__unwind_iframe_stack(py_thread_t * self, void * iframe_raddr) {
444444
break;
445445
}
446446
}
447-
448-
invalid = fail(_py_thread__resolve_py_stack(self)) || invalid;
449447

450448
return invalid;
451449
}
@@ -893,7 +891,7 @@ py_thread__next(py_thread_t * self) {
893891

894892
// ----------------------------------------------------------------------------
895893
void
896-
py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
894+
py_thread__emit_sample(py_thread_t * self, int64_t interp_id, ctime_t time_delta, ssize_t mem_delta) {
897895
if (!pargs.full && pargs.memory && mem_delta == 0)
898896
return;
899897

@@ -922,18 +920,8 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
922920
}
923921
}
924922

925-
// Group entries by thread.
926-
emit_stack(
927-
pargs.head_format, self->proc->pid, interp_id, self->tid,
928-
// These are relevant only in `where` mode
929-
is_idle ? "💤" : "🚀",
930-
self->proc->child ? "🧒" : ""
931-
);
932-
933923
int error = FALSE;
934-
935924
#ifdef NATIVE
936-
937925
// We sample the kernel frame stack BEFORE interrupting because otherwise
938926
// we would see the ptrace syscall call stack, which is not very interesting.
939927
// The downside is that the kernel stack might not be in sync with the other
@@ -953,6 +941,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
953941

954942
V_DESC(self->proc->py_v);
955943

944+
stack_hash_t stack_hash = 0;
956945
if (isvalid(self->top_frame)) {
957946
if (V_MIN(3, 11)) {
958947
if (fail(_py_thread__unwind_cframe_stack(self))) {
@@ -966,11 +955,41 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
966955
error = TRUE;
967956
}
968957
}
969-
970-
if (fail(_py_thread__resolve_py_stack(self))) {
971-
emit_invalid_frame();
972-
error = TRUE;
958+
959+
stack_hash = stack_py_hash();
960+
#ifdef NATIVE
961+
stack_hash ^= stack_native_hash();
962+
if (pargs.kernel) {
963+
stack_hash ^= stack_kernel_hash();
973964
}
965+
#endif
966+
967+
if (pargs.binary) {
968+
value_t seen_stack = lru_cache__maybe_hit(self->proc->stack_cache, stack_hash);
969+
if (seen_stack) {
970+
mojo_stack_ref(stack_hash, self->proc->pid, interp_id, self->tid);
971+
goto finish_sample;
972+
} else {
973+
lru_cache__store(self->proc->stack_cache, stack_hash, (value_t)TRUE);
974+
}
975+
}
976+
}
977+
978+
// Group entries by thread.
979+
emit_stack(
980+
stack_hash,
981+
pargs.head_format, self->proc->pid, interp_id, self->tid,
982+
// These are relevant only in `where` mode
983+
is_idle ? "💤" : "🚀",
984+
self->proc->child ? "🧒" : ""
985+
);
986+
987+
if (stack_hash == 0)
988+
// We have no stack to emit.
989+
goto finish_sample;
990+
991+
if (!error && fail(_py_thread__resolve_py_stack(self))) {
992+
emit_invalid_frame();
974993
}
975994

976995
#ifdef NATIVE
@@ -1036,6 +1055,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
10361055
}
10371056
#endif
10381057

1058+
finish_sample:
10391059
if (pargs.gc && py_proc__is_gc_collecting(self->proc) == TRUE) {
10401060
emit_gc();
10411061
stats_gc_time(time_delta);
@@ -1060,7 +1080,7 @@ py_thread__emit_collapsed_stack(py_thread_t * self, int64_t interp_id, ctime_t t
10601080
stats_count_sample();
10611081
if (error) stats_count_error();
10621082
stats_check_duration(stopwatch_duration());
1063-
} /* py_thread__emit_collapsed_stack */
1083+
} /* py_thread__emit_sample */
10641084

10651085

10661086
// ----------------------------------------------------------------------------

src/py_thread.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ py_thread__next(py_thread_t *);
9797
* @param ssize_t the memory delta.
9898
*/
9999
void
100-
py_thread__emit_collapsed_stack(py_thread_t *, int64_t, ctime_t, ssize_t);
100+
py_thread__emit_sample(py_thread_t *, int64_t, ctime_t, ssize_t);
101101

102102

103103
/**

src/stack.h

+60-1
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,14 @@ typedef struct {
5151
#endif
5252
} stack_dt;
5353

54+
// Type of the value produced by the stack_*_hash() helpers. It is an alias of
// key_dt; the sampling code passes it as the key to the stack LRU cache.
typedef key_dt stack_hash_t;
55+
5456
static stack_dt * _stack;
5557

58+
// Rotate x left by one bit across the full width of its type.
//
// Every use of the argument is parenthesised so that compound expressions
// such as ROTL(a | b) expand with the intended precedence. x is evaluated
// more than once, so it must be free of side effects. x should have an
// unsigned integer type at least as wide as int; narrower types are promoted
// to int before shifting and would not rotate within their own width.
#define ROTL(x) (((x) << 1) | ((x) >> (sizeof(x) * CHAR_BIT - 1)))
59+
60+
61+
// ----------------------------------------------------------------------------
5662
static inline int
5763
stack_allocate(size_t size) {
5864
if (isvalid(_stack))
@@ -73,6 +79,8 @@ stack_allocate(size_t size) {
7379
SUCCESS;
7480
}
7581

82+
83+
// ----------------------------------------------------------------------------
7684
static inline void
7785
stack_deallocate(void) {
7886
if (!isvalid(_stack))
@@ -89,7 +97,7 @@ stack_deallocate(void) {
8997
}
9098

9199

92-
100+
// ----------------------------------------------------------------------------
93101
static inline int
94102
stack_has_cycle(void) {
95103
if (_stack->pointer < 2)
@@ -110,6 +118,8 @@ stack_has_cycle(void) {
110118
return FALSE;
111119
}
112120

121+
122+
// ----------------------------------------------------------------------------
113123
static inline void
114124
stack_py_push(void * origin, void * code, int lasti) {
115125
_stack->py_base[_stack->pointer++] = (py_frame_t) {
@@ -119,6 +129,55 @@ stack_py_push(void * origin, void * code, int lasti) {
119129
};
120130
}
121131

132+
133+
// ----------------------------------------------------------------------------
134+
// Compute a hash of the Python frames currently held in the global _stack.
//
// Walks py_base from index 0 up to (but excluding) _stack->pointer and folds
// each frame's key, derived from its code address and lasti through
// py_frame_key(), into the running hash.
//
// Returns 0 when the stack holds no Python frames (the loop body never runs).
// NOTE(review): the sampling code treats a combined hash of 0 as "no stack to
// emit"; a non-empty stack could in principle also hash to 0.
static inline stack_hash_t
135+
stack_py_hash(void) {
136+
stack_hash_t hash = 0;
137+
138+
for (ssize_t i = 0; i < _stack->pointer; i++) {
139+
py_frame_t * frame = _stack->py_base+i;
140+
// Rotate the accumulator by one bit before XOR-ing in the next frame key so
// that the position of a frame in the stack influences the result.
hash = ROTL(hash) ^ py_frame_key(frame->code, frame->lasti);
141+
}
142+
143+
return hash;
144+
}
145+
146+
147+
#ifdef NATIVE
148+
// ----------------------------------------------------------------------------
149+
// Compute a hash of the native frames currently held in the global _stack.
//
// Walks native_base from index 0 up to (but excluding)
// _stack->native_pointer and folds each frame's `key` field into the running
// hash. Returns 0 when there are no native frames.
static inline stack_hash_t
150+
stack_native_hash(void) {
151+
stack_hash_t hash = 0;
152+
153+
for (ssize_t i = 0; i < _stack->native_pointer; i++) {
154+
// NOTE(review): the entry is dereferenced without a NULL check; this assumes
// every slot below native_pointer holds a valid frame -- confirm.
frame_t * frame = _stack->native_base[i];
155+
// Rotate-then-XOR, same folding scheme as the other stack hash helpers.
hash = ROTL(hash) ^ frame->key;
156+
}
157+
158+
return hash;
159+
}
160+
161+
162+
// ----------------------------------------------------------------------------
163+
// Compute a hash of the kernel frames currently held in the global _stack.
//
// Walks kernel_base from index 0 up to (but excluding)
// _stack->kernel_pointer and folds each entry into the running hash. Returns
// 0 when there are no kernel frames.
static inline stack_hash_t
164+
stack_kernel_hash(void) {
165+
stack_hash_t hash = 0;
166+
167+
for (ssize_t i = 0; i < _stack->kernel_pointer; i++) {
168+
// The entry value itself is cast to key_dt and used as the frame key.
// NOTE(review): if entries are pointers, this hashes the pointer value, not
// the data it points to -- confirm that this is the intended identity.
key_dt frame = (key_dt)_stack->kernel_base[i];
169+
hash = ROTL(hash) ^ frame;
170+
}
171+
172+
return hash;
173+
}
174+
175+
#endif
176+
177+
178+
// ----------------------------------------------------------------------------
179+
180+
122181
#define stack_pointer() (_stack->pointer)
123182
#define stack_push(frame) {_stack->base[_stack->pointer++] = frame;}
124183
#define stack_set(i, frame) {_stack->base[i] = frame;}

0 commit comments

Comments
 (0)