diff --git a/ddtrace/profiling/collector/_memalloc.c b/ddtrace/profiling/collector/_memalloc.c
index 3b7f7db293f..3876517baaf 100644
--- a/ddtrace/profiling/collector/_memalloc.c
+++ b/ddtrace/profiling/collector/_memalloc.c
@@ -42,47 +42,95 @@ static PyObject* object_string = NULL;
 
 #define ALLOC_TRACKER_MAX_COUNT UINT64_MAX
 
+// The data coordination primitives in this and related files were added in response to a crash we started seeing.
+// We don't have a precise understanding of the causal factors within the runtime that lead to this condition,
+// since the GIL alone was sufficient in the past for preventing this issue.
+// We add an option here to _force_ a crash, in order to observe this condition in a future diagnostic iteration.
+// **This option is _intended_ to crash the Python process.** Do not use it without a good reason!
+static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS";
+static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+static memlock_t g_memalloc_lock;
+
 static alloc_tracker_t* global_alloc_tracker;
 
+// This is a multiplatform way to define an operation to happen at static initialization time
+static void
+memalloc_init(void);
+
+#ifdef _MSC_VER
+#pragma section(".CRT$XCU", read)
+__declspec(allocate(".CRT$XCU")) void (*memalloc_init_func)(void) = memalloc_init;
+
+#elif defined(__GNUC__) || defined(__clang__)
+__attribute__((constructor))
+#else
+#error Unsupported compiler
+#endif
+static void
+memalloc_init()
+{
+    // Check if we should crash the process on mutex pass
+    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
+    bool crash_on_mutex_pass = false;
+    if (crash_on_mutex_pass_str) {
+        for (int i = 0; g_truthy_values[i]; i++) {
+            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
+                crash_on_mutex_pass = true;
+                break;
+            }
+        }
+    }
+    memlock_init(&g_memalloc_lock, crash_on_mutex_pass);
+}
+
 static void
 memalloc_add_event(memalloc_context_t* ctx, void* ptr, size_t size)
 {
-    /* Do not overflow; just ignore the new events if we ever reach that point */
-    if (global_alloc_tracker->alloc_count >= ALLOC_TRACKER_MAX_COUNT)
+    uint64_t alloc_count = atomic_add_clamped(&global_alloc_tracker->alloc_count, 1, ALLOC_TRACKER_MAX_COUNT);
+
+    /* Return if we've reached the maximum number of allocations */
+    if (alloc_count == 0)
         return;
 
-    global_alloc_tracker->alloc_count++;
+    // Return if we can't take the guard
+    if (!memalloc_take_guard()) {
+        return;
+    }
 
-    /* Avoid loops */
-    if (memalloc_get_reentrant())
+    // In this implementation, the `global_alloc_tracker` isn't intrinsically protected. Before we read or modify,
+    // take the lock. The count of allocations is already forward-attributed elsewhere, so if we can't take the lock
+    // there's nothing to do.
+    if (!memlock_trylock(&g_memalloc_lock)) {
+        memalloc_yield_guard(); // release the reentrancy guard taken above, or this thread would never sample again
         return;
+    }
 
     /* Determine if we can capture or if we need to sample */
     if (global_alloc_tracker->allocs.count < ctx->max_events) {
-        /* set a barrier so we don't loop as getting a traceback allocates memory */
-        memalloc_set_reentrant(true);
         /* Buffer is not full, fill it */
         traceback_t* tb = memalloc_get_traceback(ctx->max_nframe, ptr, size, ctx->domain);
-        memalloc_set_reentrant(false);
-        if (tb)
+        if (tb) {
             traceback_array_append(&global_alloc_tracker->allocs, tb);
+        }
     } else {
         /* Sampling mode using a reservoir sampling algorithm: replace a random
          * traceback with this one */
-        uint64_t r = random_range(global_alloc_tracker->alloc_count);
+        uint64_t r = random_range(alloc_count);
 
-        if (r < ctx->max_events) {
-            /* set a barrier so we don't loop as getting a traceback allocates memory */
-            memalloc_set_reentrant(true);
+        // In addition to the event size, we need to check that the traceback table is in a good state
+        if (r < ctx->max_events && global_alloc_tracker->allocs.tab != NULL) {
             /* Replace a random traceback with this one */
             traceback_t* tb = memalloc_get_traceback(ctx->max_nframe, ptr, size, ctx->domain);
-            memalloc_set_reentrant(false);
+
+            // Only free and replace the old traceback if a new one was actually returned
             if (tb) {
                 traceback_free(global_alloc_tracker->allocs.tab[r]);
                 global_alloc_tracker->allocs.tab[r] = tb;
             }
         }
     }
+
+    memlock_unlock(&g_memalloc_lock);
+    memalloc_yield_guard();
 }
 
 static void
@@ -98,12 +146,6 @@ memalloc_free(void* ctx, void* ptr)
     alloc->free(alloc->ctx, ptr);
 }
 
-#ifdef _PY37_AND_LATER
-Py_tss_t memalloc_reentrant_key = Py_tss_NEEDS_INIT;
-#else
-int memalloc_reentrant_key = -1;
-#endif
-
 static void*
 memalloc_alloc(int use_calloc, void* ctx, size_t nelem, size_t elsize)
 {
@@ -233,7 +275,10 @@ memalloc_start(PyObject* Py_UNUSED(module), PyObject* args)
 
     global_memalloc_ctx.domain = PYMEM_DOMAIN_OBJ;
 
-    global_alloc_tracker = alloc_tracker_new();
+    if (memlock_trylock(&g_memalloc_lock)) {
+        global_alloc_tracker = alloc_tracker_new();
+        memlock_unlock(&g_memalloc_lock);
+    }
 
     PyMem_GetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
     PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
@@ -258,8 +303,11 @@ memalloc_stop(PyObject* Py_UNUSED(module), PyObject* Py_UNUSED(args))
     PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
     memalloc_tb_deinit();
-    alloc_tracker_free(global_alloc_tracker);
-    global_alloc_tracker = NULL;
+    if (memlock_trylock(&g_memalloc_lock)) {
+        alloc_tracker_free(global_alloc_tracker);
+        global_alloc_tracker = NULL;
+        memlock_unlock(&g_memalloc_lock);
+    }
 
     memalloc_heap_tracker_deinit();
 
@@ -310,9 +358,15 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
     if (!iestate)
         return NULL;
 
-    iestate->alloc_tracker = global_alloc_tracker;
     /* reset the current traceback list */
-    global_alloc_tracker = alloc_tracker_new();
+    if (memlock_trylock(&g_memalloc_lock)) {
+        iestate->alloc_tracker = global_alloc_tracker;
+        global_alloc_tracker = alloc_tracker_new();
+        memlock_unlock(&g_memalloc_lock);
+    } else {
+        Py_TYPE(iestate)->tp_free(iestate);
+        return NULL;
+    }
 
     iestate->seq_index = 0;
 
     PyObject* iter_and_count = PyTuple_New(3);
@@ -326,8 +380,11 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
 static void
 iterevents_dealloc(IterEventsState* iestate)
 {
-    alloc_tracker_free(iestate->alloc_tracker);
-    Py_TYPE(iestate)->tp_free(iestate);
+    if (memlock_trylock(&g_memalloc_lock)) {
+        alloc_tracker_free(iestate->alloc_tracker);
+        Py_TYPE(iestate)->tp_free(iestate);
+        memlock_unlock(&g_memalloc_lock);
+    }
 }
 
 static PyObject*
@@ -442,20 +499,6 @@ PyInit__memalloc(void)
         return NULL;
     }
 
-#ifdef _PY37_AND_LATER
-    if (PyThread_tss_create(&memalloc_reentrant_key) != 0) {
-#else
-    memalloc_reentrant_key = PyThread_create_key();
-    if (memalloc_reentrant_key == -1) {
-#endif
-#ifdef MS_WINDOWS
-        PyErr_SetFromWindowsErr(0);
-#else
-        PyErr_SetFromErrno(PyExc_OSError);
-#endif
-        return NULL;
-    }
-
     if (PyType_Ready(&MemallocIterEvents_Type) < 0)
         return NULL;
     Py_INCREF((PyObject*)&MemallocIterEvents_Type);
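
The fast path above follows a fixed discipline: a saturating counter update, then a per-thread
reentrancy guard, then a try-lock that drops the sample on contention rather than blocking the
allocator. A minimal standalone sketch of that discipline (all sketch_* names are illustrative
stand-ins, not the module's real API):

    // gcc -pthread sketch.c
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t sketch_lock = PTHREAD_MUTEX_INITIALIZER;
    static __thread bool sketch_on_thread = false; // per-thread reentrancy guard
    static uint64_t sketch_count = 0;              // shared state guarded by sketch_lock

    static void
    sketch_add_event(void)
    {
        if (sketch_on_thread) // reentrant call (sampling itself allocates): drop
            return;
        sketch_on_thread = true;

        if (pthread_mutex_trylock(&sketch_lock) != 0) {
            // Contended: losing one sample beats blocking or racing on shared state.
            sketch_on_thread = false;
            return;
        }
        sketch_count++; // stand-in for "record a traceback"
        pthread_mutex_unlock(&sketch_lock);
        sketch_on_thread = false;
    }

    int
    main(void)
    {
        sketch_add_event();
        printf("samples recorded: %llu\n", (unsigned long long)sketch_count);
        return 0;
    }

The try-lock choice is deliberate: this code runs inside the allocator itself, so blocking
there could stall every allocating thread.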
diff --git a/ddtrace/profiling/collector/_memalloc_heap.c b/ddtrace/profiling/collector/_memalloc_heap.c
index d6531d7b095..d2a5cc29eee 100644
--- a/ddtrace/profiling/collector/_memalloc_heap.c
+++ b/ddtrace/profiling/collector/_memalloc_heap.c
@@ -9,13 +9,13 @@
 typedef struct
 {
     /* Granularity of the heap profiler in bytes */
-    uint32_t sample_size;
+    uint64_t sample_size;
     /* Current sample size of the heap profiler in bytes */
-    uint32_t current_sample_size;
+    uint64_t current_sample_size;
     /* Tracked allocations */
     traceback_array_t allocs;
     /* Allocated memory counter in bytes */
-    uint32_t allocated_memory;
+    uint64_t allocated_memory;
     /* True if the heap tracker is frozen */
     bool frozen;
     /* Contains the ongoing heap allocation/deallocation while frozen */
@@ -26,8 +26,42 @@ typedef struct
     } freezer;
 } heap_tracker_t;
 
+static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMHEAP_CRASH_ON_MUTEX_PASS";
+static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+static memlock_t g_memheap_lock;
+
 static heap_tracker_t global_heap_tracker;
 
+// This is a multiplatform way to define an operation to happen at static initialization time
+static void
+memheap_init(void);
+
+#ifdef _MSC_VER
+#pragma section(".CRT$XCU", read)
+__declspec(allocate(".CRT$XCU")) void (*memheap_init_func)(void) = memheap_init;
+
+#elif defined(__GNUC__) || defined(__clang__)
+__attribute__((constructor))
+#else
+#error Unsupported compiler
+#endif
+static void
+memheap_init()
+{
+    // Check if we should crash the process on mutex pass
+    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
+    bool crash_on_mutex_pass = false;
+    if (crash_on_mutex_pass_str) {
+        for (int i = 0; g_truthy_values[i]; i++) {
+            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
+                crash_on_mutex_pass = true;
+                break;
+            }
+        }
+    }
+    memlock_init(&g_memheap_lock, crash_on_mutex_pass);
+}
+
 static uint32_t
 heap_tracker_next_sample_size(uint32_t sample_size)
 {
@@ -119,20 +153,30 @@ heap_tracker_thaw(heap_tracker_t* heap_tracker)
 void
 memalloc_heap_tracker_init(uint32_t sample_size)
 {
-    heap_tracker_init(&global_heap_tracker);
-    global_heap_tracker.sample_size = sample_size;
-    global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(sample_size);
+    if (memlock_trylock(&g_memheap_lock)) {
+        heap_tracker_init(&global_heap_tracker);
+        global_heap_tracker.sample_size = sample_size;
+        global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(sample_size);
+        memlock_unlock(&g_memheap_lock);
+    }
 }
 
 void
 memalloc_heap_tracker_deinit(void)
 {
-    heap_tracker_wipe(&global_heap_tracker);
+    if (memlock_trylock(&g_memheap_lock)) {
+        heap_tracker_wipe(&global_heap_tracker);
+        memlock_unlock(&g_memheap_lock);
+    }
 }
 
 void
 memalloc_heap_untrack(void* ptr)
 {
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return;
+    }
     if (global_heap_tracker.frozen) {
         /* Check that we still have space to store
            the free. If we don't have enough space, we ignore the untrack.
            That's sad as there is a chance
@@ -144,6 +188,8 @@ memalloc_heap_untrack(void* ptr)
             ptr_array_append(&global_heap_tracker.freezer.frees, ptr);
     } else
         heap_tracker_untrack_thawed(&global_heap_tracker, ptr);
+
+    memlock_unlock(&g_memheap_lock);
 }
 
 /* Track a memory allocation in the heap profiler.
@@ -157,26 +203,36 @@ memalloc_heap_track(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocatorD
         return false;
 
     /* Check for overflow */
-    global_heap_tracker.allocated_memory = Py_MIN(global_heap_tracker.allocated_memory + size, MAX_HEAP_SAMPLE_SIZE);
+    uint64_t res = atomic_add_clamped(&global_heap_tracker.allocated_memory, size, MAX_HEAP_SAMPLE_SIZE);
+    if (0 == res)
+        return false;
+
+    // Take the lock
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return false;
+    }
 
     /* Check if we have enough sample or not */
-    if (global_heap_tracker.allocated_memory < global_heap_tracker.current_sample_size)
+    if (global_heap_tracker.allocated_memory < global_heap_tracker.current_sample_size) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }
 
     /* Check if we can add more samples: the sum of the freezer + alloc tracker
      cannot be greater than what the alloc tracker can handle: when the alloc
      tracker is thawed, all the allocs in the freezer will be moved there!*/
-    if ((global_heap_tracker.freezer.allocs.count + global_heap_tracker.allocs.count) >= TRACEBACK_ARRAY_MAX_COUNT)
+    if (global_heap_tracker.freezer.allocs.count + global_heap_tracker.allocs.count >= TRACEBACK_ARRAY_MAX_COUNT) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }
 
     /* Avoid loops */
-    if (memalloc_get_reentrant())
+    if (!memalloc_take_guard()) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }
 
-    memalloc_set_reentrant(true);
     traceback_t* tb = memalloc_get_traceback(max_nframe, ptr, global_heap_tracker.allocated_memory, domain);
-    memalloc_set_reentrant(false);
-
     if (tb) {
         if (global_heap_tracker.frozen)
             traceback_array_append(&global_heap_tracker.freezer.allocs, tb);
@@ -189,15 +245,23 @@ memalloc_heap_track(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocatorD
         /* Compute the new target sample size */
         global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(global_heap_tracker.sample_size);
 
+        memalloc_yield_guard();
+        memlock_unlock(&g_memheap_lock);
         return true;
     }
 
+    memalloc_yield_guard();
+    memlock_unlock(&g_memheap_lock);
     return false;
 }
 
 PyObject*
 memalloc_heap()
 {
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return NULL;
+    }
+
     heap_tracker_freeze(&global_heap_tracker);
 
     PyObject* heap_list = PyList_New(global_heap_tracker.allocs.count);
@@ -213,5 +277,6 @@ memalloc_heap()
 
     heap_tracker_thaw(&global_heap_tracker);
 
+    memlock_unlock(&g_memheap_lock);
     return heap_list;
 }
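
Both trackers now gate their counters through atomic_add_clamped() (defined in
_memalloc_reentrant.h further down in this diff): the add either succeeds and returns the new
value, or returns 0 and leaves the counter untouched when it would pass `max`. A simplified,
single-file rendition of that contract for exposition (names here are illustrative, and the
bounded-retry logic of the real implementation is omitted):

    // gcc -std=c11 clamp.c
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    add_clamped_sketch(_Atomic uint64_t* target, uint64_t amount, uint64_t max)
    {
        uint64_t old_val = atomic_load(target);
        for (;;) {
            uint64_t new_val = old_val + amount;
            if (new_val > max || new_val < old_val) // clamp, plus wraparound check
                return 0;
            // On CAS failure, old_val is reloaded automatically and we retry.
            if (atomic_compare_exchange_strong(target, &old_val, new_val))
                return new_val;
        }
    }

    int
    main(void)
    {
        _Atomic uint64_t counter = 90;
        printf("%llu\n", (unsigned long long)add_clamped_sketch(&counter, 5, 100));  // 95
        printf("%llu\n", (unsigned long long)add_clamped_sketch(&counter, 10, 100)); // 0: would pass max
        printf("%llu\n", (unsigned long long)atomic_load(&counter));                 // still 95
        return 0;
    }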
diff --git a/ddtrace/profiling/collector/_memalloc_reentrant.c b/ddtrace/profiling/collector/_memalloc_reentrant.c
new file mode 100644
index 00000000000..d360d19fb30
--- /dev/null
+++ b/ddtrace/profiling/collector/_memalloc_reentrant.c
@@ -0,0 +1,3 @@
+#include "_memalloc_reentrant.h"
+
+bool _MEMALLOC_ON_THREAD = false;
diff --git a/ddtrace/profiling/collector/_memalloc_reentrant.h b/ddtrace/profiling/collector/_memalloc_reentrant.h
index 5c8a552294e..cb4aa246961 100644
--- a/ddtrace/profiling/collector/_memalloc_reentrant.h
+++ b/ddtrace/profiling/collector/_memalloc_reentrant.h
@@ -1,50 +1,188 @@
 #ifndef _DDTRACE_MEMALLOC_REENTRANT_H
 #define _DDTRACE_MEMALLOC_REENTRANT_H
 
-#include "_pymacro.h"
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define _POSIX_C_SOURCE 200809L
+#include <errno.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <unistd.h>
+#endif
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 
-#ifndef _PY37_AND_LATER
-#include <pthread.h>
+// Cross-platform macro for defining thread-local storage
+// NB - we use dynamic-global on Linux because the others are problematic
+#if defined(_MSC_VER) // Check for MSVC compiler
+#define MEMALLOC_TLS __declspec(thread)
+#elif defined(__GNUC__) || defined(__clang__) // GCC or Clang
+#define MEMALLOC_TLS __attribute__((tls_model("global-dynamic"))) __thread
+#else
+#error "Unsupported compiler for thread-local storage"
 #endif
+extern bool _MEMALLOC_ON_THREAD;
 
+// This is a saturating atomic add for 32- and 64-bit platforms.
+// In order to implement the saturation logic, use a CAS loop.
+// From the GCC docs:
+// "‘__atomic’ builtins can be used with any integral scalar or pointer type that is 1, 2, 4, or 8 bytes in length"
+// From the MSVC docs:
+// "_InterlockedCompareExchange64 is available on x86 systems running on any Pentium architecture; it is not
+// available on 386 or 486 architectures."
+static inline uint64_t
+atomic_add_clamped(uint64_t* target, uint64_t amount, uint64_t max)
+{
+    // In reality, there's virtually no scenario in which this deadlocks. Just the same, give it some arbitrarily high
+    // limit in order to prevent unpredicted deadlocks. 96 is chosen since it's the number of cores on the largest
+    // consumer CPU generally used by our customers.
+    int attempts = 96;
+    while (attempts--) {
+        uint64_t old_val = (volatile uint64_t)*target;
 
-#ifdef _PY37_AND_LATER
-extern Py_tss_t memalloc_reentrant_key;
+        // CAS loop + saturation check
+        uint64_t new_val = old_val + amount;
+        if (new_val > max || new_val < old_val) {
+            return 0;
+        }
+#if defined(_MSC_VER)
+        uint64_t prev_val =
+          (uint64_t)InterlockedCompareExchange64((volatile LONG64*)target, (LONG64)new_val, (LONG64)old_val);
+        if (prev_val == old_val) {
+            return new_val;
+        }
+#elif defined(__clang__) || defined(__GNUC__)
+        if (atomic_compare_exchange_strong_explicit(
+              (_Atomic uint64_t*)target, &old_val, new_val, memory_order_seq_cst, memory_order_seq_cst)) {
+            return new_val;
+        }
 #else
-extern int memalloc_reentrant_key;
+#error "Unsupported compiler for atomic operations"
 #endif
+        // If we reach here, CAS failed; another thread changed `target`.
+        // Retry until success, saturation, or until we exhaust our attempts.
+    }
 
-/* Any non-NULL pointer can be used */
-#define _MEMALLOC_REENTRANT_VALUE Py_True
+    return 0;
+}
 
-static inline void
-memalloc_set_reentrant(bool reentrant)
+// Opaque lock type
+typedef struct
+{
+#ifdef _WIN32
+    HANDLE mutex;
+#else
+    pthread_mutex_t mutex;
+#endif
+} memlock_t;
+
+// Global setting; if a lock fails to be acquired, crash
+static bool g_crash_on_mutex_pass = false;
+
+// Generic initializer
+static inline bool
+memlock_init(memlock_t* lock, bool crash_on_pass)
+{
+    if (!lock)
+        return false;
+
+    g_crash_on_mutex_pass = crash_on_pass;
+
+#ifdef _WIN32
+    lock->mutex = CreateMutex(NULL, FALSE, NULL);
+    return lock->mutex != NULL;
+#else
+    // For POSIX systems, we make sure to use an ERRORCHECK type mutex, since it pushes some of the state checking
+    // down to the implementation.
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
+    return pthread_mutex_init(&lock->mutex, &attr) == 0;
+#endif
+}
+
+// Unlock function
+static inline bool
+memlock_unlock(memlock_t* lock)
 {
-    if (reentrant)
-#ifdef _PY37_AND_LATER
-        PyThread_tss_set(&memalloc_reentrant_key, _MEMALLOC_REENTRANT_VALUE);
+    if (!lock)
+        return false;
+
+#ifdef _WIN32
+    return ReleaseMutex(lock->mutex);
 #else
-        PyThread_set_key_value(memalloc_reentrant_key, _MEMALLOC_REENTRANT_VALUE);
+    return pthread_mutex_unlock(&lock->mutex) == 0;
+#endif
+}
+
+// trylock function
+static inline bool
+memlock_trylock(memlock_t* lock)
+{
+    if (!lock)
+        return false;
+
+#ifdef __linux__
+    // On Linux, we need to make sure we didn't just fork.
+    // pthreads will guarantee the lock is consistent, but we at least need to clear it.
+    static pid_t my_pid = 0;
+    if (my_pid == 0) {
+        my_pid = getpid();
+    } else if (my_pid != getpid()) {
+        // We've forked, so we need to free the lock
+        memlock_unlock(lock);
+        my_pid = getpid();
+    }
 #endif
-    else
-#ifdef _PY37_AND_LATER
-        PyThread_tss_set(&memalloc_reentrant_key, NULL);
+
+#ifdef _WIN32
+    bool result = WAIT_OBJECT_0 == WaitForSingleObject(lock->mutex, 0); // 0ms timeout -> no wait
 #else
-        PyThread_set_key_value(memalloc_reentrant_key, NULL);
+    bool result = 0 == pthread_mutex_trylock(&lock->mutex);
 #endif
+    if (!result && g_crash_on_mutex_pass) {
+        // segfault
+        int* p = NULL;
+        *p = 0;
+        abort(); // should never reach here
+    }
+
+    return result;
 }
 
+// Cleanup function
 static inline bool
-memalloc_get_reentrant(void)
+memlock_destroy(memlock_t* lock)
 {
-#ifdef _PY37_AND_LATER
-    if (PyThread_tss_get(&memalloc_reentrant_key))
+    if (!lock)
+        return false;
+
+#ifdef _WIN32
+    return CloseHandle(lock->mutex);
 #else
-    if (PyThread_get_key_value(memalloc_reentrant_key))
+    return 0 == pthread_mutex_destroy(&lock->mutex);
 #endif
-        return true;
+}
 
-    return false;
+static inline bool
+memalloc_take_guard()
+{
+    // Ordinarily, a process-wide semaphore would require a CAS, but since this is thread-local we can just set it.
+    if (_MEMALLOC_ON_THREAD)
+        return false;
+    _MEMALLOC_ON_THREAD = true;
+    return true;
+}
+
+static inline void
+memalloc_yield_guard(void)
+{
+    // Ideally, we'd actually capture the old state within an object and restore it, but since this is
+    // a coarse-grained lock, we just set it to false.
+    _MEMALLOC_ON_THREAD = false;
 }
 
 #endif
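
The getpid() check in memlock_trylock() above exists because a child process inherits the
parent's mutex memory verbatim: if any thread held the lock at fork() time, the child sees a
lock that no surviving thread will ever release. A small POSIX-only demonstration of the
underlying problem, independent of the patch:

    // gcc -pthread fork_lock.c
    #include <pthread.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    int
    main(void)
    {
        pthread_mutex_lock(&lock); // parent holds the lock across fork()
        pid_t child = fork();
        if (child == 0) {
            // Child: the owner thread does not exist here, so without a PID check
            // and reset, this lock would stay "held" forever.
            int rc = pthread_mutex_trylock(&lock);
            printf("child trylock rc=%d (nonzero: inherited lock is stuck)\n", rc);
            _exit(0);
        }
        waitpid(child, NULL, 0);
        pthread_mutex_unlock(&lock);
        return 0;
    }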
diff --git a/ddtrace/profiling/collector/_memalloc_tb.c b/ddtrace/profiling/collector/_memalloc_tb.c
index ba79021f719..bb265fe08d5 100644
--- a/ddtrace/profiling/collector/_memalloc_tb.c
+++ b/ddtrace/profiling/collector/_memalloc_tb.c
@@ -87,6 +87,9 @@ memalloc_tb_deinit(void)
 void
 traceback_free(traceback_t* tb)
 {
+    if (!tb)
+        return;
+
     for (uint16_t nframe = 0; nframe < tb->nframe; nframe++) {
         Py_DECREF(tb->frames[nframe].filename);
         Py_DECREF(tb->frames[nframe].name);
@@ -197,11 +200,7 @@ memalloc_get_traceback(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocat
     traceback->size = size;
     traceback->ptr = ptr;
 
-#ifdef _PY37_AND_LATER
     traceback->thread_id = PyThread_get_thread_ident();
-#else
-    traceback->thread_id = tstate->thread_id;
-#endif
 
     traceback->domain = domain;
diff --git a/ddtrace/profiling/collector/_pymacro.h b/ddtrace/profiling/collector/_pymacro.h
index e71ed6888b9..aa31c3d4cc1 100644
--- a/ddtrace/profiling/collector/_pymacro.h
+++ b/ddtrace/profiling/collector/_pymacro.h
@@ -13,8 +13,4 @@
 #define _PY38
 #endif
 
-#if PY_VERSION_HEX >= 0x03070000
-#define _PY37_AND_LATER
-#endif
-
 #endif
diff --git a/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml b/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml
new file mode 100644
index 00000000000..8632b62af50
--- /dev/null
+++ b/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes an issue where the memory allocation profiler can cause a segmentation fault due to
+    data races when accessing its own global data structures from multiple threads.
diff --git a/setup.py b/setup.py
index 13b0cb4a4f0..6ed826c5920 100644
--- a/setup.py
+++ b/setup.py
@@ -510,8 +510,11 @@ def get_exts_for(name):
                 "ddtrace/profiling/collector/_memalloc.c",
                 "ddtrace/profiling/collector/_memalloc_tb.c",
                 "ddtrace/profiling/collector/_memalloc_heap.c",
+                "ddtrace/profiling/collector/_memalloc_reentrant.c",
             ],
-            extra_compile_args=debug_compile_args,
+            extra_compile_args=debug_compile_args
+            + (["-D_POSIX_C_SOURCE=200809L", "-std=c11"] if CURRENT_OS != "Windows" else ["/std:c11"]),
         ),
         Extension(
             "ddtrace.internal._threads",
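
For reference, the truthy-string check duplicated in both static constructors reduces to a
small helper; this standalone sketch mirrors g_truthy_values from the patch (env_is_truthy is
a hypothetical name, not something the patch defines):

    // gcc truthy.c
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static bool
    env_is_truthy(const char* name)
    {
        static const char* truthy[] = { "1", "true", "yes", "on", "enable", "enabled", NULL };
        const char* val = getenv(name);
        if (!val)
            return false;
        for (int i = 0; truthy[i]; i++) {
            if (strcmp(val, truthy[i]) == 0)
                return true;
        }
        return false;
    }

    int
    main(void)
    {
        // The crash-on-contention diagnostic is enabled the same way:
        // _DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS=1 (and the MEMHEAP variant).
        printf("%d\n", env_is_truthy("_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS"));
        return 0;
    }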