diff --git a/ddtrace/profiling/collector/_memalloc.c b/ddtrace/profiling/collector/_memalloc.c
index 3b7f7db293f..3876517baaf 100644
--- a/ddtrace/profiling/collector/_memalloc.c
+++ b/ddtrace/profiling/collector/_memalloc.c
@@ -42,47 +42,95 @@ static PyObject* object_string = NULL;
 
 #define ALLOC_TRACKER_MAX_COUNT UINT64_MAX
 
+// The data coordination primitives in this and related files were added in response to a crash we started seeing.
+// We don't have a precise understanding of the causal factors within the runtime that lead to this condition,
+// since the GIL alone was sufficient in the past for preventing this issue.
+// We add an option here to _force_ a crash, in order to observe this condition in a future diagnostic iteration.
+// **This option is _intended_ to crash the Python process.** Do not use it without a good reason!
+static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS";
+static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+static memlock_t g_memalloc_lock;
+
 static alloc_tracker_t* global_alloc_tracker;
 
+// This is a multiplatform way to define an operation to happen at static initialization time
+static void
+memalloc_init(void);
+
+#ifdef _MSC_VER
+#pragma section(".CRT$XCU", read)
+__declspec(allocate(".CRT$XCU")) void (*memalloc_init_func)(void) = memalloc_init;
+
+#elif defined(__GNUC__) || defined(__clang__)
+__attribute__((constructor))
+#else
+#error Unsupported compiler
+#endif
+static void
+memalloc_init()
+{
+    // Check if we should crash the process on mutex pass
+    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
+    bool crash_on_mutex_pass = false;
+    if (crash_on_mutex_pass_str) {
+        for (int i = 0; g_truthy_values[i]; i++) {
+            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
+                crash_on_mutex_pass = true;
+                break;
+            }
+        }
+    }
+    memlock_init(&g_memalloc_lock, crash_on_mutex_pass);
+}
+
 static void
 memalloc_add_event(memalloc_context_t* ctx, void* ptr, size_t size)
 {
-    /* Do not overflow; just ignore the new events if we ever reach that point */
-    if (global_alloc_tracker->alloc_count >= ALLOC_TRACKER_MAX_COUNT)
+    uint64_t alloc_count = atomic_add_clamped(&global_alloc_tracker->alloc_count, 1, ALLOC_TRACKER_MAX_COUNT);
+
+    /* Return if we've reached the maximum number of allocations */
+    if (alloc_count == 0)
         return;
 
-    global_alloc_tracker->alloc_count++;
+    // Return if we can't take the guard
+    if (!memalloc_take_guard()) {
+        return;
+    }
 
-    /* Avoid loops */
-    if (memalloc_get_reentrant())
+    // In this implementation, the `global_alloc_tracker` isn't intrinsically protected. Before we read or modify,
+    // take the lock. The count of allocations is already forward-attributed elsewhere, so if we can't take the lock
+    // there's nothing to do.
+    if (!memlock_trylock(&g_memalloc_lock)) {
+        memalloc_yield_guard(); // release the reentrancy guard taken above, or this thread would never sample again
         return;
+    }
 
     /* Determine if we can capture or if we need to sample */
     if (global_alloc_tracker->allocs.count < ctx->max_events) {
-        /* set a barrier so we don't loop as getting a traceback allocates memory */
-        memalloc_set_reentrant(true);
         /* Buffer is not full, fill it */
         traceback_t* tb = memalloc_get_traceback(ctx->max_nframe, ptr, size, ctx->domain);
-        memalloc_set_reentrant(false);
-        if (tb)
+        if (tb) {
             traceback_array_append(&global_alloc_tracker->allocs, tb);
+        }
     } else {
         /* Sampling mode using a reservoir sampling algorithm: replace a random
          * traceback with this one */
-        uint64_t r = random_range(global_alloc_tracker->alloc_count);
+        uint64_t r = random_range(alloc_count);
 
-        if (r < ctx->max_events) {
-            /* set a barrier so we don't loop as getting a traceback allocates memory */
-            memalloc_set_reentrant(true);
+        // In addition to the event size, we need to check that the traceback table is in a good state
+        if (r < ctx->max_events && global_alloc_tracker->allocs.tab != NULL) {
             /* Replace a random traceback with this one */
             traceback_t* tb = memalloc_get_traceback(ctx->max_nframe, ptr, size, ctx->domain);
-            memalloc_set_reentrant(false);
+
+            // Only free and replace the old traceback if a new one was actually returned
             if (tb) {
                 traceback_free(global_alloc_tracker->allocs.tab[r]);
                 global_alloc_tracker->allocs.tab[r] = tb;
             }
         }
     }
+
+    memlock_unlock(&g_memalloc_lock);
+    memalloc_yield_guard();
 }
 
 static void
@@ -98,12 +146,6 @@ memalloc_free(void* ctx, void* ptr)
     alloc->free(alloc->ctx, ptr);
 }
 
-#ifdef _PY37_AND_LATER
-Py_tss_t memalloc_reentrant_key = Py_tss_NEEDS_INIT;
-#else
-int memalloc_reentrant_key = -1;
-#endif
-
 static void*
 memalloc_alloc(int use_calloc, void* ctx, size_t nelem, size_t elsize)
 {
@@ -233,7 +275,10 @@ memalloc_start(PyObject* Py_UNUSED(module), PyObject* args)
 
     global_memalloc_ctx.domain = PYMEM_DOMAIN_OBJ;
 
-    global_alloc_tracker = alloc_tracker_new();
+    if (memlock_trylock(&g_memalloc_lock)) {
+        global_alloc_tracker = alloc_tracker_new();
+        memlock_unlock(&g_memalloc_lock);
+    }
 
     PyMem_GetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
     PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &alloc);
@@ -258,8 +303,11 @@ memalloc_stop(PyObject* Py_UNUSED(module), PyObject* Py_UNUSED(args))
     PyMem_SetAllocator(PYMEM_DOMAIN_OBJ, &global_memalloc_ctx.pymem_allocator_obj);
     memalloc_tb_deinit();
-    alloc_tracker_free(global_alloc_tracker);
-    global_alloc_tracker = NULL;
+    if (memlock_trylock(&g_memalloc_lock)) {
+        alloc_tracker_free(global_alloc_tracker);
+        global_alloc_tracker = NULL;
+        memlock_unlock(&g_memalloc_lock);
+    }
 
     memalloc_heap_tracker_deinit();
 
@@ -310,9 +358,15 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
     if (!iestate)
         return NULL;
 
-    iestate->alloc_tracker = global_alloc_tracker;
     /* reset the current traceback list */
-    global_alloc_tracker = alloc_tracker_new();
+    if (memlock_trylock(&g_memalloc_lock)) {
+        iestate->alloc_tracker = global_alloc_tracker;
+        global_alloc_tracker = alloc_tracker_new();
+        memlock_unlock(&g_memalloc_lock);
+    } else {
+        Py_TYPE(iestate)->tp_free(iestate);
+        return NULL;
+    }
 
     iestate->seq_index = 0;
 
     PyObject* iter_and_count = PyTuple_New(3);
@@ -326,8 +380,11 @@ iterevents_new(PyTypeObject* type, PyObject* Py_UNUSED(args), PyObject* Py_UNUSE
 static void
 iterevents_dealloc(IterEventsState* iestate)
 {
-    alloc_tracker_free(iestate->alloc_tracker);
-    Py_TYPE(iestate)->tp_free(iestate);
+    if (memlock_trylock(&g_memalloc_lock)) {
+        alloc_tracker_free(iestate->alloc_tracker);
+        Py_TYPE(iestate)->tp_free(iestate);
+        memlock_unlock(&g_memalloc_lock);
+    }
 }
 
 static PyObject*
@@ -442,20 +499,6 @@ PyInit__memalloc(void)
         return NULL;
     }
 
-#ifdef _PY37_AND_LATER
-    if (PyThread_tss_create(&memalloc_reentrant_key) != 0) {
-#else
-    memalloc_reentrant_key = PyThread_create_key();
-    if (memalloc_reentrant_key == -1) {
-#endif
-#ifdef MS_WINDOWS
-        PyErr_SetFromWindowsErr(0);
-#else
-        PyErr_SetFromErrno(PyExc_OSError);
-#endif
-        return NULL;
-    }
-
     if (PyType_Ready(&MemallocIterEvents_Type) < 0)
         return NULL;
     Py_INCREF((PyObject*)&MemallocIterEvents_Type);
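
The fast path above follows a fixed discipline: a saturating counter update, then a per-thread
reentrancy guard, then a try-lock that drops the sample on contention rather than blocking the
allocator. A minimal standalone sketch of that discipline (all sketch_* names are illustrative
stand-ins, not the module's real API):

    // gcc -pthread sketch.c
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static pthread_mutex_t sketch_lock = PTHREAD_MUTEX_INITIALIZER;
    static __thread bool sketch_on_thread = false; // per-thread reentrancy guard
    static uint64_t sketch_count = 0;              // shared state guarded by sketch_lock

    static void
    sketch_add_event(void)
    {
        if (sketch_on_thread) // reentrant call (sampling itself allocates): drop
            return;
        sketch_on_thread = true;

        if (pthread_mutex_trylock(&sketch_lock) != 0) {
            // Contended: losing one sample beats blocking or racing on shared state.
            sketch_on_thread = false;
            return;
        }
        sketch_count++; // stand-in for "record a traceback"
        pthread_mutex_unlock(&sketch_lock);
        sketch_on_thread = false;
    }

    int
    main(void)
    {
        sketch_add_event();
        printf("samples recorded: %llu\n", (unsigned long long)sketch_count);
        return 0;
    }

The try-lock choice is deliberate: this code runs inside the allocator itself, so blocking
there could stall every allocating thread.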
diff --git a/ddtrace/profiling/collector/_memalloc_heap.c b/ddtrace/profiling/collector/_memalloc_heap.c
index d6531d7b095..d2a5cc29eee 100644
--- a/ddtrace/profiling/collector/_memalloc_heap.c
+++ b/ddtrace/profiling/collector/_memalloc_heap.c
@@ -9,13 +9,13 @@
 typedef struct
 {
     /* Granularity of the heap profiler in bytes */
-    uint32_t sample_size;
+    uint64_t sample_size;
     /* Current sample size of the heap profiler in bytes */
-    uint32_t current_sample_size;
+    uint64_t current_sample_size;
     /* Tracked allocations */
     traceback_array_t allocs;
     /* Allocated memory counter in bytes */
-    uint32_t allocated_memory;
+    uint64_t allocated_memory;
     /* True if the heap tracker is frozen */
     bool frozen;
     /* Contains the ongoing heap allocation/deallocation while frozen */
@@ -26,8 +26,42 @@ typedef struct
     } freezer;
 } heap_tracker_t;
 
+static char g_crash_on_mutex_pass_str[] = "_DD_PROFILING_MEMHEAP_CRASH_ON_MUTEX_PASS";
+static const char* g_truthy_values[] = { "1", "true", "yes", "on", "enable", "enabled", NULL }; // NB the sentinel NULL
+static memlock_t g_memheap_lock;
+
 static heap_tracker_t global_heap_tracker;
 
+// This is a multiplatform way to define an operation to happen at static initialization time
+static void
+memheap_init(void);
+
+#ifdef _MSC_VER
+#pragma section(".CRT$XCU", read)
+__declspec(allocate(".CRT$XCU")) void (*memheap_init_func)(void) = memheap_init;
+
+#elif defined(__GNUC__) || defined(__clang__)
+__attribute__((constructor))
+#else
+#error Unsupported compiler
+#endif
+static void
+memheap_init()
+{
+    // Check if we should crash the process on mutex pass
+    char* crash_on_mutex_pass_str = getenv(g_crash_on_mutex_pass_str);
+    bool crash_on_mutex_pass = false;
+    if (crash_on_mutex_pass_str) {
+        for (int i = 0; g_truthy_values[i]; i++) {
+            if (strcmp(crash_on_mutex_pass_str, g_truthy_values[i]) == 0) {
+                crash_on_mutex_pass = true;
+                break;
+            }
+        }
+    }
+    memlock_init(&g_memheap_lock, crash_on_mutex_pass);
+}
+
 static uint32_t
 heap_tracker_next_sample_size(uint32_t sample_size)
 {
@@ -119,20 +153,30 @@ heap_tracker_thaw(heap_tracker_t* heap_tracker)
 void
 memalloc_heap_tracker_init(uint32_t sample_size)
 {
-    heap_tracker_init(&global_heap_tracker);
-    global_heap_tracker.sample_size = sample_size;
-    global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(sample_size);
+    if (memlock_trylock(&g_memheap_lock)) {
+        heap_tracker_init(&global_heap_tracker);
+        global_heap_tracker.sample_size = sample_size;
+        global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(sample_size);
+        memlock_unlock(&g_memheap_lock);
+    }
 }
 
 void
 memalloc_heap_tracker_deinit(void)
 {
-    heap_tracker_wipe(&global_heap_tracker);
+    if (memlock_trylock(&g_memheap_lock)) {
+        heap_tracker_wipe(&global_heap_tracker);
+        memlock_unlock(&g_memheap_lock);
+    }
 }
 
 void
 memalloc_heap_untrack(void* ptr)
 {
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return;
+    }
     if (global_heap_tracker.frozen) {
         /* Check that we still have space to store
            the free. If we don't have enough space, we ignore the untrack.
            That's sad as there is a chance
@@ -144,6 +188,8 @@ memalloc_heap_untrack(void* ptr)
             ptr_array_append(&global_heap_tracker.freezer.frees, ptr);
     } else
         heap_tracker_untrack_thawed(&global_heap_tracker, ptr);
+
+    memlock_unlock(&g_memheap_lock);
 }
 
 /* Track a memory allocation in the heap profiler.
@@ -157,26 +203,36 @@ memalloc_heap_track(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocatorD
         return false;
 
     /* Check for overflow */
-    global_heap_tracker.allocated_memory = Py_MIN(global_heap_tracker.allocated_memory + size, MAX_HEAP_SAMPLE_SIZE);
+    uint64_t res = atomic_add_clamped(&global_heap_tracker.allocated_memory, size, MAX_HEAP_SAMPLE_SIZE);
+    if (0 == res)
+        return false;
+
+    // Take the lock
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return false;
+    }
 
     /* Check if we have enough sample or not */
-    if (global_heap_tracker.allocated_memory < global_heap_tracker.current_sample_size)
+    if (global_heap_tracker.allocated_memory < global_heap_tracker.current_sample_size) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }
 
     /* Check if we can add more samples: the sum of the freezer + alloc tracker
      cannot be greater than what the alloc tracker can handle: when the alloc
      tracker is thawed, all the allocs in the freezer will be moved there!*/
-    if ((global_heap_tracker.freezer.allocs.count + global_heap_tracker.allocs.count) >= TRACEBACK_ARRAY_MAX_COUNT)
+    if (global_heap_tracker.freezer.allocs.count + global_heap_tracker.allocs.count >= TRACEBACK_ARRAY_MAX_COUNT) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }
 
     /* Avoid loops */
-    if (memalloc_get_reentrant())
+    if (!memalloc_take_guard()) {
+        memlock_unlock(&g_memheap_lock);
         return false;
+    }
 
-    memalloc_set_reentrant(true);
     traceback_t* tb = memalloc_get_traceback(max_nframe, ptr, global_heap_tracker.allocated_memory, domain);
-    memalloc_set_reentrant(false);
-
     if (tb) {
         if (global_heap_tracker.frozen)
             traceback_array_append(&global_heap_tracker.freezer.allocs, tb);
@@ -189,15 +245,23 @@ memalloc_heap_track(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocatorD
         /* Compute the new target sample size */
         global_heap_tracker.current_sample_size = heap_tracker_next_sample_size(global_heap_tracker.sample_size);
 
+        memalloc_yield_guard();
+        memlock_unlock(&g_memheap_lock);
         return true;
     }
 
+    memalloc_yield_guard();
+    memlock_unlock(&g_memheap_lock);
     return false;
 }
 
 PyObject*
 memalloc_heap()
 {
+    if (!memlock_trylock(&g_memheap_lock)) {
+        return NULL;
+    }
+
     heap_tracker_freeze(&global_heap_tracker);
 
     PyObject* heap_list = PyList_New(global_heap_tracker.allocs.count);
@@ -213,5 +277,6 @@ memalloc_heap()
 
     heap_tracker_thaw(&global_heap_tracker);
 
+    memlock_unlock(&g_memheap_lock);
     return heap_list;
 }
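
Both trackers now gate their counters through atomic_add_clamped() (defined in
_memalloc_reentrant.h further down in this diff): the add either succeeds and returns the new
value, or returns 0 and leaves the counter untouched when it would pass `max`. A simplified,
single-file rendition of that contract for exposition (names here are illustrative, and the
bounded-retry logic of the real implementation is omitted):

    // gcc -std=c11 clamp.c
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t
    add_clamped_sketch(_Atomic uint64_t* target, uint64_t amount, uint64_t max)
    {
        uint64_t old_val = atomic_load(target);
        for (;;) {
            uint64_t new_val = old_val + amount;
            if (new_val > max || new_val < old_val) // clamp, plus wraparound check
                return 0;
            // On CAS failure, old_val is reloaded automatically and we retry.
            if (atomic_compare_exchange_strong(target, &old_val, new_val))
                return new_val;
        }
    }

    int
    main(void)
    {
        _Atomic uint64_t counter = 90;
        printf("%llu\n", (unsigned long long)add_clamped_sketch(&counter, 5, 100));  // 95
        printf("%llu\n", (unsigned long long)add_clamped_sketch(&counter, 10, 100)); // 0: would pass max
        printf("%llu\n", (unsigned long long)atomic_load(&counter));                 // still 95
        return 0;
    }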
diff --git a/ddtrace/profiling/collector/_memalloc_reentrant.c b/ddtrace/profiling/collector/_memalloc_reentrant.c
new file mode 100644
index 00000000000..d360d19fb30
--- /dev/null
+++ b/ddtrace/profiling/collector/_memalloc_reentrant.c
@@ -0,0 +1,3 @@
+#include "_memalloc_reentrant.h"
+
+bool _MEMALLOC_ON_THREAD = false;
diff --git a/ddtrace/profiling/collector/_memalloc_reentrant.h b/ddtrace/profiling/collector/_memalloc_reentrant.h
index 5c8a552294e..cb4aa246961 100644
--- a/ddtrace/profiling/collector/_memalloc_reentrant.h
+++ b/ddtrace/profiling/collector/_memalloc_reentrant.h
@@ -1,50 +1,188 @@
 #ifndef _DDTRACE_MEMALLOC_REENTRANT_H
 #define _DDTRACE_MEMALLOC_REENTRANT_H
 
-#include "_pymacro.h"
+#ifdef _WIN32
+#include <windows.h>
+#else
+#define _POSIX_C_SOURCE 200809L
+#include <errno.h>
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <unistd.h>
+#endif
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 
-#ifndef _PY37_AND_LATER
-#include <pthread.h>
+// Cross-platform macro for defining thread-local storage
+// NB - we use dynamic-global on Linux because the others are problematic
+#if defined(_MSC_VER) // Check for MSVC compiler
+#define MEMALLOC_TLS __declspec(thread)
+#elif defined(__GNUC__) || defined(__clang__) // GCC or Clang
+#define MEMALLOC_TLS __attribute__((tls_model("global-dynamic"))) __thread
+#else
+#error "Unsupported compiler for thread-local storage"
 #endif
+extern bool _MEMALLOC_ON_THREAD;
 
+// This is a saturating atomic add for 32- and 64-bit platforms.
+// In order to implement the saturation logic, use a CAS loop.
+// From the GCC docs:
+// "‘__atomic’ builtins can be used with any integral scalar or pointer type that is 1, 2, 4, or 8 bytes in length"
+// From the MSVC docs:
+// "_InterlockedCompareExchange64 is available on x86 systems running on any Pentium architecture; it is not
+// available on 386 or 486 architectures."
+static inline uint64_t
+atomic_add_clamped(uint64_t* target, uint64_t amount, uint64_t max)
+{
+    // In reality, there's virtually no scenario in which this deadlocks. Just the same, give it some arbitrarily high
+    // limit in order to prevent unpredicted deadlocks. 96 is chosen since it's the number of cores on the largest
+    // consumer CPU generally used by our customers.
+    int attempts = 96;
+    while (attempts--) {
+        uint64_t old_val = (volatile uint64_t)*target;
 
-#ifdef _PY37_AND_LATER
-extern Py_tss_t memalloc_reentrant_key;
+        // CAS loop + saturation check
+        uint64_t new_val = old_val + amount;
+        if (new_val > max || new_val < old_val) {
+            return 0;
+        }
+#if defined(_MSC_VER)
+        uint64_t prev_val =
+          (uint64_t)InterlockedCompareExchange64((volatile LONG64*)target, (LONG64)new_val, (LONG64)old_val);
+        if (prev_val == old_val) {
+            return new_val;
+        }
+#elif defined(__clang__) || defined(__GNUC__)
+        if (atomic_compare_exchange_strong_explicit(
+              (_Atomic uint64_t*)target, &old_val, new_val, memory_order_seq_cst, memory_order_seq_cst)) {
+            return new_val;
+        }
 #else
-extern int memalloc_reentrant_key;
+#error "Unsupported compiler for atomic operations"
 #endif
+        // If we reach here, CAS failed; another thread changed `target`.
+        // Retry until success, saturation, or until we exhaust our attempts.
+    }
 
-/* Any non-NULL pointer can be used */
-#define _MEMALLOC_REENTRANT_VALUE Py_True
+    return 0;
+}
 
-static inline void
-memalloc_set_reentrant(bool reentrant)
+// Opaque lock type
+typedef struct
+{
+#ifdef _WIN32
+    HANDLE mutex;
+#else
+    pthread_mutex_t mutex;
+#endif
+} memlock_t;
+
+// Global setting; if a lock fails to be acquired, crash
+static bool g_crash_on_mutex_pass = false;
+
+// Generic initializer
+static inline bool
+memlock_init(memlock_t* lock, bool crash_on_pass)
+{
+    if (!lock)
+        return false;
+
+    g_crash_on_mutex_pass = crash_on_pass;
+
+#ifdef _WIN32
+    lock->mutex = CreateMutex(NULL, FALSE, NULL);
+    return lock->mutex != NULL;
+#else
+    // For POSIX systems, we make sure to use an ERRORCHECK type mutex, since it pushes some of the state checking
+    // down to the implementation.
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
+    return pthread_mutex_init(&lock->mutex, &attr) == 0;
+#endif
+}
+
+// Unlock function
+static inline bool
+memlock_unlock(memlock_t* lock)
 {
-    if (reentrant)
-#ifdef _PY37_AND_LATER
-        PyThread_tss_set(&memalloc_reentrant_key, _MEMALLOC_REENTRANT_VALUE);
+    if (!lock)
+        return false;
+
+#ifdef _WIN32
+    return ReleaseMutex(lock->mutex);
 #else
-        PyThread_set_key_value(memalloc_reentrant_key, _MEMALLOC_REENTRANT_VALUE);
+    return pthread_mutex_unlock(&lock->mutex) == 0;
+#endif
+}
+
+// trylock function
+static inline bool
+memlock_trylock(memlock_t* lock)
+{
+    if (!lock)
+        return false;
+
+#ifdef __linux__
+    // On Linux, we need to make sure we didn't just fork.
+    // pthreads will guarantee the lock is consistent, but we at least need to clear it.
+    static pid_t my_pid = 0;
+    if (my_pid == 0) {
+        my_pid = getpid();
+    } else if (my_pid != getpid()) {
+        // We've forked, so we need to free the lock
+        memlock_unlock(lock);
+        my_pid = getpid();
+    }
 #endif
-    else
-#ifdef _PY37_AND_LATER
-        PyThread_tss_set(&memalloc_reentrant_key, NULL);
+
+#ifdef _WIN32
+    bool result = WAIT_OBJECT_0 == WaitForSingleObject(lock->mutex, 0); // 0ms timeout -> no wait
 #else
-        PyThread_set_key_value(memalloc_reentrant_key, NULL);
+    bool result = 0 == pthread_mutex_trylock(&lock->mutex);
 #endif
+    if (!result && g_crash_on_mutex_pass) {
+        // segfault
+        int* p = NULL;
+        *p = 0;
+        abort(); // should never reach here
+    }
+
+    return result;
 }
 
+// Cleanup function
 static inline bool
-memalloc_get_reentrant(void)
+memlock_destroy(memlock_t* lock)
 {
-#ifdef _PY37_AND_LATER
-    if (PyThread_tss_get(&memalloc_reentrant_key))
+    if (!lock)
+        return false;
+
+#ifdef _WIN32
+    return CloseHandle(lock->mutex);
 #else
-    if (PyThread_get_key_value(memalloc_reentrant_key))
+    return 0 == pthread_mutex_destroy(&lock->mutex);
 #endif
-        return true;
+}
 
-    return false;
+static inline bool
+memalloc_take_guard()
+{
+    // Ordinarily, a process-wide semaphore would require a CAS, but since this is thread-local we can just set it.
+    if (_MEMALLOC_ON_THREAD)
+        return false;
+    _MEMALLOC_ON_THREAD = true;
+    return true;
+}
+
+static inline void
+memalloc_yield_guard(void)
+{
+    // Ideally, we'd actually capture the old state within an object and restore it, but since this is
+    // a coarse-grained lock, we just set it to false.
+    _MEMALLOC_ON_THREAD = false;
 }
 
 #endif
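
The getpid() check in memlock_trylock() above exists because a child process inherits the
parent's mutex memory verbatim: if any thread held the lock at fork() time, the child sees a
lock that no surviving thread will ever release. A small POSIX-only demonstration of the
underlying problem, independent of the patch:

    // gcc -pthread fork_lock.c
    #include <pthread.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    int
    main(void)
    {
        pthread_mutex_lock(&lock); // parent holds the lock across fork()
        pid_t child = fork();
        if (child == 0) {
            // Child: the owner thread does not exist here, so without a PID check
            // and reset, this lock would stay "held" forever.
            int rc = pthread_mutex_trylock(&lock);
            printf("child trylock rc=%d (nonzero: inherited lock is stuck)\n", rc);
            _exit(0);
        }
        waitpid(child, NULL, 0);
        pthread_mutex_unlock(&lock);
        return 0;
    }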
diff --git a/ddtrace/profiling/collector/_memalloc_tb.c b/ddtrace/profiling/collector/_memalloc_tb.c
index ba79021f719..bb265fe08d5 100644
--- a/ddtrace/profiling/collector/_memalloc_tb.c
+++ b/ddtrace/profiling/collector/_memalloc_tb.c
@@ -87,6 +87,9 @@ memalloc_tb_deinit(void)
 void
 traceback_free(traceback_t* tb)
 {
+    if (!tb)
+        return;
+
     for (uint16_t nframe = 0; nframe < tb->nframe; nframe++) {
         Py_DECREF(tb->frames[nframe].filename);
         Py_DECREF(tb->frames[nframe].name);
@@ -197,11 +200,7 @@ memalloc_get_traceback(uint16_t max_nframe, void* ptr, size_t size, PyMemAllocat
     traceback->size = size;
     traceback->ptr = ptr;
 
-#ifdef _PY37_AND_LATER
     traceback->thread_id = PyThread_get_thread_ident();
-#else
-    traceback->thread_id = tstate->thread_id;
-#endif
 
     traceback->domain = domain;
diff --git a/ddtrace/profiling/collector/_pymacro.h b/ddtrace/profiling/collector/_pymacro.h
index e71ed6888b9..aa31c3d4cc1 100644
--- a/ddtrace/profiling/collector/_pymacro.h
+++ b/ddtrace/profiling/collector/_pymacro.h
@@ -13,8 +13,4 @@
 #define _PY38
 #endif
 
-#if PY_VERSION_HEX >= 0x03070000
-#define _PY37_AND_LATER
-#endif
-
 #endif
diff --git a/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml b/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml
new file mode 100644
index 00000000000..8632b62af50
--- /dev/null
+++ b/releasenotes/notes/fix-profiling-memalloc-segfault-5593ad951405a75d.yaml
@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    Fixes an issue where the memory allocation profiler can cause a segmentation fault due to
+    data races when accessing its own global data structures from multiple threads.
diff --git a/setup.py b/setup.py
index 13b0cb4a4f0..6ed826c5920 100644
--- a/setup.py
+++ b/setup.py
@@ -510,8 +510,11 @@ def get_exts_for(name):
                 "ddtrace/profiling/collector/_memalloc.c",
                 "ddtrace/profiling/collector/_memalloc_tb.c",
                 "ddtrace/profiling/collector/_memalloc_heap.c",
+                "ddtrace/profiling/collector/_memalloc_reentrant.c",
             ],
-            extra_compile_args=debug_compile_args,
+            extra_compile_args=debug_compile_args
+            + (["-D_POSIX_C_SOURCE=200809L", "-std=c11"] if CURRENT_OS != "Windows" else ["/std:c11"]),
         ),
         Extension(
             "ddtrace.internal._threads",
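
For reference, the truthy-string check duplicated in both static constructors reduces to a
small helper; this standalone sketch mirrors g_truthy_values from the patch (env_is_truthy is
a hypothetical name, not something the patch defines):

    // gcc truthy.c
    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static bool
    env_is_truthy(const char* name)
    {
        static const char* truthy[] = { "1", "true", "yes", "on", "enable", "enabled", NULL };
        const char* val = getenv(name);
        if (!val)
            return false;
        for (int i = 0; truthy[i]; i++) {
            if (strcmp(val, truthy[i]) == 0)
                return true;
        }
        return false;
    }

    int
    main(void)
    {
        // The crash-on-contention diagnostic is enabled the same way:
        // _DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS=1 (and the MEMHEAP variant).
        printf("%d\n", env_is_truthy("_DD_PROFILING_MEMALLOC_CRASH_ON_MUTEX_PASS"));
        return 0;
    }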