diff --git a/include/qemu/coroutine-tls.h b/include/qemu/coroutine-tls.h
new file mode 100644
index 00000000000..1558a826aa0
--- /dev/null
+++ b/include/qemu/coroutine-tls.h
@@ -0,0 +1,165 @@
+/*
+ * QEMU Thread Local Storage for coroutines
+ *
+ * Copyright Red Hat
+ *
+ * SPDX-License-Identifier: LGPL-2.1-or-later
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ * It is forbidden to access Thread Local Storage in coroutines because
+ * compiler optimizations may cause values to be cached across coroutine
+ * re-entry. Coroutines can run in more than one thread through the course of
+ * their life, leading bugs when stale TLS values from the wrong thread are
+ * used as a result of compiler optimization.
+ *
+ * An example is:
+ *
+ * ..code-block:: c
+ *   :caption: A coroutine that may see the wrong TLS value
+ *
+ *   static __thread AioContext *current_aio_context;
+ *   ...
+ *   static void coroutine_fn foo(void)
+ *   {
+ *       aio_notify(current_aio_context);
+ *       qemu_coroutine_yield();
+ *       aio_notify(current_aio_context); // <-- may be stale after yielding!
+ *   }
+ *
+ * This header provides macros for safely defining variables in Thread Local
+ * Storage:
+ *
+ * ..code-block:: c
+ *   :caption: A coroutine that safely uses TLS
+ *
+ *   QEMU_DEFINE_STATIC_CO_TLS(AioContext *, current_aio_context)
+ *   ...
+ *   static void coroutine_fn foo(void)
+ *   {
+ *       aio_notify(get_current_aio_context());
+ *       qemu_coroutine_yield();
+ *       aio_notify(get_current_aio_context()); // <-- safe
+ *   }
+ */
+
+#ifndef QEMU_COROUTINE_TLS_H
+#define QEMU_COROUTINE_TLS_H
+
+/*
+ * To stop the compiler from caching TLS values we define accessor functions
+ * with __attribute__((noinline)) plus asm volatile("") to prevent
+ * optimizations that override noinline.
+ *
+ * The compiler can still analyze noinline code and make optimizations based on
+ * that knowledge, so an inline asm output operand is used to prevent
+ * optimizations that make assumptions about the address of the TLS variable.
+ *
+ * This is fragile and ultimately needs to be solved by a mechanism that is
+ * guaranteed to work by the compiler (e.g. stackless coroutines), but for now
+ * we use this approach to prevent issues.
+ */
+
+/**
+ * QEMU_DECLARE_CO_TLS:
+ * @type: the variable's C type
+ * @var: the variable name
+ *
+ * Declare an extern variable in Thread Local Storage from a header file:
+ *
+ * .. code-block:: c
+ *   :caption: Declaring an extern variable in Thread Local Storage
+ *
+ *   QEMU_DECLARE_CO_TLS(int, my_count)
+ *   ...
+ *   int c = get_my_count();
+ *   set_my_count(c + 1);
+ *   *get_ptr_my_count() = 0;
+ *
+ * This is a coroutine-safe replacement for the __thread keyword and is
+ * equivalent to the following code:
+ *
+ * .. code-block:: c
+ *   :caption: Declaring a TLS variable using __thread
+ *
+ *   extern __thread int my_count;
+ *   ...
+ *   int c = my_count;
+ *   my_count = c + 1;
+ *   *(&my_count) = 0;
+ */
+#define QEMU_DECLARE_CO_TLS(type, var)                                       \
+    __attribute__((noinline)) type get_##var(void);                          \
+    __attribute__((noinline)) void set_##var(type v);                        \
+    __attribute__((noinline)) type *get_ptr_##var(void);
+
+/**
+ * QEMU_DEFINE_CO_TLS:
+ * @type: the variable's C type
+ * @var: the variable name
+ *
+ * Define a variable in Thread Local Storage that was previously declared from
+ * a header file with QEMU_DECLARE_CO_TLS():
+ *
+ * .. code-block:: c
+ *   :caption: Defining a variable in Thread Local Storage
+ *
+ *   QEMU_DEFINE_CO_TLS(int, my_count)
+ *
+ * This is a coroutine-safe replacement for the __thread keyword and is
+ * equivalent to the following code:
+ *
+ * .. code-block:: c
+ *   :caption: Defining a TLS variable using __thread
+ *
+ *   __thread int my_count;
+ */
+#define QEMU_DEFINE_CO_TLS(type, var)                                        \
+    static __thread type co_tls_##var;                                       \
+    type get_##var(void) { asm volatile(""); return co_tls_##var; }          \
+    void set_##var(type v) { asm volatile(""); co_tls_##var = v; }           \
+    type *get_ptr_##var(void)                                                \
+    { type *ptr = &co_tls_##var; asm volatile("" : "+rm" (ptr)); return ptr; }
+
+/**
+ * QEMU_DEFINE_STATIC_CO_TLS:
+ * @type: the variable's C type
+ * @var: the variable name
+ *
+ * Define a static variable in Thread Local Storage:
+ *
+ * .. code-block:: c
+ *   :caption: Defining a static variable in Thread Local Storage
+ *
+ *   QEMU_DEFINE_STATIC_CO_TLS(int, my_count)
+ *   ...
+ *   int c = get_my_count();
+ *   set_my_count(c + 1);
+ *   *get_ptr_my_count() = 0;
+ *
+ * This is a coroutine-safe replacement for the __thread keyword and is
+ * equivalent to the following code:
+ *
+ * .. code-block:: c
+ *   :caption: Defining a static TLS variable using __thread
+ *
+ *   static __thread int my_count;
+ *   ...
+ *   int c = my_count;
+ *   my_count = c + 1;
+ *   *(&my_count) = 0;
+ */
+#define QEMU_DEFINE_STATIC_CO_TLS(type, var)                                 \
+    static __thread type co_tls_##var;                                       \
+    static __attribute__((noinline, unused))                                 \
+    type get_##var(void)                                                     \
+    { asm volatile(""); return co_tls_##var; }                               \
+    static __attribute__((noinline, unused))                                 \
+    void set_##var(type v)                                                   \
+    { asm volatile(""); co_tls_##var = v; }                                  \
+    static __attribute__((noinline, unused))                                 \
+    type *get_ptr_##var(void)                                                \
+    { type *ptr = &co_tls_##var; asm volatile("" : "+rm" (ptr)); return ptr; }
+
+#endif /* QEMU_COROUTINE_TLS_H */
diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
index 515d327cf11..15b7bc2d67c 100644
--- a/include/qemu/rcu.h
+++ b/include/qemu/rcu.h
@@ -28,6 +28,7 @@
 #include "qemu/queue.h"
 #include "qemu/atomic.h"
 #include "qemu/sys_membarrier.h"
+#include "qemu/coroutine-tls.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -68,11 +69,11 @@ struct rcu_reader_data {
     QLIST_ENTRY(rcu_reader_data) node;
 };
 
-extern __thread struct rcu_reader_data rcu_reader;
+QEMU_DECLARE_CO_TLS(struct rcu_reader_data, rcu_reader)
 
 static inline void rcu_read_lock(void)
 {
-    struct rcu_reader_data *p_rcu_reader = &rcu_reader;
+    struct rcu_reader_data *p_rcu_reader = get_ptr_rcu_reader();
     unsigned ctr;
 
     if (p_rcu_reader->depth++ > 0) {
@@ -88,7 +89,7 @@ static inline void rcu_read_lock(void)
 
 static inline void rcu_read_unlock(void)
 {
-    struct rcu_reader_data *p_rcu_reader = &rcu_reader;
+    struct rcu_reader_data *p_rcu_reader = get_ptr_rcu_reader();
 
     assert(p_rcu_reader->depth != 0);
     if (--p_rcu_reader->depth > 0) {
diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index a7ee431187a..581b7d73da1 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "monitor/monitor.h"
+#include "qemu/coroutine-tls.h"
 #include "qapi/error.h"
 #include "qapi/qapi-commands-machine.h"
 #include "qapi/qapi-commands-misc.h"
@@ -472,11 +473,11 @@ bool qemu_in_vcpu_thread(void)
     return current_cpu && qemu_cpu_is_self(current_cpu);
 }
 
-static __thread bool iothread_locked = false;
+QEMU_DEFINE_STATIC_CO_TLS(bool, iothread_locked)
 
 bool qemu_mutex_iothread_locked(void)
 {
-    return iothread_locked;
+    return get_iothread_locked();
 }
 
 /*
@@ -489,13 +490,13 @@ void qemu_mutex_lock_iothread_impl(const char *file, int line)
 
     g_assert(!qemu_mutex_iothread_locked());
     bql_lock(&qemu_global_mutex, file, line);
-    iothread_locked = true;
+    set_iothread_locked(true);
 }
 
 void qemu_mutex_unlock_iothread(void)
 {
     g_assert(qemu_mutex_iothread_locked());
-    iothread_locked = false;
+    set_iothread_locked(false);
     qemu_mutex_unlock(&qemu_global_mutex);
 }
 
diff --git a/tests/unit/rcutorture.c b/tests/unit/rcutorture.c
index de6f649058e..495a4e6f420 100644
--- a/tests/unit/rcutorture.c
+++ b/tests/unit/rcutorture.c
@@ -122,7 +122,7 @@ static void *rcu_read_perf_test(void *arg)
 
     rcu_register_thread();
 
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
     qatomic_inc(&nthreadsrunning);
     while (goflag == GOFLAG_INIT) {
         g_usleep(1000);
@@ -148,7 +148,7 @@ static void *rcu_update_perf_test(void *arg)
 
     rcu_register_thread();
 
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
     qatomic_inc(&nthreadsrunning);
     while (goflag == GOFLAG_INIT) {
         g_usleep(1000);
@@ -253,7 +253,7 @@ static void *rcu_read_stress_test(void *arg)
 
     rcu_register_thread();
 
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
     while (goflag == GOFLAG_INIT) {
         g_usleep(1000);
     }
@@ -304,7 +304,7 @@ static void *rcu_update_stress_test(void *arg)
     struct rcu_stress *cp = qatomic_read(&rcu_stress_current);
 
     rcu_register_thread();
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
 
     while (goflag == GOFLAG_INIT) {
         g_usleep(1000);
@@ -347,7 +347,7 @@ static void *rcu_fake_update_stress_test(void *arg)
 {
     rcu_register_thread();
 
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
     while (goflag == GOFLAG_INIT) {
         g_usleep(1000);
     }
diff --git a/tests/unit/test-rcu-list.c b/tests/unit/test-rcu-list.c
index 49641e19366..64b81ae0581 100644
--- a/tests/unit/test-rcu-list.c
+++ b/tests/unit/test-rcu-list.c
@@ -171,7 +171,7 @@ static void *rcu_q_reader(void *arg)
 
     rcu_register_thread();
 
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
     qatomic_inc(&nthreadsrunning);
     while (qatomic_read(&goflag) == GOFLAG_INIT) {
         g_usleep(1000);
@@ -206,7 +206,7 @@ static void *rcu_q_updater(void *arg)
     long long n_removed_local = 0;
     struct list_element *el, *prev_el;
 
-    *(struct rcu_reader_data **)arg = &rcu_reader;
+    *(struct rcu_reader_data **)arg = get_ptr_rcu_reader();
     qatomic_inc(&nthreadsrunning);
     while (qatomic_read(&goflag) == GOFLAG_INIT) {
         g_usleep(1000);
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
index 904b375192c..127d5a13c8e 100644
--- a/util/coroutine-ucontext.c
+++ b/util/coroutine-ucontext.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include <ucontext.h>
 #include "qemu/coroutine_int.h"
+#include "qemu/coroutine-tls.h"
 
 #ifdef CONFIG_VALGRIND_H
 #include <valgrind/valgrind.h>
@@ -66,8 +67,8 @@ typedef struct {
 /**
  * Per-thread coroutine bookkeeping
  */
-static __thread CoroutineUContext leader;
-static __thread Coroutine *current;
+QEMU_DEFINE_STATIC_CO_TLS(Coroutine *, current);
+QEMU_DEFINE_STATIC_CO_TLS(CoroutineUContext, leader);
 
 /*
  * va_args to makecontext() must be type 'int', so passing
@@ -97,14 +98,15 @@ static inline __attribute__((always_inline))
 void finish_switch_fiber(void *fake_stack_save)
 {
 #ifdef CONFIG_ASAN
+    CoroutineUContext *leaderp = get_ptr_leader();
     const void *bottom_old;
     size_t size_old;
 
     __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old);
 
-    if (!leader.stack) {
-        leader.stack = (void *)bottom_old;
-        leader.stack_size = size_old;
+    if (!leaderp->stack) {
+        leaderp->stack = (void *)bottom_old;
+        leaderp->stack_size = size_old;
     }
 #endif
 #ifdef CONFIG_TSAN
@@ -161,8 +163,10 @@ static void coroutine_trampoline(int i0, int i1)
 
     /* Initialize longjmp environment and switch back the caller */
     if (!sigsetjmp(self->env, 0)) {
-        start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, leader.stack,
-                                leader.stack_size);
+        CoroutineUContext *leaderp = get_ptr_leader();
+
+        start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save,
+                                leaderp->stack, leaderp->stack_size);
         start_switch_fiber_tsan(&fake_stack_save, self, true); /* true=caller */
         siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
     }
@@ -297,7 +301,7 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
     int ret;
     void *fake_stack_save = NULL;
 
-    current = to_;
+    set_current(to_);
 
     ret = sigsetjmp(from->env, 0);
     if (ret == 0) {
@@ -315,18 +319,24 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
 
 Coroutine *qemu_coroutine_self(void)
 {
-    if (!current) {
-        current = &leader.base;
+    Coroutine *self = get_current();
+    CoroutineUContext *leaderp = get_ptr_leader();
+
+    if (!self) {
+        self = &leaderp->base;
+        set_current(self);
     }
 #ifdef CONFIG_TSAN
-    if (!leader.tsan_co_fiber) {
-        leader.tsan_co_fiber = __tsan_get_current_fiber();
+    if (!leaderp->tsan_co_fiber) {
+        leaderp->tsan_co_fiber = __tsan_get_current_fiber();
     }
 #endif
-    return current;
+    return self;
 }
 
 bool qemu_in_coroutine(void)
 {
-    return current && current->caller;
+    Coroutine *self = get_current();
+
+    return self && self->caller;
 }
diff --git a/util/coroutine-win32.c b/util/coroutine-win32.c
index de6bd4fd3e4..c02a62c8969 100644
--- a/util/coroutine-win32.c
+++ b/util/coroutine-win32.c
@@ -25,6 +25,7 @@
 #include "qemu/osdep.h"
 #include "qemu-common.h"
 #include "qemu/coroutine_int.h"
+#include "qemu/coroutine-tls.h"
 
 typedef struct
 {
@@ -34,8 +35,8 @@ typedef struct
     CoroutineAction action;
 } CoroutineWin32;
 
-static __thread CoroutineWin32 leader;
-static __thread Coroutine *current;
+QEMU_DEFINE_STATIC_CO_TLS(CoroutineWin32, leader);
+QEMU_DEFINE_STATIC_CO_TLS(Coroutine *, current);
 
 /* This function is marked noinline to prevent GCC from inlining it
  * into coroutine_trampoline(). If we allow it to do that then it
@@ -52,7 +53,7 @@ qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
     CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_);
     CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_);
 
-    current = to_;
+    set_current(to_);
 
     to->action = action;
     SwitchToFiber(to->fiber);
@@ -89,14 +90,21 @@ void qemu_coroutine_delete(Coroutine *co_)
 
 Coroutine *qemu_coroutine_self(void)
 {
+    Coroutine *current = get_current();
+
     if (!current) {
-        current = &leader.base;
-        leader.fiber = ConvertThreadToFiber(NULL);
+        CoroutineWin32 *leader = get_ptr_leader();
+
+        current = &leader->base;
+        set_current(current);
+        leader->fiber = ConvertThreadToFiber(NULL);
     }
     return current;
 }
 
 bool qemu_in_coroutine(void)
 {
+    Coroutine *current = get_current();
+
     return current && current->caller;
 }
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index 38fb6d3084d..22071dac4b8 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -18,6 +18,7 @@
 #include "qemu/atomic.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "qemu/coroutine-tls.h"
 #include "block/aio.h"
 
 enum {
@@ -27,17 +28,20 @@ enum {
 /** Free list to speed up creation */
 static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
 static unsigned int release_pool_size;
-static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
-static __thread unsigned int alloc_pool_size;
-static __thread Notifier coroutine_pool_cleanup_notifier;
+
+typedef QSLIST_HEAD(, Coroutine) CoroutineQSList;
+QEMU_DEFINE_STATIC_CO_TLS(CoroutineQSList, alloc_pool);
+QEMU_DEFINE_STATIC_CO_TLS(unsigned int, alloc_pool_size);
+QEMU_DEFINE_STATIC_CO_TLS(Notifier, coroutine_pool_cleanup_notifier);
 
 static void coroutine_pool_cleanup(Notifier *n, void *value)
 {
     Coroutine *co;
     Coroutine *tmp;
+    CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
 
-    QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
-        QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+    QSLIST_FOREACH_SAFE(co, alloc_pool, pool_next, tmp) {
+        QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
         qemu_coroutine_delete(co);
     }
 }
@@ -47,27 +51,30 @@ Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
     Coroutine *co = NULL;
 
     if (CONFIG_COROUTINE_POOL) {
-        co = QSLIST_FIRST(&alloc_pool);
+        CoroutineQSList *alloc_pool = get_ptr_alloc_pool();
+
+        co = QSLIST_FIRST(alloc_pool);
         if (!co) {
             if (release_pool_size > POOL_BATCH_SIZE) {
                 /* Slow path; a good place to register the destructor, too.  */
-                if (!coroutine_pool_cleanup_notifier.notify) {
-                    coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
-                    qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
+                Notifier *notifier = get_ptr_coroutine_pool_cleanup_notifier();
+                if (!notifier->notify) {
+                    notifier->notify = coroutine_pool_cleanup;
+                    qemu_thread_atexit_add(notifier);
                 }
 
                 /* This is not exact; there could be a little skew between
                  * release_pool_size and the actual size of release_pool.  But
                  * it is just a heuristic, it does not need to be perfect.
                  */
-                alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
-                QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
-                co = QSLIST_FIRST(&alloc_pool);
+                set_alloc_pool_size(qatomic_xchg(&release_pool_size, 0));
+                QSLIST_MOVE_ATOMIC(alloc_pool, &release_pool);
+                co = QSLIST_FIRST(alloc_pool);
             }
         }
         if (co) {
-            QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
-            alloc_pool_size--;
+            QSLIST_REMOVE_HEAD(alloc_pool, pool_next);
+            set_alloc_pool_size(get_alloc_pool_size() - 1);
         }
     }
 
@@ -91,9 +98,9 @@ static void coroutine_delete(Coroutine *co)
             qatomic_inc(&release_pool_size);
             return;
         }
-        if (alloc_pool_size < POOL_BATCH_SIZE) {
-            QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
-            alloc_pool_size++;
+        if (get_alloc_pool_size() < POOL_BATCH_SIZE) {
+            QSLIST_INSERT_HEAD(get_ptr_alloc_pool(), co, pool_next);
+            set_alloc_pool_size(get_alloc_pool_size() + 1);
             return;
         }
     }
diff --git a/util/rcu.c b/util/rcu.c
index 13ac0f75cb2..40062b8ed20 100644
--- a/util/rcu.c
+++ b/util/rcu.c
@@ -64,7 +64,7 @@ static inline int rcu_gp_ongoing(unsigned long *ctr)
 /* Written to only by each individual reader. Read by both the reader and the
  * writers.
  */
-__thread struct rcu_reader_data rcu_reader;
+QEMU_DEFINE_CO_TLS(struct rcu_reader_data, rcu_reader)
 
 /* Protected by rcu_registry_lock.  */
 typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
@@ -350,16 +350,16 @@ void drain_call_rcu(void)
 
 void rcu_register_thread(void)
 {
-    assert(rcu_reader.ctr == 0);
+    assert(get_ptr_rcu_reader()->ctr == 0);
     qemu_mutex_lock(&rcu_registry_lock);
-    QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
+    QLIST_INSERT_HEAD(&registry, get_ptr_rcu_reader(), node);
     qemu_mutex_unlock(&rcu_registry_lock);
 }
 
 void rcu_unregister_thread(void)
 {
     qemu_mutex_lock(&rcu_registry_lock);
-    QLIST_REMOVE(&rcu_reader, node);
+    QLIST_REMOVE(get_ptr_rcu_reader(), node);
     qemu_mutex_unlock(&rcu_registry_lock);
 }