Skip to content

Commit

Permalink
Multiple Pals could return spuriously from wake on address (#739)
Browse files Browse the repository at this point in the history
* Add stress test benchmark

Co-authored-by: Alexander Nadeau <[email protected]>

* Add defensive code against spurious wakeup

This commit checks that wait_on_address has not returned spuriously.

* pal: spurious wake up.

The code in the Pal for wake on address was incorrectly assuming that the operation returning success meant the watched value had actually changed.  The specification allows for spurious wake-ups.

This change makes the Pals recheck for a change.

---------

Co-authored-by: Alexander Nadeau <[email protected]>
  • Loading branch information
mjp41 and wareya authored Feb 4, 2025
1 parent e3e5584 commit 32495fd
Show file tree
Hide file tree
Showing 7 changed files with 153 additions and 30 deletions.
4 changes: 4 additions & 0 deletions src/snmalloc/ds/combininglock.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,10 @@ namespace snmalloc
expected, LockStatus::SLEEPING, stl::memory_order_acq_rel))
{
Pal::wait_on_address(status, LockStatus::SLEEPING);
if (status.load(stl::memory_order_acquire) == LockStatus::SLEEPING)
{
error("Corruption in core locking primitive. Aborting execution.");
}
}
}
}
Expand Down
24 changes: 8 additions & 16 deletions src/snmalloc/pal/pal_apple.h
Original file line number Diff line number Diff line change
Expand Up @@ -327,24 +327,16 @@ namespace snmalloc
while (addr.load(stl::memory_order_relaxed) == expected)
{
# ifdef SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS
if (
os_sync_wait_on_address(
&addr, static_cast<uint64_t>(expected), sizeof(T), 0) != -1)
{
errno = errno_backup;
return;
}
os_sync_wait_on_address(
&addr, static_cast<uint64_t>(expected), sizeof(T), 0);
# else
if (
__ulock_wait(
UL_COMPARE_AND_WAIT | ULF_NO_ERRNO,
&addr,
static_cast<uint64_t>(expected),
0) != -1)
{
return;
}
__ulock_wait(
UL_COMPARE_AND_WAIT | ULF_NO_ERRNO,
&addr,
static_cast<uint64_t>(expected),
0);
# endif
errno = errno_backup;
}
}

Expand Down
5 changes: 1 addition & 4 deletions src/snmalloc/pal/pal_freebsd.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,12 @@ namespace snmalloc
int backup = errno;
while (addr.load(stl::memory_order_relaxed) == expected)
{
int ret = _umtx_op(
_umtx_op(
&addr,
UMTX_OP_WAIT_UINT_PRIVATE,
static_cast<unsigned long>(expected),
nullptr,
nullptr);

if (ret == 0)
break;
}
errno = backup;
}
Expand Down
20 changes: 16 additions & 4 deletions src/snmalloc/pal/pal_linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,11 +255,23 @@ namespace snmalloc
"T must be the same size and alignment as WaitingWord");
while (addr.load(stl::memory_order_relaxed) == expected)
{
long ret = syscall(
// Man page
// (https://www.man7.org/linux/man-pages/man2/futex.2.html#RETURN_VALUE)
// says:
// FUTEX_WAIT
// Returns 0 if the caller was woken up. Note that a wake-up
// can also be caused by common futex usage patterns in
// unrelated code that happened to have previously used the
// futex word's memory location (e.g., typical futex-based
// implementations of Pthreads mutexes can cause this under
// some conditions). Therefore, callers should always
// conservatively assume that a return value of 0 can mean a
// spurious wake-up, and use the futex word's value (i.e., the
// user-space synchronization scheme) to decide whether to
// continue to block or not.
// We ignore the return and recheck.
syscall(
SYS_futex, &addr, FUTEX_WAIT_PRIVATE, expected, nullptr, nullptr, 0);

if (ret == 0)
break;
}
errno = backup;
}
Expand Down
5 changes: 1 addition & 4 deletions src/snmalloc/pal/pal_openbsd.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,12 @@ namespace snmalloc
"T must be the same size and alignment as WaitingWord");
while (addr.load(std::memory_order_relaxed) == expected)
{
long ret = futex(
futex(
(uint32_t*)&addr,
FUTEX_WAIT_PRIVATE,
static_cast<int>(expected),
nullptr,
nullptr);

if (ret == 0)
break;
}
errno = backup;
}
Expand Down
3 changes: 1 addition & 2 deletions src/snmalloc/pal/pal_windows.h
Original file line number Diff line number Diff line change
Expand Up @@ -336,8 +336,7 @@ namespace snmalloc
{
while (addr.load(stl::memory_order_relaxed) == expected)
{
if (::WaitOnAddress(&addr, &expected, sizeof(T), INFINITE))
break;
::WaitOnAddress(&addr, &expected, sizeof(T), INFINITE);
}
}

Expand Down
122 changes: 122 additions & 0 deletions src/test/perf/lotsofthreads/lotsofthread.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
/**
* This benchmark is based on
* https://github.com/microsoft/mimalloc/issues/1002#issuecomment-2630410617
*
 * It causes large batches of memory to be freed on a remote thread, and puts
 * many aspects of the backend under contention.
*
* The benchmark has a single freeing thread, and many allocating threads. The
* allocating threads communicate using a shared list of memory to free, which
* is protected by a mutex. This causes interesting batch behaviour which
* triggered a bug in the linux backend.
*/
#include <assert.h>
#include <atomic>
#include <mutex>
#include <stdio.h>
#include <stdlib.h>
#include <thread>
#include <vector>
using namespace std;

#include <snmalloc/snmalloc.h>
#define malloc snmalloc::libc::malloc
#define free snmalloc::libc::free
#define malloc_usable_size snmalloc::libc::malloc_usable_size

// Protects global_tofree_list; shared between the allocating threads
// (producers) and the single freeing thread (consumer).
std::mutex global_tofree_list_mtx;
// Pointers handed off by allocating threads for the freeing thread to release.
std::vector<void*> global_tofree_list;

// Set to non-zero by main() once all allocating threads have joined; tells the
// freeing thread to exit after its next drain of the list.
std::atomic_int mustexit;

/**
 * Drain thread body: repeatedly takes the shared lock, frees every pointer
 * currently on the shared list, and reports the largest batch (in bytes)
 * observed so far.  Exits once `mustexit` has been set.
 *
 * Note: the lock is deliberately held across the whole drain (including the
 * printf) — this matches the contention pattern the benchmark exercises.
 */
void freeloop()
{
  size_t largest_batch = 0;
  for (;;)
  {
    std::lock_guard<std::mutex> guard{global_tofree_list_mtx};

    size_t batch_bytes = 0;
    for (void* p : global_tofree_list)
    {
      batch_bytes += malloc_usable_size(p);
      free(p);
    }
    global_tofree_list.clear();

    // Report a new high-water mark for the batch size.
    if (batch_bytes > largest_batch)
    {
      printf("%zd bytes\n", batch_bytes);
      largest_batch = batch_bytes;
    }

    if (mustexit)
      return;
  }
}

/**
 * Allocating thread body: performs `iterations` rounds of allocations
 * (power-of-two sizes cycling through 1..2^19 ints, 8 allocations per round)
 * and hands the resulting pointers to the freeing thread via the shared list.
 *
 * @param iterations Number of allocation rounds to perform.
 */
void looper(size_t iterations)
{
  // Locally accumulated pointers, published to the shared list in batches.
  std::vector<void*> pending;

  // Move everything accumulated locally onto the shared list, then reset the
  // local batch.
  auto publish = [&]() {
    {
      std::lock_guard<std::mutex> guard{global_tofree_list_mtx};
      for (void* p : pending)
        global_tofree_list.push_back(p);
    }
    pending.clear();
  };

  // Queue a pointer locally; publish once the local batch exceeds 100
  // entries so the freeing thread sees interesting batch sizes.
  auto enqueue = [&](void* p) {
    pending.push_back(p);
    if (pending.size() > 100)
      publish();
  };

  for (size_t round = 0; round < iterations; ++round)
  {
    // Cycle through power-of-two element counts: 2^(round mod 20).
    size_t elems = snmalloc::bits::one_at_bit(round % 20);
    for (size_t rep = 0; rep < 8; ++rep)
    {
      int* ptr = static_cast<int*>(malloc(elems * sizeof(int)));
      if (ptr == nullptr)
        continue;
      // Touch the allocation so it cannot be elided.
      *ptr = 1523;
      enqueue(ptr);
    }
  }

  publish();
}

int main()
{
#ifdef SNMALLOC_THREAD_SANITIZER_ENABLED
size_t iterations = 50000;
#elif defined(__APPLE__) && !defined(SNMALLOC_APPLE_HAS_OS_SYNC_WAIT_ON_ADDRESS)
size_t iterations = 50000;
#else
size_t iterations = 200000;
#endif

int threadcount = 8;
vector<thread> threads;

for (int i = 0; i < threadcount; ++i)
threads.emplace_back(looper, iterations);

std::thread freeloop_thread(freeloop);

for (auto& thread : threads)
{
thread.join();
}

mustexit.store(1);
freeloop_thread.join();

puts("Done!");

return 0;
}

0 comments on commit 32495fd

Please sign in to comment.