Refactor: Remove unused features and functions, and move most allocator operations to a global namespace. (#750)

* Factor out explicit Config type

Instead of using snmalloc::Alloc::Config, expose snmalloc::Config, which is then used to derive the allocator type.
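As a rough sketch (the `Config` shown is the one from the release-notes diff in this commit; deriving `Alloc` via `LocalAllocator` is an assumed illustration of the new shape, not code from the commit):

```cpp
#include <atomic>
#include <snmalloc/snmalloc.h> // header path assumed

// The configuration is now exposed directly...
using Config = snmalloc::StandardConfigClientMeta<
  snmalloc::ArrayClientMetaDataProvider<std::atomic<size_t>>>;

// ...and the allocator type is derived from it, rather than the
// config being fished out of the allocator type. (Assumed shape.)
using Alloc = snmalloc::LocalAllocator<Config>;
```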

* Move globalalloc to front end.

* Remove unneeded template parameter from global snmalloc functions.

* Remove SNMALLOC_PASS_THROUGH

VeronaRT now has an abstraction layer that can easily replace the allocator.
Keeping such a complex integration in snmalloc no longer makes sense.

* Take some global functions off of local alloc.

* Drop comparison overloads on atomic Capptr.

Performing a comparison on two atomic pointers is a complex operation and should not be implicit. The memory ordering needs to be considered by the caller.
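A hedged caller-side sketch (a `load` taking a memory order on the atomic wrapper, and the acquire order itself, are assumptions; the point is that the ordering becomes an explicit choice at the call site):

```cpp
#include <atomic>

// Hypothetical comparison of two atomic capability pointers: each side is
// loaded with an explicitly chosen memory order before the raw pointers
// are compared, instead of hiding both loads behind operator==.
template<typename AtomicPtr>
bool same_target(const AtomicPtr& a, const AtomicPtr& b)
{
  auto lhs = a.load(std::memory_order_acquire); // order chosen by caller
  auto rhs = b.load(std::memory_order_acquire);
  return lhs.unsafe_ptr() == rhs.unsafe_ptr();
}
```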

* Remove function_ref and use templates

The implementation prefers templates over function_ref, which now exists only in the PAL for a currently unused feature.
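The replacement pattern, as a sketch (`enqueue` is a hypothetical name; `SNMALLOC_FAST_PATH` is the macro used elsewhere in this commit):

```cpp
// Before (removed from helpers.h below): dynamic dispatch through a
// type-erased, non-owning wrapper.
//   void enqueue(function_ref<void(void*)> fn);

// After: the callable is a template parameter, so each call site is
// statically dispatched and can be inlined.
template<typename Fn>
SNMALLOC_FAST_PATH void enqueue(void* msg, Fn&& fn)
{
  fn(msg);
}
```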

* Removing function_ref reduces stl needs.

* Remove use of __is_convertible to support older g++

* Inline function that is only used once.

* Remove unused function

* Restrict ThreadAlloc usage to globalalloc

This commit introduces various inline functions in the snmalloc:: namespace that perform allocation/deallocation using the thread-local allocator.

These remove all direct ThreadAlloc usage from a particular test.
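A hedged sketch of the new call shape (the exact function names on `snmalloc::` are assumptions):

```cpp
#include <snmalloc/snmalloc.h> // header path assumed

void example()
{
  // These free functions find the thread-local allocator internally,
  // so tests never name ThreadAlloc directly.
  void* p = snmalloc::alloc(128); // name assumed
  snmalloc::dealloc(p);           // name assumed
}
```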

* Move cheri checks to own file.

* Refactor is_owned checks.

* Move alloc_size and check_size to globalalloc.

* Minor simplification of dealloc path

* Fix up is_owned to take a config
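Sketched call shape under assumption (the exact signature is not shown in this diff):

```cpp
void maybe_free(void* p)
{
  // Hypothetical: the config selects which snmalloc instance's pagemap
  // answers the ownership question.
  if (snmalloc::is_owned<snmalloc::Config>(p))
    snmalloc::dealloc(p); // name assumed, as above
}
```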

* Improve usage of scoped allocator.
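A usage sketch mirroring the fuzzer change below; note that `dealloc` on the scoped allocator no longer needs the size:

```cpp
void scoped_example()
{
  // RAII handle: the allocator is flushed when `alloc` leaves scope.
  auto alloc = snmalloc::get_scoped_allocator();
  void* p = alloc->alloc(64);
  alloc->dealloc(p); // size argument dropped, as in the fuzzer diff
}
```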

* Handle Config_ in globalalloc.
mjp41 authored Feb 22, 2025
1 parent 6d50141 commit 5f7baef
Showing 50 changed files with 819 additions and 1,404 deletions.
15 changes: 1 addition & 14 deletions CMakeLists.txt
@@ -574,22 +574,9 @@ endif()
 endif()
 
 if (SNMALLOC_BUILD_TESTING)
-  if (WIN32
-    OR (CMAKE_SYSTEM_NAME STREQUAL NetBSD)
-    OR (CMAKE_SYSTEM_NAME STREQUAL OpenBSD)
-    OR (CMAKE_SYSTEM_NAME STREQUAL SunOS))
-    # Windows does not support aligned allocation well enough
-    # for pass through.
-    # NetBSD, OpenBSD and DragonFlyBSD do not support malloc*size calls.
-    set(FLAVOURS fast;check)
-  else()
-    set(FLAVOURS fast;check;malloc)
-  endif()
+  set(FLAVOURS fast;check)
 
   foreach(FLAVOUR ${FLAVOURS})
-    if (${FLAVOUR} STREQUAL "malloc")
-      set(DEFINES SNMALLOC_PASS_THROUGH)
-    endif()
     if (${FLAVOUR} STREQUAL "check")
       set(DEFINES SNMALLOC_CHECK_CLIENT)
     endif()
6 changes: 3 additions & 3 deletions docs/release/0.7/README.md
@@ -109,10 +109,10 @@ The additional meta-data size in snmalloc 0.6 was fixed and under a cache line i
 In snmalloc 0.7, we have made this meta-data size configurable.
 This allows developers to build new security features on top of snmalloc.
 
-For instance, building snmalloc with the following definition of `Alloc` will allow you to store a 64-bit counter for each allocation:
+For instance, building snmalloc with the following definition of `Config` will allow you to store a 64-bit counter for each allocation:
 ```cpp
-using Alloc = snmalloc::LocalAllocator<snmalloc::StandardConfigClientMeta<
-  ArrayClientMetaDataProvider<std::atomic<size_t>>>>;
+using Config = snmalloc::StandardConfigClientMeta<
+  ArrayClientMetaDataProvider<std::atomic<size_t>>>;
 ```
 
 This does not affect the underlying alignment of the allocations.
2 changes: 1 addition & 1 deletion fuzzing/snmalloc-fuzzer.cpp
@@ -148,7 +148,7 @@ struct Result
   {
     auto alloc = snmalloc::get_scoped_allocator();
     if (ptr)
-      alloc->dealloc(ptr, size);
+      alloc->dealloc(ptr);
     ptr = nullptr;
   }
 };
2 changes: 1 addition & 1 deletion src/snmalloc/README.md
@@ -20,7 +20,7 @@ These are arranged in a hierarchy such that each of the directories may include
   This includes data structures such as pagemap implementations (efficient maps from a chunk address to associated metadata) and buddy allocators for managing address-space ranges.
 - `backend/` provides some example implementations for snmalloc embeddings that provide a global memory allocator for an address space.
   Users may ignore this entirely and use the types in `mem/` with a custom back end to expose an snmalloc instance with specific behaviour.
-  Layers above this can be used with a custom configuration by defining `SNMALLOC_PROVIDE_OWN_CONFIG` and exporting a type as `snmalloc::Alloc` that defines the type of an `snmalloc::LocalAllocator` template specialisation.
+  Layers above this can be used with a custom configuration by defining `SNMALLOC_PROVIDE_OWN_CONFIG` and exporting a type as `snmalloc::Config` that defines the configuration.
 - `global/` provides some front-end components that assume that snmalloc is available in a global configuration.
 - `override/` builds on top of `global/` to provide specific implementations with compatibility with external specifications (for example C `malloc`, C++ `operator new`, jemalloc's `*allocx`, or Rust's `std::alloc`).
139 changes: 139 additions & 0 deletions src/snmalloc/ds_core/cheri.h
@@ -0,0 +1,139 @@
#include "mitigations.h"

namespace snmalloc
{
/*
* Many of these tests come with an "or is null" branch that they'd need to
* add if we did them up front. Instead, defer them until we're past the
* point where we know, from the pagemap, or by explicitly testing, that the
* pointer under test is not nullptr.
*/
SNMALLOC_FAST_PATH_INLINE void dealloc_cheri_checks(void* p)
{
#if defined(__CHERI_PURE_CAPABILITY__)
/*
* Enforce the use of an unsealed capability.
*
* TODO In CHERI+MTE, this is part of the CAmoCDecVersion instruction;
* elide this test in that world.
*/
snmalloc_check_client(
mitigations(cheri_checks),
!__builtin_cheri_sealed_get(p),
"Sealed capability in deallocation");

/*
* Enforce permissions on the returned pointer. These pointers end up in
* free queues and will be cycled out to clients again, so try to catch
* erroneous behavior now, rather than later.
*
* TODO In the CHERI+MTE case, we must reconstruct the pointer for the
* free queues as part of the discovery of the start of the object (so
* that it has the correct version), and the CAmoCDecVersion call imposes
* its own requirements on the permissions (to ensure that it's at least
* not zero). They are somewhat more lax than we might wish, so this test
* may remain, guarded by SNMALLOC_CHECK_CLIENT, but no explicit
* permissions checks are required in the non-SNMALLOC_CHECK_CLIENT case
* to defend ourselves or other clients against a misbehaving client.
*/
static const size_t reqperm = CHERI_PERM_LOAD | CHERI_PERM_STORE |
CHERI_PERM_LOAD_CAP | CHERI_PERM_STORE_CAP;
snmalloc_check_client(
mitigations(cheri_checks),
(__builtin_cheri_perms_get(p) & reqperm) == reqperm,
"Insufficient permissions on capability in deallocation");

/*
* We check for a valid tag here, rather than in domestication, because
* domestication might be answering a slightly different question, about
* the plausibility of addresses rather than of exact pointers.
*
* TODO Further, in the CHERI+MTE case, the tag check will be implicit in
* a future CAmoCDecVersion instruction, and there should be no harm in
* the lookups we perform along the way to get there. In that world,
* elide this test.
*/
snmalloc_check_client(
mitigations(cheri_checks),
__builtin_cheri_tag_get(p),
"Untagged capability in deallocation");

/*
* Verify that the capability is not zero-length, ruling out the other
* edge case around monotonicity.
*/
snmalloc_check_client(
mitigations(cheri_checks),
__builtin_cheri_length_get(p) > 0,
"Zero-length capability in deallocation");

/*
* At present we check for the pointer also being the start of an
* allocation closer to dealloc; for small objects, that happens in
* dealloc_local_object_fast, either below or *on the far end of message
* receipt*. For large objects, it happens below by directly rounding to
* power of two rather than using the is_start_of_object helper.
* (XXX This does mean that we might end up threading our remote queue
* state somewhere slightly unexpected rather than at the head of an
* object. That is perhaps fine for now?)
*/

/*
* TODO
*
* We could enforce other policies here, including that the length exactly
* match the sizeclass. At present, we bound caps we give for allocations
* to the underlying sizeclass, so even malloc(0) will have a non-zero
* length. Monotonicity would then imply that the pointer must be the
* head of an object (modulo, perhaps, temporal aliasing if we somehow
* introduced phase shifts in heap layout like some allocators do).
*
* If we switched to bounding with upwards-rounded representable bounds
* (c.f., CRRL) rather than underlying object size, then we should,
* instead, in general require plausibility of p_raw by checking that its
* length is nonzero and the snmalloc size class associated with its
* length is the one for the slab in question... except for the added
* challenge of malloc(0). Since 0 rounds up to 0, we might end up
* constructing zero-length caps to hand out, which we would then reject
* upon receipt. Instead, as part of introducing CRRL bounds, we should
* introduce a sizeclass for slabs holding zero-size objects. All told,
* we would want to check that
*
* size_to_sizeclass(length) == entry.get_sizeclass()
*
* I believe a relaxed CRRL test of
*
* length > 0 || (length == sizeclass_to_size(entry.get_sizeclass()))
*
* would also suffice and may be slightly less expensive than the test
* above, at the cost of not catching as many misbehaving clients.
*
* In either case, having bounded by CRRL bounds, we would need to be
* *reconstructing* the capabilities headed to our free lists to be given
* out to clients again; there are many more CRRL classes than snmalloc
* sizeclasses (this is the same reason that we can always get away with
* CSetBoundsExact in capptr_bound). Switching to CRRL bounds, if that's
* ever a thing we want to do, will be easier after we've done the
* plumbing for CHERI+MTE.
*/

/*
* TODO: Unsurprisingly, the CHERI+MTE case once again has something to
* say here. In that world, again, we are certain to be reconstructing
* the capability for the free queue anyway, and so exactly what we wish
* to enforce, length-wise, of the provided capability, is somewhat more
* flexible. Using the provided capability bounds when recoloring memory
* could be a natural way to enforce that it covers the entire object, at
* the cost of a more elaborate recovery story (as we risk aborting with a
* partially recolored object). On non-SNMALLOC_CHECK_CLIENT builds, it
* likely makes sense to just enforce that length > 0 (*not* enforced by
* the CAmoCDecVersion instruction) and say that any authority-bearing
* interior pointer suffices to free the object. I believe that to be an
* acceptable security posture for the allocator and between clients;
* misbehavior is confined to the misbehaving client.
*/
#else
UNUSED(p);
#endif
}
} // namespace snmalloc
1 change: 1 addition & 0 deletions src/snmalloc/ds_core/ds_core.h
@@ -8,6 +8,7 @@
  */
 
 #include "bits.h"
+#include "cheri.h"
 #include "concept.h"
 #include "defines.h"
 #include "helpers.h"
49 changes: 0 additions & 49 deletions src/snmalloc/ds_core/helpers.h
@@ -85,55 +85,6 @@ namespace snmalloc
     }
   };
 
-  /**
-   * Non-owning version of std::function. Wraps a reference to a callable object
-   * (eg. a lambda) and allows calling it through dynamic dispatch, with no
-   * allocation. This is useful in the allocator code paths, where we can't
-   * safely use std::function.
-   *
-   * Inspired by the C++ proposal:
-   * http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0792r2.html
-   */
-  template<typename Fn>
-  struct function_ref;
-
-  template<typename R, typename... Args>
-  struct function_ref<R(Args...)>
-  {
-    // The enable_if is used to stop this constructor from shadowing the default
-    // copy / move constructors.
-    template<
-      typename Fn,
-      typename =
-        stl::enable_if_t<!stl::is_same_v<stl::decay_t<Fn>, function_ref>>>
-    function_ref(Fn&& fn)
-    {
-      data_ = static_cast<void*>(&fn);
-      fn_ = execute<Fn>;
-    }
-
-    R operator()(Args... args) const
-    {
-      return fn_(data_, args...);
-    }
-
-  private:
-    void* data_;
-    R (*fn_)(void*, Args...);
-
-    template<typename Fn>
-    static R execute(void* p, Args... args)
-    {
-      return (*static_cast<stl::add_pointer_t<Fn>>(p))(args...);
-    };
-  };
-
-  template<class T, template<typename> typename Ptr>
-  void ignore(Ptr<T> t)
-  {
-    UNUSED(t);
-  }
-
   /**
    * Helper class for building fatal errors. Used by `report_fatal_error` to
    * build an on-stack buffer containing the formatted string.
15 changes: 0 additions & 15 deletions src/snmalloc/ds_core/ptrwrap.h
@@ -517,21 +517,6 @@ namespace snmalloc
       return CapPtr<T, bounds>::unsafe_from(
         this->unsafe_capptr.exchange(desired.unsafe_ptr(), order));
     }
-
-    SNMALLOC_FAST_PATH bool operator==(const AtomicCapPtr& rhs) const
-    {
-      return this->unsafe_capptr == rhs.unsafe_capptr;
-    }
-
-    SNMALLOC_FAST_PATH bool operator!=(const AtomicCapPtr& rhs) const
-    {
-      return this->unsafe_capptr != rhs.unsafe_capptr;
-    }
-
-    SNMALLOC_FAST_PATH bool operator<(const AtomicCapPtr& rhs) const
-    {
-      return this->unsafe_capptr < rhs.unsafe_capptr;
-    }
   };
 
   namespace capptr
15 changes: 9 additions & 6 deletions src/snmalloc/global/bounds_checks.h
@@ -1,4 +1,5 @@
 #pragma once
+#include "globalalloc.h"
 #include "threadalloc.h"
 
 namespace snmalloc
@@ -61,18 +62,17 @@ namespace snmalloc
       }
       else
       {
-        auto& alloc = ThreadAlloc::get();
        void* p = const_cast<void*>(ptr);
 
        auto range_end = pointer_offset(p, len);
-        auto object_end = alloc.template external_pointer<OnePastEnd>(p);
+        auto object_end = external_pointer<OnePastEnd>(p);
        report_fatal_error(
          "Fatal Error!\n{}: \n\trange [{}, {})\n\tallocation [{}, "
          "{})\nrange goes beyond allocation by {} bytes \n",
          msg,
          p,
          range_end,
-          alloc.template external_pointer<Start>(p),
+          external_pointer<Start>(p),
          object_end,
          pointer_diff(object_end, range_end));
       }
@@ -86,13 +86,16 @@
    * The template parameter indicates whether the check should be performed. It
    * defaults to true. If it is false, the check will always succeed.
    */
-  template<bool PerformCheck = true>
+  template<bool PerformCheck = true, SNMALLOC_CONCEPT(IsConfig) Config = Config>
   SNMALLOC_FAST_PATH_INLINE bool check_bounds(const void* ptr, size_t len)
   {
     if constexpr (PerformCheck)
     {
-      auto& alloc = ThreadAlloc::get();
-      return alloc.check_bounds(ptr, len);
+      if (SNMALLOC_LIKELY(Config::is_initialised()))
+      {
+        return remaining_bytes(address_cast(ptr)) >= len;
+      }
+      return true;
     }
     else
     {
1 change: 1 addition & 0 deletions src/snmalloc/global/global.h
@@ -1,4 +1,5 @@
 #include "bounds_checks.h"
+#include "globalalloc.h"
 #include "libc.h"
 #include "memcpy.h"
 #include "scopedalloc.h"
