Atomics backend refactor (#1631)

* Delete <cuda/std/atomic> header * Move atomic from libcxx to top-level * Move PTX backends from libcxx to <cuda/std/__atomic/...> * Delete remaining atomics backends. Move MSVC backend * First pass at making atomic use new backends * Change atomic_storage operator()() to get() * Fixup: Change desired of compexch to accept by value. * This matches other implementations. * Fix merge conflicts (LIBCUDACXX->CCCL) * Fix another merge conflict (LIBCUDACXX->CCCL) * Simplify tag dispatch in the atomic backend * Make tests work when full path is specified to lit. * Update barrier, latch, and semaphore, to use new atomic_impl. * Make changes to atomic work. * Rearrange headers and update latch/barrier. * Update codegen to reflect new header layout. * Make platform.h define `LIBCUDACXX_ATOMIC_BLAH_LOCK_FREE`. * We previously defined or *clobbered* the existing STL definitions. * See: `ATOMIC_BOOL_LOCK_FREE`->`LIBCUDACXX_ATOMIC_BOOL_LOCK_FREE` * Fix missing <cstdint> in generated ptx file. * `__cuda_std__` mode does not require use of host atomics checks. * Fix missing `_If` in types.h. * Fix missing <cstdint> in derived PTX file. * Remove uneeded headers from base.h. * Fix type mixup in `__atomic_wait`. * Change heterogeneous tests to permutate over H/D launcher combinations. * However this is restricted to *one* device launch per suite to prevent deadlocks. * Tests are much slower, but extremely thorough. * Concurrent H/D coverage is particularly exemplified in latch.pass.cpp. * Change tests to use `validate_pinned` API. * Fix a couple issues with `__atomic_underlying_t` in __atomics/types * [pre-commit.ci] auto code formatting * Fix `<atomic>` header include guard. * Move thread_count trait around and remove unused sink. * Fix mistakes from merging clang-format changes. * [pre-commit.ci] auto code formatting * Add system header guards to every new `__atomic` header * Delete `<atomic>` header synopsis. * Fix push/pop macros in `<atomic>`. * `ATOMIC_VAR_INIT->LIBCUDACXX_ATOMIC_VAR_INIT` * This avoids conflicting with the host's definition. * Make `<cuda/atomic>` tests include the correct header. * Fix typing with volatile atomic types. * Include correct header for `cuda::atomic`. * Fix underlying_t in `notify_wait.h` * Revert using `volatile` in latch. * Make helpers more useful in tests. * Prevent non-CUDA compilers from seeing PTX. * Make the MSVC atomic header a little more friendly. * Make the derived PTX header only visible to CUDA compilers. * Remove the defaulted scope specifier on the atomic type layer. * Remove the defaulted scope specifier from the atomics API layer. * Fix missing cassert in several tests. * Revert mistaken `LIBCUDACXX_ATOMIC_FLAG_INIT` change. * Fix bad atomic alignment errors. * [pre-commit.ci] auto code formatting * Make internal owned memory atomic APIs have a default ctor. * Fully qualify the atomics APIs in the cuda/atomic header. * Add missing type_traits to host.h Co-authored-by: Michael Schellenberger Costa <[email protected]> * Use `_LIBCUDACXX_INLINE_VISIBILITY` for API functions. * Reorder derived PTX functions attribute declarations. * traits fixups in `__atomic/types` * Default ctors and sprinkle noexcept around on some `__atomic/types` APIs. * Apply suggestions to common.h. * Remove full namespace qualifier in atomic storage trait. * modernization fixes to order.h. * Move includes cuda/atomic.h and cuda/barrier.h. * Adjust header error block and add missing includes in `<cuda/std/atomic>` * Add missing system header block in cuda/atomic * Fix invalid use of typename. * `_LIBCUDACXX_TRAITS->_CCCL_TRAIT` * [pre-commit.ci] auto code formatting * Fix return type of host atomics. * Fix cassert missing from generated atomic header. * Fix visibility of host atomics in NVRTC build. * Add more tests to bad_atomic_alignment.pass.cpp * Fix alignment warnings in host compare_exchange layer. * [pre-commit.ci] auto code formatting --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Michael Schellenberger Costa <[email protected]>
NVIDIA · May 10, 2024 · 12c2892 · 12c2892
1 parent 576f204
commit 12c2892
Show file tree

Hide file tree

Showing 69 changed files with 3,603 additions and 5,301 deletions.
diff --git a/libcudacxx/codegen/CMakeLists.txt b/libcudacxx/codegen/CMakeLists.txt
@@ -19,8 +19,8 @@ target_compile_features(
 
 add_dependencies(libcudacxx.atomics.codegen codegen)
 
-set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/atomic_cuda_generated.h")
-set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/detail/libcxx/include/support/atomic")
+set(atomic_generated_output "${libcudacxx_BINARY_DIR}/codegen/cuda_ptx_generated.h")
+set(atomic_install_location "${libcudacxx_SOURCE_DIR}/include/cuda/std/__atomic/functions")
 
 add_custom_target(
     libcudacxx.atomics.codegen.execute
@@ -32,13 +32,13 @@ add_dependencies(libcudacxx.atomics.codegen libcudacxx.atomics.codegen.execute)
 
 add_custom_target(
     libcudacxx.atomics.codegen.install
-    COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/atomic_cuda_generated.h"
-    BYPRODUCTS "${atomic_install_location}/atomic_cuda_generated.h"
+    COMMAND ${CMAKE_COMMAND} -E copy "${atomic_generated_output}" "${atomic_install_location}/cuda_ptx_generated.h"
+    BYPRODUCTS "${atomic_install_location}/cuda_ptx_generated.h"
 )
 
 add_dependencies(libcudacxx.atomics.codegen.install libcudacxx.atomics.codegen.execute)
 
 add_test(
     NAME libcudacxx.atomics.codegen.diff
-    COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/atomic_cuda_generated.h" "${atomic_generated_output}"
+    COMMAND ${CMAKE_COMMAND} -E compare_files "${atomic_install_location}/cuda_ptx_generated.h" "${atomic_generated_output}"
 )
diff --git a/libcudacxx/codegen/codegen.cpp b/libcudacxx/codegen/codegen.cpp
@@ -66,7 +66,7 @@ int main()
 
   std::vector<std::string> cv_qualifier{"volatile ", ""};
 
-  std::ofstream out("atomic_cuda_generated.h");
+  std::ofstream out("cuda_ptx_generated.h");
 
   out << R"XXX(//===----------------------------------------------------------------------===//
 //
@@ -78,8 +78,36 @@ int main()
 //
 //===----------------------------------------------------------------------===//
 
-// This is a autogenerated file, we want to ensure that it contains exactly the contentes we want to generate
+// This is an autogenerated file, we want to ensure that it contains exactly the contents we want to generate
 // clang-format off
+
+#ifndef _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H
+#define _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+
+#include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/is_signed.h>
+#include <cuda/std/__type_traits/is_unsigned.h>
+
+#include <cuda/std/__atomic/scopes.h>
+#include <cuda/std/__atomic/order.h>
+
+_LIBCUDACXX_BEGIN_NAMESPACE_STD
+
+#if defined(_CCCL_CUDA_COMPILER)
+
 )XXX";
 
   auto scopenametag = [&](auto scope) {
@@ -302,11 +330,11 @@ int main()
               {
                 out << "template<class _Type, _CUDA_VSTD::__enable_if_t<sizeof(_Type)==" << sz / 8 << ", int> = 0>\n";
                 out << "_CCCL_DEVICE bool __atomic_compare_exchange_cuda(" << cv
-                    << "_Type *__ptr, _Type *__expected, const _Type *__desired, bool, int __success_memorder, int "
+                    << "_Type *__ptr, _Type *__expected, const _Type __desired, bool, int __success_memorder, int "
                        "__failure_memorder, "
                     << scopenametag(s.first) << ") {\n";
                 out << "    uint" << sz << "_t __tmp = 0, __old = 0, __old_tmp;\n";
-                out << "    memcpy(&__tmp, __desired, " << sz / 8 << ");\n";
+                out << "    memcpy(&__tmp, &__desired, " << sz / 8 << ");\n";
                 out << "    memcpy(&__old, __expected, " << sz / 8 << ");\n";
                 out << "    __old_tmp = __old;\n";
                 out << "    NV_DISPATCH_TARGET(\n";
@@ -503,6 +531,9 @@ int main()
     }
   }
 
+  out << "\n#endif // defined(_CCCL_CUDA_COMPILER)\n";
+  out << "\n_LIBCUDACXX_END_NAMESPACE_STD\n";
+  out << "\n#endif // _LIBCUDACXX___ATOMIC_FUNCTIONS_CUDA_PTX_GENERATED_H\n";
   out << "\n// clang-format on\n";
 
   return 0;

diff --git a/libcudacxx/examples/rtc_example.cpp b/libcudacxx/examples/rtc_example.cpp
@@ -50,11 +50,11 @@ template<class T> static constexpr T min(T a, T b) { return a < b ? a : b; }
 
 struct trie {
     struct ref {
-        cuda::std::atomic<trie*> ptr = ATOMIC_VAR_INIT(nullptr);
+        cuda::std::atomic<trie*> ptr = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr);
         // the flag will protect against multiple pointer updates
-        cuda::std::atomic_flag flag = ATOMIC_FLAG_INIT;
+        cuda::std::atomic_flag flag = LIBCUDACXX_ATOMIC_FLAG_INIT;
     } next[26];
-    cuda::std::atomic<int> count = ATOMIC_VAR_INIT(0);
+    cuda::std::atomic<int> count = LIBCUDACXX_ATOMIC_VAR_INIT(0);
 };
 __host__ __device__
 int index_of(char c) {

diff --git a/libcudacxx/examples/trie.cu b/libcudacxx/examples/trie.cu
@@ -36,11 +36,11 @@ struct trie
 {
   struct ref
   {
-    cuda::atomic<trie*, cuda::thread_scope_device> ptr = ATOMIC_VAR_INIT(nullptr);
+    cuda::atomic<trie*, cuda::thread_scope_device> ptr = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr);
     // the flag will protect against multiple pointer updates
-    cuda::std::atomic_flag flag = ATOMIC_FLAG_INIT;
+    cuda::std::atomic_flag flag = LIBCUDACXX_ATOMIC_FLAG_INIT;
   } next[26];
-  cuda::std::atomic<short> count = ATOMIC_VAR_INIT(0);
+  cuda::std::atomic<short> count = LIBCUDACXX_ATOMIC_VAR_INIT(0);
 };
 __host__ __device__ int index_of(char c)
 {

diff --git a/libcudacxx/examples/trie_mt.cpp b/libcudacxx/examples/trie_mt.cpp
@@ -36,11 +36,11 @@ struct trie
 {
   struct ref
   {
-    std::atomic<trie*> ptr = ATOMIC_VAR_INIT(nullptr);
+    std::atomic<trie*> ptr = LIBCUDACXX_ATOMIC_VAR_INIT(nullptr);
     // the flag will protect against multiple pointer updates
-    std::atomic_flag flag = ATOMIC_VAR_INIT(0);
+    std::atomic_flag flag = LIBCUDACXX_ATOMIC_VAR_INIT(0);
   } next[26];
-  std::atomic<int> count = ATOMIC_VAR_INIT(0);
+  std::atomic<int> count = LIBCUDACXX_ATOMIC_VAR_INIT(0);
 };
 int index_of(char c)
 {

diff --git a/libcudacxx/include/cuda/atomic b/libcudacxx/include/cuda/atomic
@@ -11,6 +11,14 @@
 #ifndef _CUDA_ATOMIC
 #define _CUDA_ATOMIC
 
-#include <cuda/std/atomic>
+#include <cuda/std/__cuda/atomic.h>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
 
 #endif // _CUDA_ATOMIC