NVIDIA · miscco · Jul 26, 2024 · Jul 2, 2024 · Jul 20, 2024 · Jul 20, 2024
@@ -11,49 +11,13 @@
 #ifndef _CUDAX_EVENT_DETAIL_H
 #define _CUDAX_EVENT_DETAIL_H
 
-/*
-    event synopsis
-namespace cuda::experimental {
-class event : public event_ref {
-public:
-    enum class flags : unsigned int { none, blocking_sync, interprocess };
-
-    event(stream_ref, flags = flags::none);
-    event(uninit_t) noexcept;
-    event(event&&) noexcept;
-    ~event();
-    event& operator=(event&&) noexcept;
-
-    [[nodiscard]] static event from_native_handle(cudaEvent_t) noexcept;
-    static event from_native_handle(int) = delete;
-    static event from_native_handle(nullptr_t) = delete;
-
-    [[nodiscard]] cudaEvent_t release() noexcept;
-
-    [[nodiscard]] friend flags operator|(flags, flags) noexcept;
-
-    // From event_ref:
-    using value_type = cudaEvent_t;
-
-    void record(stream_ref) const;
-
-    void wait(stream_ref) const;
-
-    [[nodiscard]] cudaEvent_t get() const noexcept;
-
-    [[nodiscard]] explicit operator bool() const noexcept;
-
-    [[nodiscard]] friend bool operator==(event_ref, event_ref);
-    [[nodiscard]] friend bool operator!=(event_ref, event_ref);
-};
-}  // cuda::experimenal
-*/
-
 #include <cuda_runtime_api.h>
 // cuda_runtime_api needs to come first
 
 #include <cuda/std/detail/__config>
 
+#include "cuda/std/detail/libcxx/include/__config"
+
 #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
 #  pragma GCC system_header
 #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
@@ -63,7 +27,6 @@ public:
 #endif // no system header
 
 #include <cuda/std/__cuda/api_wrapper.h>
-#include <cuda/std/__exception/cuda_error.h>
 #include <cuda/std/cstddef>
 #include <cuda/std/utility>
 
@@ -115,6 +78,9 @@ public:
       : event_ref(_CUDA_VSTD::exchange(__other.__event_, {}))
   {}
 
+  // Disallow copy construction.
+  event(const event&) = delete;
+
   //! @brief Destroy the `event` object
   //!
   //! @note If the event fails to be destroyed, the error is silently ignored.
@@ -131,12 +97,16 @@ public:
   //! @param __other
   //!
   //! @post `__other` is in a moved-from state.
-  constexpr event& operator=(event&& __other) noexcept
+  event& operator=(event&& __other) noexcept
   {
-    __event_ = _CUDA_VSTD::exchange(__other.__event_, {});
+    event __tmp(_CUDA_VSTD::move(__other));
+    _CUDA_VSTD::swap(__event_, __tmp.__event_);
     return *this;
   }
 
+  // Disallow copy assignment.
+  event& operator=(const event&) = delete;
+
   //! @brief Construct an `event` object from a native `cudaEvent_t` handle.
   //!
   //! @param __evnt The native handle

@@ -11,35 +11,6 @@
 #ifndef _CUDAX_EVENT_REF_DETAIL_H
 #define _CUDAX_EVENT_REF_DETAIL_H
 
-/*
-    event_ref synopsis
-namespace cuda::experimental {
-class event_ref {
-public:
-    using value_type = cudaEvent_t;
-
-    event_ref(cudaEvent_t event_) noexcept : event(event_) {}
-
-    event_ref(int) = delete;
-    event_ref(nullptr_t) = delete;
-
-    void record(stream_ref) const;
-
-    void wait(stream_ref) const;
-
-    [[nodiscard]] cudaEvent_t get() const noexcept;
-
-    [[nodiscard]] explicit operator bool() const noexcept;
-
-    [[nodiscard]] friend bool operator==(event_ref, event_ref);
-    [[nodiscard]] friend bool operator!=(event_ref, event_ref);
-
-private:
-    cudaEvent_t event{}; // exposition only
-};
-}  // cuda::experimenal
-*/
-
 #include <cuda_runtime_api.h>
 // cuda_runtime_api needs to come first
 
@@ -54,7 +25,6 @@ private:
 #endif // no system header
 
 #include <cuda/std/__cuda/api_wrapper.h>
-#include <cuda/std/__exception/cuda_error.h>
 #include <cuda/std/cassert>
 #include <cuda/std/cstddef>
 #include <cuda/std/utility>
@@ -72,7 +42,7 @@ private:
   friend class event;
   friend class timed_event;
 
-  ::cudaEvent_t __event_{0};
+  ::cudaEvent_t __event_{};
 
 public:
   using value_type = ::cudaEvent_t;

@@ -11,48 +11,6 @@
 #ifndef _CUDAX_TIMED_EVENT_DETAIL_H
 #define _CUDAX_TIMED_EVENT_DETAIL_H
 
-/*
-    timed_event synopsis
-namespace cuda::experimental {
-class timed_event : public event {
-public:
-    timed_event(stream_ref, flags = flags::none);
-    timed_event(uninit_t) noexcept;
-    timed_event(timed_event&&) noexcept;
-    ~timed_event();
-    timed_event& operator=(timed_event&&) noexcept;
-
-    [[nodiscard]] static timed_event from_native_handle(cudaEvent_t) noexcept;
-    static timed_event from_native_handle(int) = delete;
-    static timed_event from_native_handle(nullptr_t) = delete;
-
-    [[nodiscard]] friend auto operator-(const timed_event& end, const timed_event& start) noexcept
-        -> cuda::std::chrono::nanoseconds;
-
-    // from event:
-    enum class flags : unsigned int { none, blocking_sync, interprocess };
-
-    [[nodiscard]] cudaEvent_t release() noexcept;
-
-    [[nodiscard]] friend flags operator|(flags, flags) noexcept;
-
-    // From event_ref:
-    using value_type = cudaEvent_t;
-
-    void record(stream_ref) const;
-
-    void wait(stream_ref) const;
-
-    [[nodiscard]] cudaEvent_t get() const noexcept;
-
-    [[nodiscard]] explicit operator bool() const noexcept;
-
-    [[nodiscard]] friend bool operator==(event_ref, event_ref);
-    [[nodiscard]] friend bool operator!=(event_ref, event_ref);
-};
-}  // cuda::experimenal
-*/
-
 #include <cuda_runtime_api.h>
 // cuda_runtime_api needs to come first
 
@@ -67,7 +25,6 @@ public:
 #endif // no system header
 
 #include <cuda/std/__cuda/api_wrapper.h>
-#include <cuda/std/__exception/cuda_error.h>
 #include <cuda/std/chrono>
 #include <cuda/std/cstddef>
 
@@ -98,6 +55,11 @@ public:
       : event(uninit)
   {}
 
+  timed_event(timed_event&&) noexcept            = default;
+  timed_event(const timed_event&)                = delete;
+  timed_event& operator=(timed_event&&) noexcept = default;
+  timed_event& operator=(const timed_event&)     = delete;
+
   //! @brief Construct a `timed_event` object from a native `cudaEvent_t` handle.
   //!
   //! @param __evnt The native handle

@@ -61,4 +61,5 @@ foreach(cn_target IN LISTS cudax_TARGETS)
   Cudax_add_catch2_test(test_target event_tests ${cn_target}
     event/event_smoke.cu
   )
+  target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)
 endforeach()
@@ -0,0 +1,118 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_runtime_api.h>
+// cuda_runtime_api needs to come first
+
+#include <cuda/std/__cuda/api_wrapper.h>
+#include <cuda/std/utility>
+#include <cuda/stream_ref>
+
+#include <new> // IWYU pragma: keep (needed for placement new)
+
+namespace
+{
+namespace test
+{
+struct stream : cuda::stream_ref
+{
+  stream()
+      : cuda::stream_ref(::cudaStream_t{})
+  {
+    ::cudaStream_t stream{};
+    _CCCL_TRY_CUDA_API(::cudaStreamCreate, "failed to create a CUDA stream", &stream);
+    static_cast<cuda::stream_ref&>(*this) = cuda::stream_ref(stream);
+  }
+
+  cuda::stream_ref ref() const noexcept
+  {
+    return *this;
+  }
+
+  void wait() const
+  {
+    _CCCL_TRY_CUDA_API(::cudaStreamSynchronize, "failed to synchronize a CUDA stream", get());
+  }
+
+  ~stream()
+  {
+    [[maybe_unused]] auto status = ::cudaStreamDestroy(get());
+  }
+};
+
+struct _malloc_managed
+{
+private:
+  void* pv = nullptr;
+
+public:
+  explicit _malloc_managed(std::size_t size)
+  {
+    _CCCL_TRY_CUDA_API(::cudaMallocManaged, "failed to allocate managed memory", &pv, size);
+  }
+
+  ~_malloc_managed()
+  {
+    [[maybe_unused]] auto status = ::cudaFree(pv);
+  }
+
+  template <class T>
+  T* get_as() const noexcept
+  {
+    return static_cast<T*>(pv);
+  }
+};
+
+template <class T>
+struct managed
+{
+private:
+  _malloc_managed _mem;
+
+public:
+  explicit managed(T t)
+      : _mem(sizeof(T))
+  {
+    ::new (_mem.get_as<void>()) T(_CUDA_VSTD::move(t));
+  }
+
+  ~managed()
+  {
+    get()->~T();
+  }
+
+  T* get() noexcept
+  {
+    return _mem.get_as<T>();
+  }
+  const T* get() const noexcept
+  {
+    return _mem.get_as<T>();
+  }
+
+  T& operator*() noexcept
+  {
+    return *get();
+  }
+  const T& operator*() const noexcept
+  {
+    return *get();
+  }
+};
+
+/// A kernel that takes a callable object and invokes it with a set of arguments
+template <class Fn, class... Args>
+__global__ void invokernel(Fn fn, Args... args)
+{
+  fn(args...);
+}
+
+} // namespace test
+} // namespace