[CUDAX] Add experimental owning abstraction for cudaStream_t (NVIDIA#…

…2093) * construct with a stream_ref and record the event on construction --------- Co-authored-by: Eric Niebler <[email protected]>
pciolkosz · Aug 4, 2024 · 4e4e22d · 4e4e22d
1 parent 9062e5b
commit 4e4e22d
Show file tree

Hide file tree

Showing 11 changed files with 375 additions and 16 deletions.
diff --git a/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh b/cudax/include/cuda/experimental/__hierarchy/level_dimensions.cuh
@@ -156,7 +156,8 @@ struct level_dimensions
   _CCCL_HOST_DEVICE constexpr level_dimensions(Dimensions&& d)
       : dims(d)
   {}
-  _CCCL_HOST_DEVICE constexpr level_dimensions(){};
+  _CCCL_HOST_DEVICE constexpr level_dimensions()
+      : dims(){};
 };
 
 /**

diff --git a/cudax/include/cuda/experimental/__stream/stream.cuh b/cudax/include/cuda/experimental/__stream/stream.cuh
@@ -0,0 +1,197 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDAX__STREAM_STREAM
+#define _CUDAX__STREAM_STREAM
+
+#include <cuda/std/detail/__config>
+#include <cuda_runtime_api.h>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__cuda/api_wrapper.h>
+#include <cuda/stream_ref>
+
+#include <cuda/experimental/__device/device.cuh>
+#include <cuda/experimental/__event/timed_event.cuh>
+
+namespace cuda::experimental
+{
+
+namespace detail
+{
+// 0 is a valid stream in CUDA, so we need some other invalid stream representation
+// Can't make it constexpr, because cudaStream_t is a pointer type
+static const ::cudaStream_t invalid_stream = reinterpret_cast<cudaStream_t>(~0ULL);
+} // namespace detail
+
+//! @brief An owning wrapper for cudaStream_t.
+struct stream : stream_ref
+{
+  // 0 is documented as default priority
+  static constexpr int default_priority = 0;
+
+  //! @brief Constructs a stream on a specified device and with specified priority
+  //!
+  //! Priority is defaulted to stream::default_priority
+  //!
+  //! @throws cuda_error if stream creation fails
+  explicit stream(device __dev, int __priority = default_priority)
+  {
+    __scoped_device dev_setter(__dev);
+    _CCCL_TRY_CUDA_API(
+      ::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamDefault, __priority);
+  }
+
+  //! @brief Constructs a stream on the default device
+  //!
+  //! @throws cuda_error if stream creation fails.
+  stream()
+      : stream(device{0})
+  {}
+
+  //! @brief Construct a new `stream` object into the moved-from state.
+  //!
+  //! @post `stream()` returns an invalid stream handle
+  // Can't be constexpr because invalid_stream isn't
+  explicit stream(uninit_t) noexcept
+      : stream_ref(detail::invalid_stream)
+  {}
+
+  //! @brief Move-construct a new `stream` object
+  //!
+  //! @param __other
+  //!
+  //! @post `__other` is in moved-from state.
+  stream(stream&& __other) noexcept
+      : stream(_CUDA_VSTD::exchange(__other.__stream, detail::invalid_stream))
+  {}
+
+  stream(const stream&) = delete;
+
+  //! Destroy the `stream` object
+  //!
+  //! @note If the stream fails to be destroyed, the error is silently ignored.
+  ~stream()
+  {
+    if (__stream != detail::invalid_stream)
+    {
+      [[maybe_unused]] auto status = ::cudaStreamDestroy(__stream);
+    }
+  }
+
+  //! @brief Move-assign a `stream` object
+  //!
+  //! @param __other
+  //!
+  //! @post `__other` is in a moved-from state.
+  stream& operator=(stream&& __other) noexcept
+  {
+    stream __tmp(_CUDA_VSTD::move(__other));
+    _CUDA_VSTD::swap(__stream, __tmp.__stream);
+    return *this;
+  }
+
+  stream& operator=(const stream&) = delete;
+
+  // Ideally records and waits below would be in stream_ref, but we can't have it depend on cudax yet
+
+  //! @brief Create a new event and record it into this stream
+  //!
+  //! @return A new event that was recorded into this stream
+  //!
+  //! @throws cuda_error if event creation or record failed
+  _CCCL_NODISCARD event record_event(event::flags __flags = event::flags::none) const
+  {
+    return event(*this, __flags);
+  }
+
+  //! @brief Create a new timed event and record it into this stream
+  //!
+  //! @return A new timed event that was recorded into this stream
+  //!
+  //! @throws cuda_error if event creation or record failed
+  _CCCL_NODISCARD timed_event record_timed_event(event::flags __flags = event::flags::none) const
+  {
+    return timed_event(*this, __flags);
+  }
+
+  using stream_ref::wait;
+
+  //! @brief Make all future work submitted into this stream depend on completion of the specified event
+  //!
+  //! @param __ev Event that this stream should wait for
+  //!
+  //! @throws cuda_error if inserting the dependency fails
+  void wait(event_ref __ev) const
+  {
+    assert(__ev.get() != nullptr);
+    _CCCL_TRY_CUDA_API(::cudaStreamWaitEvent, "Failed to make a stream wait for an event", get(), __ev.get());
+  }
+
+  //! @brief Make all future work submitted into this stream depend on completion of all work from the specified stream
+  //!
+  //! @param __other Stream that this stream should wait for
+  //!
+  //! @throws cuda_error if inserting the dependency fails
+  void wait(stream_ref __other) const
+  {
+    // TODO consider an optimization to not create an event every time and instead have one persistent event or one per
+    // stream
+    assert(__stream.get() != detail::invalid_stream);
+    event __tmp(__other);
+    wait(__tmp);
+  }
+
+  //! @brief Construct an `stream` object from a native `cudaStream_t` handle.
+  //!
+  //! @param __handle The native handle
+  //!
+  //! @return stream The constructed `stream` object
+  //!
+  //! @note The constructed `stream` object takes ownership of the native handle.
+  _CCCL_NODISCARD static stream from_native_handle(::cudaStream_t __handle)
+  {
+    return stream(__handle);
+  }
+
+  // Disallow construction from an `int`, e.g., `0`.
+  static stream from_native_handle(int) = delete;
+
+  // Disallow construction from `nullptr`.
+  static stream from_native_handle(_CUDA_VSTD::nullptr_t) = delete;
+
+  //! @brief Retrieve the native `cudaStream_t` handle and give up ownership.
+  //!
+  //! @return cudaStream_t The native handle being held by the `stream` object.
+  //!
+  //! @post The stream object is in a moved-from state.
+  _CCCL_NODISCARD ::cudaStream_t release()
+  {
+    return _CUDA_VSTD::exchange(__stream, detail::invalid_stream);
+  }
+
+private:
+  // Use `stream::from_native_handle(s)` to construct an owning `stream`
+  // object from a `cudaStream_t` handle.
+  explicit stream(::cudaStream_t __handle)
+      : stream_ref(__handle)
+  {}
+};
+
+} // namespace cuda::experimental
+
+#endif // _CUDAX__STREAM_STREAM
diff --git a/cudax/include/cuda/experimental/hierarchy.cuh b/cudax/include/cuda/experimental/hierarchy.cuh
@@ -13,4 +13,4 @@
 
 #include <cuda/experimental/__hierarchy/hierarchy_dimensions.cuh>
 
-#endif
+#endif // __CUDAX_HIERARCHY___
diff --git a/cudax/include/cuda/experimental/launch.cuh b/cudax/include/cuda/experimental/launch.cuh
@@ -13,4 +13,4 @@
 
 #include <cuda/experimental/__launch/launch.cuh>
 
-#endif
+#endif // __CUDAX_LAUNCH___
diff --git a/cudax/include/cuda/experimental/stream.cuh b/cudax/include/cuda/experimental/stream.cuh
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDAX_STREAM__
+#define __CUDAX_STREAM__
+
+#include <cuda/experimental/__stream/stream.cuh>
+
+#endif // __CUDAX_STREAM__
diff --git a/cudax/include/cuda/experimental/version.cuh b/cudax/include/cuda/experimental/version.cuh
@@ -18,4 +18,4 @@
 #define CUDAX_VERSION_MINOR CCCL_MINOR_VERSION
 #define CUDAX_VERSION_PATCH CCCL_PATCH_VERSION
 
-#endif
+#endif // __CUDAX_VERSION__
diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt
@@ -67,4 +67,8 @@ foreach(cn_target IN LISTS cudax_TARGETS)
     event/event_smoke.cu
   )
   target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)
+
+  cudax_add_catch2_test(test_target stream_tests ${cn_target}
+    stream/stream_smoke.cu
+  )
 endforeach()
diff --git a/cudax/test/common/utility.cuh b/cudax/test/common/utility.cuh
@@ -11,12 +11,16 @@
 #include <cuda_runtime_api.h>
 // cuda_runtime_api needs to come first
 
+#include <cuda/atomic>
 #include <cuda/std/__cuda/api_wrapper.h>
 #include <cuda/std/utility>
 #include <cuda/stream_ref>
 
 #include <new> // IWYU pragma: keep (needed for placement new)
 
+// TODO unify the common testing header
+#include "../hierarchy/testing_common.cuh"
+
 namespace
 {
 namespace test
@@ -107,6 +111,32 @@ public:
   }
 };
 
+struct assign_42
+{
+  __device__ constexpr void operator()(int* pi) const noexcept
+  {
+    *pi = 42;
+  }
+};
+
+struct verify_42
+{
+  __device__ void operator()(int* pi) const noexcept
+  {
+    CUDAX_REQUIRE(*pi == 42);
+  }
+};
+
+struct spin_until_80
+{
+  __device__ void operator()(int* pi) const noexcept
+  {
+    cuda::atomic_ref atomic_pi(*pi);
+    while (atomic_pi.load() != 80)
+      ;
+  }
+};
+
 /// A kernel that takes a callable object and invokes it with a set of arguments
 template <class Fn, class... Args>
 __global__ void invokernel(Fn fn, Args... args)

diff --git a/cudax/test/event/event_smoke.cu b/cudax/test/event/event_smoke.cu
@@ -66,14 +66,6 @@ TEST_CASE("can copy construct an event_ref and compare for equality", "[event]")
   CUDAX_REQUIRE(!ref4);
 }
 
-struct assign_42
-{
-  __device__ constexpr void operator()(int* pi) const noexcept
-  {
-    *pi = 42;
-  }
-};
-
 TEST_CASE("can use event_ref to record and wait on an event", "[event]")
 {
   ::cudaEvent_t ev;
@@ -82,7 +74,7 @@ TEST_CASE("can use event_ref to record and wait on an event", "[event]")
 
   test::managed<int> i(0);
   test::stream stream;
-  ::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
+  ::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
   ref.record(stream);
   ref.wait();
   CUDAX_REQUIRE(*i == 42);
@@ -102,7 +94,7 @@ TEST_CASE("can wait on an event", "[event]")
 {
   test::stream stream;
   ::test::managed<int> i(0);
-  ::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
+  ::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
   cudax::event ev(stream);
   ev.wait();
   CUDAX_REQUIRE(*i == 42);
@@ -114,7 +106,7 @@ TEST_CASE("can take the difference of two timed_event objects", "[event]")
   test::stream stream;
   ::test::managed<int> i(0);
   cudax::timed_event start(stream);
-  ::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
+  ::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
   cudax::timed_event end(stream);
   end.wait();
   CUDAX_REQUIRE(*i == 42);