Skip to content

Commit

Permalink
[CUDAX] Add experimental owning abstraction for cudaStream_t (NVIDIA#…
Browse files Browse the repository at this point in the history
…2093)

* construct with a stream_ref and record the event on construction

---------

Co-authored-by: Eric Niebler <[email protected]>
  • Loading branch information
pciolkosz and ericniebler committed Aug 4, 2024
1 parent 9062e5b commit 4e4e22d
Show file tree
Hide file tree
Showing 11 changed files with 375 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ struct level_dimensions
_CCCL_HOST_DEVICE constexpr level_dimensions(Dimensions&& d)
: dims(d)
{}
_CCCL_HOST_DEVICE constexpr level_dimensions(){};
_CCCL_HOST_DEVICE constexpr level_dimensions()
: dims(){};
};

/**
Expand Down
197 changes: 197 additions & 0 deletions cudax/include/cuda/experimental/__stream/stream.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__STREAM_STREAM
#define _CUDAX__STREAM_STREAM

#include <cuda/std/detail/__config>
#include <cuda_runtime_api.h>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__cuda/api_wrapper.h>
#include <cuda/stream_ref>

#include <cuda/experimental/__device/device.cuh>
#include <cuda/experimental/__event/timed_event.cuh>

namespace cuda::experimental
{

namespace detail
{
// 0 is a valid stream in CUDA, so we need some other invalid stream representation
// Can't make it constexpr, because cudaStream_t is a pointer type
static const ::cudaStream_t invalid_stream = reinterpret_cast<cudaStream_t>(~0ULL);
} // namespace detail

//! @brief An owning wrapper for cudaStream_t.
struct stream : stream_ref
{
// 0 is documented as default priority
static constexpr int default_priority = 0;

//! @brief Constructs a stream on a specified device and with specified priority
//!
//! Priority is defaulted to stream::default_priority
//!
//! @throws cuda_error if stream creation fails
explicit stream(device __dev, int __priority = default_priority)
{
__scoped_device dev_setter(__dev);
_CCCL_TRY_CUDA_API(
::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamDefault, __priority);
}

//! @brief Constructs a stream on the default device
//!
//! @throws cuda_error if stream creation fails.
stream()
: stream(device{0})
{}

//! @brief Construct a new `stream` object into the moved-from state.
//!
//! @post `stream()` returns an invalid stream handle
// Can't be constexpr because invalid_stream isn't
explicit stream(uninit_t) noexcept
: stream_ref(detail::invalid_stream)
{}

//! @brief Move-construct a new `stream` object
//!
//! @param __other
//!
//! @post `__other` is in moved-from state.
stream(stream&& __other) noexcept
: stream(_CUDA_VSTD::exchange(__other.__stream, detail::invalid_stream))
{}

stream(const stream&) = delete;

//! Destroy the `stream` object
//!
//! @note If the stream fails to be destroyed, the error is silently ignored.
~stream()
{
if (__stream != detail::invalid_stream)
{
[[maybe_unused]] auto status = ::cudaStreamDestroy(__stream);
}
}

//! @brief Move-assign a `stream` object
//!
//! @param __other
//!
//! @post `__other` is in a moved-from state.
stream& operator=(stream&& __other) noexcept
{
stream __tmp(_CUDA_VSTD::move(__other));
_CUDA_VSTD::swap(__stream, __tmp.__stream);
return *this;
}

stream& operator=(const stream&) = delete;

// Ideally records and waits below would be in stream_ref, but we can't have it depend on cudax yet

//! @brief Create a new event and record it into this stream
//!
//! @return A new event that was recorded into this stream
//!
//! @throws cuda_error if event creation or record failed
_CCCL_NODISCARD event record_event(event::flags __flags = event::flags::none) const
{
return event(*this, __flags);
}

//! @brief Create a new timed event and record it into this stream
//!
//! @return A new timed event that was recorded into this stream
//!
//! @throws cuda_error if event creation or record failed
_CCCL_NODISCARD timed_event record_timed_event(event::flags __flags = event::flags::none) const
{
return timed_event(*this, __flags);
}

using stream_ref::wait;

//! @brief Make all future work submitted into this stream depend on completion of the specified event
//!
//! @param __ev Event that this stream should wait for
//!
//! @throws cuda_error if inserting the dependency fails
void wait(event_ref __ev) const
{
assert(__ev.get() != nullptr);
_CCCL_TRY_CUDA_API(::cudaStreamWaitEvent, "Failed to make a stream wait for an event", get(), __ev.get());
}

//! @brief Make all future work submitted into this stream depend on completion of all work from the specified stream
//!
//! @param __other Stream that this stream should wait for
//!
//! @throws cuda_error if inserting the dependency fails
void wait(stream_ref __other) const
{
// TODO consider an optimization to not create an event every time and instead have one persistent event or one per
// stream
assert(__stream.get() != detail::invalid_stream);
event __tmp(__other);
wait(__tmp);
}

//! @brief Construct an `stream` object from a native `cudaStream_t` handle.
//!
//! @param __handle The native handle
//!
//! @return stream The constructed `stream` object
//!
//! @note The constructed `stream` object takes ownership of the native handle.
_CCCL_NODISCARD static stream from_native_handle(::cudaStream_t __handle)
{
return stream(__handle);
}

// Disallow construction from an `int`, e.g., `0`.
static stream from_native_handle(int) = delete;

// Disallow construction from `nullptr`.
static stream from_native_handle(_CUDA_VSTD::nullptr_t) = delete;

//! @brief Retrieve the native `cudaStream_t` handle and give up ownership.
//!
//! @return cudaStream_t The native handle being held by the `stream` object.
//!
//! @post The stream object is in a moved-from state.
_CCCL_NODISCARD ::cudaStream_t release()
{
return _CUDA_VSTD::exchange(__stream, detail::invalid_stream);
}

private:
// Use `stream::from_native_handle(s)` to construct an owning `stream`
// object from a `cudaStream_t` handle.
explicit stream(::cudaStream_t __handle)
: stream_ref(__handle)
{}
};

} // namespace cuda::experimental

#endif // _CUDAX__STREAM_STREAM
2 changes: 1 addition & 1 deletion cudax/include/cuda/experimental/hierarchy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@

#include <cuda/experimental/__hierarchy/hierarchy_dimensions.cuh>

#endif
#endif // __CUDAX_HIERARCHY___
2 changes: 1 addition & 1 deletion cudax/include/cuda/experimental/launch.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@

#include <cuda/experimental/__launch/launch.cuh>

#endif
#endif // __CUDAX_LAUNCH___
16 changes: 16 additions & 0 deletions cudax/include/cuda/experimental/stream.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_STREAM__
#define __CUDAX_STREAM__

#include <cuda/experimental/__stream/stream.cuh>

#endif // __CUDAX_STREAM__
2 changes: 1 addition & 1 deletion cudax/include/cuda/experimental/version.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,4 @@
#define CUDAX_VERSION_MINOR CCCL_MINOR_VERSION
#define CUDAX_VERSION_PATCH CCCL_PATCH_VERSION

#endif
#endif // __CUDAX_VERSION__
4 changes: 4 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,8 @@ foreach(cn_target IN LISTS cudax_TARGETS)
event/event_smoke.cu
)
target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)

cudax_add_catch2_test(test_target stream_tests ${cn_target}
stream/stream_smoke.cu
)
endforeach()
30 changes: 30 additions & 0 deletions cudax/test/common/utility.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,16 @@
#include <cuda_runtime_api.h>
// cuda_runtime_api needs to come first

#include <cuda/atomic>
#include <cuda/std/__cuda/api_wrapper.h>
#include <cuda/std/utility>
#include <cuda/stream_ref>

#include <new> // IWYU pragma: keep (needed for placement new)

// TODO unify the common testing header
#include "../hierarchy/testing_common.cuh"

namespace
{
namespace test
Expand Down Expand Up @@ -107,6 +111,32 @@ public:
}
};

struct assign_42
{
__device__ constexpr void operator()(int* pi) const noexcept
{
*pi = 42;
}
};

struct verify_42
{
__device__ void operator()(int* pi) const noexcept
{
CUDAX_REQUIRE(*pi == 42);
}
};

struct spin_until_80
{
__device__ void operator()(int* pi) const noexcept
{
cuda::atomic_ref atomic_pi(*pi);
while (atomic_pi.load() != 80)
;
}
};

/// A kernel that takes a callable object and invokes it with a set of arguments
template <class Fn, class... Args>
__global__ void invokernel(Fn fn, Args... args)
Expand Down
14 changes: 3 additions & 11 deletions cudax/test/event/event_smoke.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,6 @@ TEST_CASE("can copy construct an event_ref and compare for equality", "[event]")
CUDAX_REQUIRE(!ref4);
}

struct assign_42
{
__device__ constexpr void operator()(int* pi) const noexcept
{
*pi = 42;
}
};

TEST_CASE("can use event_ref to record and wait on an event", "[event]")
{
::cudaEvent_t ev;
Expand All @@ -82,7 +74,7 @@ TEST_CASE("can use event_ref to record and wait on an event", "[event]")

test::managed<int> i(0);
test::stream stream;
::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
ref.record(stream);
ref.wait();
CUDAX_REQUIRE(*i == 42);
Expand All @@ -102,7 +94,7 @@ TEST_CASE("can wait on an event", "[event]")
{
test::stream stream;
::test::managed<int> i(0);
::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
cudax::event ev(stream);
ev.wait();
CUDAX_REQUIRE(*i == 42);
Expand All @@ -114,7 +106,7 @@ TEST_CASE("can take the difference of two timed_event objects", "[event]")
test::stream stream;
::test::managed<int> i(0);
cudax::timed_event start(stream);
::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
cudax::timed_event end(stream);
end.wait();
CUDAX_REQUIRE(*i == 42);
Expand Down
Loading

0 comments on commit 4e4e22d

Please sign in to comment.