Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CUDAX] Add experimental owning abstraction for cudaStream_t #2093

Merged
merged 11 commits into from
Jul 30, 2024
5 changes: 2 additions & 3 deletions cudax/include/cuda/experimental/__event/event.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,12 @@
# pragma system_header
#endif // no system header

#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/__event/event_ref.cuh>
#include <cuda/std/__cuda/api_wrapper.h>
#include <cuda/std/cstddef>
#include <cuda/std/utility>

#include <cuda/experimental/__detail/utility.cuh>
#include <cuda/experimental/__event/event_ref.cuh>

namespace cuda::experimental
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved
{
class timed_event;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
#ifndef _CUDAX__HIERARCHY_LEVEL_DIMENSIONS
#define _CUDAX__HIERARCHY_LEVEL_DIMENSIONS

#include <cuda/std/type_traits>

#include <cuda/experimental/__hierarchy/hierarchy_levels.cuh>
#include <cuda/std/type_traits>

#if _CCCL_STD_VER >= 2017
namespace cuda::experimental
Expand Down Expand Up @@ -124,7 +123,8 @@ struct level_dimensions
_CCCL_HOST_DEVICE constexpr level_dimensions(Dimensions&& d)
: dims(d)
{}
_CCCL_HOST_DEVICE constexpr level_dimensions(){};
_CCCL_HOST_DEVICE constexpr level_dimensions()
: dims(){};
};

/**
Expand Down
206 changes: 206 additions & 0 deletions cudax/include/cuda/experimental/__stream/stream.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDAX__STREAM_STREAM
#define _CUDAX__STREAM_STREAM

#include <cuda_runtime_api.h>
// cuda_runtime_api needs to come first
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/experimental/__device/device.cuh>
#include <cuda/experimental/__event/timed_event.cuh>
#include <cuda/std/__cuda/api_wrapper.h>
#include <cuda/std/utility>
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved
#include <cuda/stream_ref>

namespace cuda::experimental
{

namespace detail
{
// 0 is a valid stream in CUDA, so we need some other invalid stream representation
// Can't make it constexpr, because cudaStream_t is a pointer type
static const ::cudaStream_t invalid_stream = reinterpret_cast<cudaStream_t>(~0ULL);
miscco marked this conversation as resolved.
Show resolved Hide resolved
} // namespace detail

//! @brief An owning wrapper for cudaStream_t.
struct stream : stream_ref
{
// 0 is documented as default priority
static constexpr int default_priority = 0;

//! @brief Constructs a stream on a specified device and with specified priority
//!
//! @throws cuda_error if stream creation fails
explicit stream(device __dev, int __priority)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would love if in new APIs we could try to be const correct:

Suggested change
explicit stream(device __dev, int __priority)
explicit stream(const device __dev, const int __priority)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If these are passed by value, is there any value in having them const?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the perspective of the user these are identical. From our perspective it would guard against accidentally changing them. I'm generally not a fan of top level qualifiers on function arguments, and they are very much an implementation detail, so we can add or remove them in any API at any time.

{
__scoped_device dev_setter(__dev);
_CCCL_TRY_CUDA_API(
::cudaStreamCreateWithPriority, "Failed to create a stream", &__stream, cudaStreamDefault, __priority);
}

//! @brief Constructs a stream on a specified device and with default priority
//!
//! @throws cuda_error if stream creation fails
explicit stream(device __dev)
: stream(__dev, default_priority)
{}
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved

//! @brief Constructs a stream on the default device
//!
//! @throws cuda_error if stream creation fails.
stream()
: stream(device{0})
{}

//! @brief Construct a new `stream` object into the moved-from state.
//!
//! @post `stream()` returns an invalid stream handle
// Can't be constexpr because invalid_stream isn't
explicit stream(uninit_t) noexcept
: stream_ref(detail::invalid_stream)
{}
Comment on lines +66 to +72
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be a public constructor?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This aligns with the event constructor from uninit and works as an opt-in to create a stream that will be assigned into later


//! @brief Move-construct a new `stream` object
//!
//! @param __other
//!
//! @post `__other` is in moved-from state.
stream(stream&& __other) noexcept
: stream(_CUDA_VSTD::exchange(__other.__stream, detail::invalid_stream))
{}

// Disallow copy construction.
stream(const stream&) = delete;
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved

//! Destroy the `stream` object
//!
//! @note If the stream fails to be destroyed, the error is silently ignored.
~stream()
{
if (__stream != detail::invalid_stream)
{
[[maybe_unused]] auto status = ::cudaStreamDestroy(__stream);
miscco marked this conversation as resolved.
Show resolved Hide resolved
}
}

//! @brief Move-assign a `stream` object
//!
//! @param __other
//!
//! @post `__other` is in a moved-from state.
stream& operator=(stream&& __other)
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved
{
stream __tmp(_CUDA_VSTD::move(__other));
_CUDA_VSTD::swap(__stream, __tmp.__stream);
return *this;
miscco marked this conversation as resolved.
Show resolved Hide resolved
}

// Disallow copy assignment.
stream& operator=(const stream&) = delete;

// Ideally records and waits below would be in stream_ref, but we can't have it depend on cudax yet

//! @brief Create a new event and record it into this stream
//!
//! @return A new event that was recorded into this stream
//!
//! @throws cuda_error if event creation or record failed
_CCCL_NODISCARD event record_event(event::flags __flags = event::flags::none) const
{
return event(*this, __flags);
}

//! @brief Create a new timed event and record it into this stream
//!
//! @return A new timed event that was recorded into this stream
//!
//! @throws cuda_error if event creation or record failed
_CCCL_NODISCARD timed_event record_timed_event(event::flags __flags = event::flags::none) const
{
return timed_event(*this, __flags);
}

using stream_ref::wait;

//! @brief Make all future work submitted into this stream depend on completion of the specified event
//!
//! @param __ev Event that this stream should wait for
//!
//! @throws cuda_error if inserting the dependency fails
void wait(event_ref __ev) const
{
assert(__ev.get() != nullptr);
_CCCL_TRY_CUDA_API(::cudaStreamWaitEvent, "Failed to make a stream wait for an event", get(), __ev.get());
}

//! @brief Make all future work submitted into this stream depend on completion of all work from the specified stream
//!
//! @param __other Stream that this stream should wait for
//!
//! @throws cuda_error if inserting the dependency fails
void wait(const stream_ref __other) const
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved
{
// TODO consider an optimization to not create an event every time and instead have one persistent event or one per
// stream
assert(__stream.get() != nullptr);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't __stream a cudaStream_t here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be check for invalid_stream, corrected

event __tmp(__other);
wait(__tmp);
}

//! @brief Construct an `stream` object from a native `cudaStream_t` handle.
//!
//! @param __handle The native handle
//!
//! @return stream The constructed `stream` object
//!
//! @note The constructed `stream` object takes ownership of the native handle.
_CCCL_NODISCARD static stream from_native_handle(::cudaStream_t __handle)
{
return stream(__handle);
}

// Disallow construction from an `int`, e.g., `0`.
static stream from_native_handle(int) = delete;

// Disallow construction from `nullptr`.
static stream from_native_handle(_CUDA_VSTD::nullptr_t) = delete;
Comment on lines +166 to +175
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: why shouldn't those be ctors? What problem are factory functions solving here that ctors cannot?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one aligns with event factory function, we can discuss it as a broader design question for cudax. These are taking the ownership of the stream, so I like the explicitness of the function.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ericniebler points out that these functions take ownership over the passed in stream, so he wants them to stand out in the code


//! @brief Retrieve the native `cudaStream_t` handle and give up ownership.
//!
//! @return cudaStream_t The native handle being held by the `stream` object.
//!
//! @post The stream object is in a moved-from state.
_CCCL_NODISCARD ::cudaStream_t release()
{
return _CUDA_VSTD::exchange(__stream, detail::invalid_stream);
}

private:
// Use `stream::from_native_handle(s)` to construct an owning `stream`
// object from a `cudaStream_t` handle.
explicit stream(::cudaStream_t __handle)
: stream_ref(__handle)
{}
};

} // namespace cuda::experimental

#endif
16 changes: 16 additions & 0 deletions cudax/include/cuda/experimental/stream.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef __CUDAX_STREAM__
#define __CUDAX_STREAM__

#include <cuda/experimental/__stream/stream.cuh>

#endif
pciolkosz marked this conversation as resolved.
Show resolved Hide resolved
4 changes: 4 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,8 @@ foreach(cn_target IN LISTS cudax_TARGETS)
event/event_smoke.cu
)
target_compile_options(${test_target} PRIVATE $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:--extended-lambda>)

cudax_add_catch2_test(test_target stream_tests ${cn_target}
stream/stream_smoke.cu
)
endforeach()
30 changes: 30 additions & 0 deletions cudax/test/common/utility.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,16 @@
#include <cuda_runtime_api.h>
// cuda_runtime_api needs to come first

#include <cuda/atomic>
#include <cuda/std/__cuda/api_wrapper.h>
#include <cuda/std/utility>
#include <cuda/stream_ref>

#include <new> // IWYU pragma: keep (needed for placement new)

// TODO unify the common testing header
#include "../hierarchy/testing_common.cuh"

namespace
{
namespace test
Expand Down Expand Up @@ -107,6 +111,32 @@ public:
}
};

struct assign_42
{
__device__ constexpr void operator()(int* pi) const noexcept
{
*pi = 42;
}
};

struct verify_42
{
__device__ void operator()(int* pi) const noexcept
{
CUDAX_REQUIRE(*pi == 42);
}
};

struct spin_until_80
{
__device__ void operator()(int* pi) const noexcept
{
cuda::atomic_ref atomic_pi(*pi);
while (atomic_pi.load() != 80)
;
}
};

/// A kernel that takes a callable object and invokes it with a set of arguments
template <class Fn, class... Args>
__global__ void invokernel(Fn fn, Args... args)
Expand Down
14 changes: 3 additions & 11 deletions cudax/test/event/event_smoke.cu
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,6 @@ TEST_CASE("can copy construct an event_ref and compare for equality", "[event]")
CUDAX_REQUIRE(!ref4);
}

struct assign_42
{
__device__ constexpr void operator()(int* pi) const noexcept
{
*pi = 42;
}
};

TEST_CASE("can use event_ref to record and wait on an event", "[event]")
{
::cudaEvent_t ev;
Expand All @@ -82,7 +74,7 @@ TEST_CASE("can use event_ref to record and wait on an event", "[event]")

test::managed<int> i(0);
test::stream stream;
::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
ref.record(stream);
ref.wait();
CUDAX_REQUIRE(*i == 42);
Expand All @@ -102,7 +94,7 @@ TEST_CASE("can wait on an event", "[event]")
{
test::stream stream;
::test::managed<int> i(0);
::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
cudax::event ev(stream);
ev.wait();
CUDAX_REQUIRE(*i == 42);
Expand All @@ -114,7 +106,7 @@ TEST_CASE("can take the difference of two timed_event objects", "[event]")
test::stream stream;
::test::managed<int> i(0);
cudax::timed_event start(stream);
::test::invokernel<<<1, 1, 0, stream.get()>>>(assign_42{}, i.get());
::test::invokernel<<<1, 1, 0, stream.get()>>>(::test::assign_42{}, i.get());
cudax::timed_event end(stream);
end.wait();
CUDAX_REQUIRE(*i == 42);
Expand Down
Loading
Loading