Skip to content

Commit

Permalink
Fix stream_ordered_memory_resource attempt to record event in stream …
Browse files Browse the repository at this point in the history
…from another device (#1333)

As described in #1306, stream_ordered_memory_resource maintained a single thread-local event used to synchronize with the per-thread default stream (PTDS). This caused the event to be recorded on the wrong stream when allocations were made from memory resources associated with different devices: the first allocation on the PTDS initialized the thread-local event for that device's stream, and pools on other devices would then reuse that already-initialized event.

This PR adds a new test that only runs on multi-device systems (more precisely, it does nothing on single-device systems). The test creates two per-device pools, then creates and destroys a device buffer on each.

It also fixes `stream_ordered_memory_resource` to store the ID of the device that is current when it is constructed, and then to store a vector of one event per device in TLS rather than a single event. When the PTDS is passed to `get_event`, the event for the MR's stored device ID is used. This should correctly support PTDS with multiple threads and multiple devices (still one MR per device).

The PR also includes some changes to the device ID utilities in `cuda_device.hpp`. There is a new RAII device helper class, and a `get_num_cuda_devices()` function.

Finally, there is a small addition to the .clang-tidy to disable warnings about `do...while` loops inside of RMM error checking macros.

- Fixes #1306

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #1333
  • Loading branch information
harrism authored Sep 13, 2023
1 parent ebd423c commit 2ff1bd0
Show file tree
Hide file tree
Showing 11 changed files with 148 additions and 30 deletions.
2 changes: 2 additions & 0 deletions .clang-tidy
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,6 @@ CheckOptions:
value: '1'
- key: readability-magic-numbers.IgnorePowersOf2IntegerValues
value: '1'
- key: cppcoreguidelines-avoid-do-while.IgnoreMacros
value: 'true'
...
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ DartConfiguration.tcl
.DS_Store
*.manifest
*.spec
compile_commands.json

## Python build directories & artifacts
dist/
Expand Down
31 changes: 26 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device
```c++
std::vector<unique_ptr<pool_memory_resource>> per_device_pools;
for(int i = 0; i < N; ++i) {
cudaSetDevice(i); // set device i before creating MR
// Use a vector of unique_ptr to maintain the lifetime of the MRs
per_device_pools.push_back(std::make_unique<pool_memory_resource>());
// Set the per-device resource for device i
set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
cudaSetDevice(i); // set device i before creating MR
// Use a vector of unique_ptr to maintain the lifetime of the MRs
per_device_pools.push_back(std::make_unique<pool_memory_resource>());
// Set the per-device resource for device i
set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
}
```

Note that the CUDA device that is current when creating a `device_memory_resource` must also be
current any time that `device_memory_resource` is used to deallocate memory, including in a
destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an
(incorrect) example that assumes the above example loop has been run to create a
`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the
line of the error comment.

```c++
{
RMM_CUDA_TRY(cudaSetDevice(0));
rmm::device_buffer buf_a(16);

{
RMM_CUDA_TRY(cudaSetDevice(1));
rmm::device_buffer buf_b(16);
}

// Error: when buf_a is destroyed, the current device must be 0, but it is 1
}
```

Expand Down
54 changes: 49 additions & 5 deletions include/rmm/cuda_device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,63 @@ struct cuda_device_id {
value_type id_;
};

namespace detail {
/**
* @brief Returns a `cuda_device_id` for the current device
*
* The current device is the device on which the calling thread executes device code.
*
* @return `cuda_device_id` for the current device
*/
inline cuda_device_id current_device()
inline cuda_device_id get_current_cuda_device()
{
int dev_id{};
RMM_CUDA_TRY(cudaGetDevice(&dev_id));
cuda_device_id::value_type dev_id{-1};
RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
return cuda_device_id{dev_id};
}
} // namespace detail

/**
* @brief Returns the number of CUDA devices in the system
*
* @return Number of CUDA devices in the system
*/
inline int get_num_cuda_devices()
{
cuda_device_id::value_type num_dev{-1};
RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
return num_dev;
}

/**
* @brief RAII class that sets the current CUDA device to the specified device on construction
* and restores the previous device on destruction.
*/
struct cuda_set_device_raii {
/**
* @brief Construct a new cuda_set_device_raii object and sets the current CUDA device to `dev_id`
*
* @param dev_id The device to set as the current CUDA device
*/
explicit cuda_set_device_raii(cuda_device_id dev_id)
: old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
{
if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
}
/**
* @brief Reactivates the previous CUDA device
*/
~cuda_set_device_raii() noexcept
{
if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
}

cuda_set_device_raii(cuda_set_device_raii const&) = delete;
cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete;
cuda_set_device_raii(cuda_set_device_raii&&) = delete;
cuda_set_device_raii& operator=(cuda_set_device_raii&&) = delete;

private:
cuda_device_id old_device_;
bool needs_reset_;
};

} // namespace rmm
4 changes: 2 additions & 2 deletions include/rmm/detail/dynamic_load_runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ struct async_alloc {
int cuda_pool_supported{};
auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
cudaDevAttrMemoryPoolsSupported,
rmm::detail::current_device().value());
rmm::get_current_cuda_device().value());
return result == cudaSuccess and cuda_pool_supported == 1;
}()};
return runtime_supports_pool and driver_supports_pool;
Expand All @@ -139,7 +139,7 @@ struct async_alloc {
if (cudaMemHandleTypeNone != handle_type) {
auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
cudaDevAttrMemoryPoolSupportedHandleTypes,
rmm::detail::current_device().value());
rmm::get_current_cuda_device().value());

// Don't throw on cudaErrorInvalidValue
auto const unsupported_runtime = (result == cudaErrorInvalidValue);
Expand Down
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_async_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
"Requested IPC memory handle type not supported");
pool_props.location.type = cudaMemLocationTypeDevice;
pool_props.location.id = rmm::detail::current_device().value();
pool_props.location.id = rmm::get_current_cuda_device().value();
cudaMemPool_t cuda_pool_handle{};
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
Expand Down
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_async_view_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
}()}
{
// Check if cudaMallocAsync Memory pool supported
auto const device = rmm::detail::current_device();
auto const device = rmm::get_current_cuda_device();
int cuda_pool_supported{};
auto result =
cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, device.value());
Expand Down
35 changes: 23 additions & 12 deletions include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,16 @@
*/
#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/logger.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <fmt/core.h>

#include <cuda_runtime_api.h>

#include <fmt/core.h>

#include <cstddef>
#include <functional>
#include <limits>
Expand Down Expand Up @@ -288,17 +289,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
stream_event_pair get_event(cuda_stream_view stream)
{
if (stream.is_per_thread_default()) {
// Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR
// instance ensures it is destroyed cleaned up only after all are finished with it.
thread_local auto event_tls = std::make_shared<event_wrapper>();
default_stream_events.insert(event_tls);
return stream_event_pair{stream.value(), event_tls->event};
// Create a thread-local shared event wrapper for each device. Shared pointers in the thread
// and in each MR instance ensure the wrappers are destroyed only after all are finished
// with them.
thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(
rmm::get_num_cuda_devices());
auto event = [&, device_id = this->device_id_]() {
if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; }

auto event = std::make_shared<event_wrapper>();
this->default_stream_events.insert(event);
return (events_tls[device_id.value()] = std::move(event))->event;
}();
return stream_event_pair{stream.value(), event};
}
// We use cudaStreamLegacy as the event map key for the default stream for consistency between
// PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the
// user explicitly passes it, so it is used as the default location for the free list
// at construction. For consistency, the same key is used for null stream free lists in non-PTDS
// mode.
// at construction. For consistency, the same key is used for null stream free lists in
// non-PTDS mode.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast)
auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value();

Expand Down Expand Up @@ -496,11 +505,13 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
// bidirectional mapping between non-default streams and events
std::unordered_map<cudaStream_t, stream_event_pair> stream_events_;

// shared pointers to events keeps the events alive as long as either the thread that created them
// or the MR that is using them exists.
// shared pointers to events keeps the events alive as long as either the thread that created
// them or the MR that is using them exists.
std::set<std::shared_ptr<event_wrapper>> default_stream_events;

std::mutex mtx_; // mutex for thread-safe access
}; // namespace detail

rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};
}; // namespace detail

} // namespace rmm::mr::detail
4 changes: 2 additions & 2 deletions include/rmm/mr/device/per_device_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id,
*/
inline device_memory_resource* get_current_device_resource()
{
return get_per_device_resource(rmm::detail::current_device());
return get_per_device_resource(rmm::get_current_cuda_device());
}

/**
Expand Down Expand Up @@ -231,6 +231,6 @@ inline device_memory_resource* get_current_device_resource()
*/
inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)
{
return set_per_device_resource(rmm::detail::current_device(), new_mr);
return set_per_device_resource(rmm::get_current_cuda_device(), new_mr);
}
} // namespace rmm::mr
4 changes: 2 additions & 2 deletions tests/mr/device/cuda_async_view_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ TEST(PoolTest, UsePool)
{
cudaMemPool_t memPool{};
RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
&memPool, rmm::detail::current_device().value()));
&memPool, rmm::get_current_cuda_device().value()));

const auto pool_init_size{100};
cuda_async_view_mr mr{memPool};
Expand All @@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
{
cudaMemPoolProps poolProps = {};
poolProps.allocType = cudaMemAllocationTypePinned;
poolProps.location.id = rmm::detail::current_device().value();
poolProps.location.id = rmm::get_current_cuda_device().value();
poolProps.location.type = cudaMemLocationTypeDevice;

cudaMemPool_t memPool{};
Expand Down
39 changes: 39 additions & 0 deletions tests/mr/device/pool_mr_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
* limitations under the License.
*/

#include <rmm/cuda_device.hpp>
#include <rmm/detail/aligned.hpp>
#include <rmm/detail/cuda_util.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/limiting_resource_adaptor.hpp>
Expand Down Expand Up @@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo)
mr2.deallocate(ptr, 1024);
}

// Regression test: per-device pools with PTDS must record events on the correct device's
// stream. Creates a pool per device, then creates/destroys a device_buffer on each.
TEST(PoolTest, MultidevicePool)
{
  using MemoryResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;

  // This test is a no-op on single-device systems.
  auto const num_devices = rmm::get_num_cuda_devices();
  if (num_devices < 2) { return; }

  rmm::mr::cuda_memory_resource general_mr;

  // Create one fixed-size pool per device and register it as that device's resource.
  constexpr int num_pools         = 2;
  constexpr std::size_t pool_size = 1024;
  std::vector<std::shared_ptr<MemoryResource>> mrs;

  for (int dev = 0; dev < num_pools; ++dev) {
    RMM_CUDA_TRY(cudaSetDevice(dev));
    auto mr = std::make_shared<MemoryResource>(&general_mr, pool_size, pool_size);
    rmm::mr::set_per_device_resource(rmm::cuda_device_id{dev}, mr.get());
    mrs.push_back(std::move(mr));
  }

  {
    RMM_CUDA_TRY(cudaSetDevice(0));
    rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());

    {
      RMM_CUDA_TRY(cudaSetDevice(1));
      rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
    }

    // buf_a must be destroyed while device 0 (its creation device) is current.
    RMM_CUDA_TRY(cudaSetDevice(0));
  }
}

} // namespace
} // namespace rmm::test

0 comments on commit 2ff1bd0

Please sign in to comment.