diff --git a/.clang-tidy b/.clang-tidy
index 04689c330..9b3f844c9 100644
--- a/.clang-tidy
+++ b/.clang-tidy
@@ -64,4 +64,6 @@ CheckOptions:
     value: '1'
   - key: readability-magic-numbers.IgnorePowersOf2IntegerValues
     value: '1'
+  - key: cppcoreguidelines-avoid-do-while.IgnoreMacros
+    value: 'true'
 ...
diff --git a/.gitignore b/.gitignore
index 1ab57e4d4..ad6c8ebf7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ DartConfiguration.tcl
 .DS_Store
 *.manifest
 *.spec
+compile_commands.json
 
 ## Python build directories & artifacts
 dist/
diff --git a/README.md b/README.md
index 8d65b7d33..2fa4761df 100644
--- a/README.md
+++ b/README.md
@@ -354,11 +354,32 @@ objects for each device and sets them as the per-device resource for that device
 
 ```c++
 std::vector<unique_ptr<pool_memory_resource>> per_device_pools;
 for(int i = 0; i < N; ++i) {
-    cudaSetDevice(i); // set device i before creating MR
-    // Use a vector of unique_ptr to maintain the lifetime of the MRs
-    per_device_pools.push_back(std::make_unique<pool_memory_resource>());
-    // Set the per-device resource for device i
-    set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
+  cudaSetDevice(i); // set device i before creating MR
+  // Use a vector of unique_ptr to maintain the lifetime of the MRs
+  per_device_pools.push_back(std::make_unique<pool_memory_resource>());
+  // Set the per-device resource for device i
+  set_per_device_resource(cuda_device_id{i}, &per_device_pools.back());
+}
+```
+
+Note that the CUDA device that is current when creating a `device_memory_resource` must also be
+current any time that `device_memory_resource` is used to deallocate memory, including in a
+destructor. This affects RAII classes like `rmm::device_buffer` and `rmm::device_uvector`. Here's an
+(incorrect) example that assumes the above example loop has been run to create a
+`pool_memory_resource` for each device. A correct example adds a call to `cudaSetDevice(0)` on the
+line of the error comment.
+
+```c++
+{
+  RMM_CUDA_TRY(cudaSetDevice(0));
+  rmm::device_buffer buf_a(16);
+
+  {
+    RMM_CUDA_TRY(cudaSetDevice(1));
+    rmm::device_buffer buf_b(16);
+  }
+
+  // Error: when buf_a is destroyed, the current device must be 0, but it is 1
 }
 ```
diff --git a/include/rmm/cuda_device.hpp b/include/rmm/cuda_device.hpp
index 81d35dc3c..8d355ee23 100644
--- a/include/rmm/cuda_device.hpp
+++ b/include/rmm/cuda_device.hpp
@@ -42,7 +42,6 @@ struct cuda_device_id {
   value_type id_;
 };
 
-namespace detail {
 /**
  * @brief Returns a `cuda_device_id` for the current device
  *
@@ -50,11 +49,56 @@ namespace detail {
  * @return `cuda_device_id` for the current device
  */
-inline cuda_device_id current_device()
+inline cuda_device_id get_current_cuda_device()
 {
-  int dev_id{};
-  RMM_CUDA_TRY(cudaGetDevice(&dev_id));
+  cuda_device_id::value_type dev_id{-1};
+  RMM_ASSERT_CUDA_SUCCESS(cudaGetDevice(&dev_id));
   return cuda_device_id{dev_id};
 }
-}  // namespace detail
+
+/**
+ * @brief Returns the number of CUDA devices in the system
+ *
+ * @return Number of CUDA devices in the system
+ */
+inline int get_num_cuda_devices()
+{
+  cuda_device_id::value_type num_dev{-1};
+  RMM_ASSERT_CUDA_SUCCESS(cudaGetDeviceCount(&num_dev));
+  return num_dev;
+}
+
+/**
+ * @brief RAII class that sets the current CUDA device to the specified device on construction
+ * and restores the previous device on destruction.
+ */
+struct cuda_set_device_raii {
+  /**
+   * @brief Construct a new cuda_set_device_raii object and sets the current CUDA device to `dev_id`
+   *
+   * @param dev_id The device to set as the current CUDA device
+   */
+  explicit cuda_set_device_raii(cuda_device_id dev_id)
+    : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
+  {
+    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
+  }
+  /**
+   * @brief Reactivates the previous CUDA device
+   */
+  ~cuda_set_device_raii() noexcept
+  {
+    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
+  }
+
+  cuda_set_device_raii(cuda_set_device_raii const&)            = delete;
+  cuda_set_device_raii& operator=(cuda_set_device_raii const&) = delete;
+  cuda_set_device_raii(cuda_set_device_raii&&)                 = delete;
+  cuda_set_device_raii& operator=(cuda_set_device_raii&&)      = delete;
+
+ private:
+  cuda_device_id old_device_;
+  bool needs_reset_;
+};
+
 }  // namespace rmm
diff --git a/include/rmm/detail/dynamic_load_runtime.hpp b/include/rmm/detail/dynamic_load_runtime.hpp
index 28121e6a8..b45dbae25 100644
--- a/include/rmm/detail/dynamic_load_runtime.hpp
+++ b/include/rmm/detail/dynamic_load_runtime.hpp
@@ -115,7 +115,7 @@ struct async_alloc {
       int cuda_pool_supported{};
       auto result = cudaDeviceGetAttribute(&cuda_pool_supported,
                                            cudaDevAttrMemoryPoolsSupported,
-                                           rmm::detail::current_device().value());
+                                           rmm::get_current_cuda_device().value());
       return result == cudaSuccess and cuda_pool_supported == 1;
     }()};
     return runtime_supports_pool and driver_supports_pool;
@@ -139,7 +139,7 @@ struct async_alloc {
     if (cudaMemHandleTypeNone != handle_type) {
       auto const result = cudaDeviceGetAttribute(&supported_handle_types_bitmask,
                                                  cudaDevAttrMemoryPoolSupportedHandleTypes,
-                                                 rmm::detail::current_device().value());
+                                                 rmm::get_current_cuda_device().value());
 
       // Don't throw on cudaErrorInvalidValue
       auto const unsupported_runtime = (result == cudaErrorInvalidValue);
diff --git a/include/rmm/mr/device/cuda_async_memory_resource.hpp b/include/rmm/mr/device/cuda_async_memory_resource.hpp
index d41eae63e..329d8f29a 100644
--- a/include/rmm/mr/device/cuda_async_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_memory_resource.hpp
@@ -98,7 +98,7 @@ class cuda_async_memory_resource final : public device_memory_resource {
     RMM_EXPECTS(rmm::detail::async_alloc::is_export_handle_type_supported(pool_props.handleTypes),
                 "Requested IPC memory handle type not supported");
     pool_props.location.type = cudaMemLocationTypeDevice;
-    pool_props.location.id   = rmm::detail::current_device().value();
+    pool_props.location.id   = rmm::get_current_cuda_device().value();
     cudaMemPool_t cuda_pool_handle{};
     RMM_CUDA_TRY(rmm::detail::async_alloc::cudaMemPoolCreate(&cuda_pool_handle, &pool_props));
     pool_ = cuda_async_view_memory_resource{cuda_pool_handle};
diff --git a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
index 806ace807..c685cd75f 100644
--- a/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
+++ b/include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -60,7 +60,7 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
     }()}
   {
     // Check if cudaMallocAsync Memory pool supported
-    auto const device = rmm::detail::current_device();
+    auto const device = rmm::get_current_cuda_device();
     int cuda_pool_supported{};
     auto result =
       cudaDeviceGetAttribute(&cuda_pool_supported, cudaDevAttrMemoryPoolsSupported, device.value());
diff --git a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
index 53575e5ce..f071717c0 100644
--- a/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
+++ b/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp
@@ -15,15 +15,16 @@
  */
 #pragma once
 
+#include <rmm/cuda_device.hpp>
 #include 
 #include 
 #include 
 #include 
 
-#include 
-
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -288,17 +289,25 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
   stream_event_pair get_event(cuda_stream_view stream)
   {
     if (stream.is_per_thread_default()) {
-      // Create a thread-local shared event wrapper. Shared pointers in the thread and in each MR
-      // instance ensures it is destroyed cleaned up only after all are finished with it.
-      thread_local auto event_tls = std::make_shared<event_wrapper>();
-      default_stream_events.insert(event_tls);
-      return stream_event_pair{stream.value(), event_tls->event};
+      // Create a thread-local shared event wrapper for each device. Shared pointers in the thread
+      // and in each MR instance ensure the wrappers are destroyed only after all are finished
+      // with them.
+      thread_local std::vector<std::shared_ptr<event_wrapper>> events_tls(
+        rmm::get_num_cuda_devices());
+      auto event = [&, device_id = this->device_id_]() {
+        if (events_tls[device_id.value()]) { return events_tls[device_id.value()]->event; }
+
+        auto event = std::make_shared<event_wrapper>();
+        this->default_stream_events.insert(event);
+        return (events_tls[device_id.value()] = std::move(event))->event;
+      }();
+      return stream_event_pair{stream.value(), event};
     }
 
     // We use cudaStreamLegacy as the event map key for the default stream for consistency between
     // PTDS and non-PTDS mode. In PTDS mode, the cudaStreamLegacy map key will only exist if the
     // user explicitly passes it, so it is used as the default location for the free list
-    // at construction. For consistency, the same key is used for null stream free lists in non-PTDS
-    // mode.
+    // at construction. For consistency, the same key is used for null stream free lists in
+    // non-PTDS mode.
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-cstyle-cast)
     auto* const stream_to_store = stream.is_default() ? cudaStreamLegacy : stream.value();
@@ -496,11 +505,13 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_
   // bidirectional mapping between non-default streams and events
   std::unordered_map<cudaStream_t, stream_event_pair> stream_events_;
 
-  // shared pointers to events keeps the events alive as long as either the thread that created them
-  // or the MR that is using them exists.
+  // shared pointers to events keeps the events alive as long as either the thread that created
+  // them or the MR that is using them exists.
   std::set<std::shared_ptr<event_wrapper>> default_stream_events;
 
   std::mutex mtx_;  // mutex for thread-safe access
-};                  // namespace detail
+
+  rmm::cuda_device_id device_id_{rmm::get_current_cuda_device()};
+};  // namespace detail
 
 }  // namespace rmm::mr::detail
diff --git a/include/rmm/mr/device/per_device_resource.hpp b/include/rmm/mr/device/per_device_resource.hpp
index 371c97fdf..aa7217758 100644
--- a/include/rmm/mr/device/per_device_resource.hpp
+++ b/include/rmm/mr/device/per_device_resource.hpp
@@ -202,7 +202,7 @@ inline device_memory_resource* set_per_device_resource(cuda_device_id device_id,
  */
 inline device_memory_resource* get_current_device_resource()
 {
-  return get_per_device_resource(rmm::detail::current_device());
+  return get_per_device_resource(rmm::get_current_cuda_device());
 }
 
 /**
@@ -231,6 +231,6 @@ inline device_memory_resource* get_current_device_resource()
  */
 inline device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)
 {
-  return set_per_device_resource(rmm::detail::current_device(), new_mr);
+  return set_per_device_resource(rmm::get_current_cuda_device(), new_mr);
 }
 }  // namespace rmm::mr
diff --git a/tests/mr/device/cuda_async_view_mr_tests.cpp b/tests/mr/device/cuda_async_view_mr_tests.cpp
index 86cb6f106..209429b4b 100644
--- a/tests/mr/device/cuda_async_view_mr_tests.cpp
+++ b/tests/mr/device/cuda_async_view_mr_tests.cpp
@@ -31,7 +31,7 @@ TEST(PoolTest, UsePool)
 {
   cudaMemPool_t memPool{};
   RMM_CUDA_TRY(rmm::detail::async_alloc::cudaDeviceGetDefaultMemPool(
-    &memPool, rmm::detail::current_device().value()));
+    &memPool, rmm::get_current_cuda_device().value()));
 
   const auto pool_init_size{100};
   cuda_async_view_mr mr{memPool};
@@ -44,7 +44,7 @@ TEST(PoolTest, NotTakingOwnershipOfPool)
 {
   cudaMemPoolProps poolProps = {};
   poolProps.allocType        = cudaMemAllocationTypePinned;
-  poolProps.location.id      = rmm::detail::current_device().value();
+  poolProps.location.id      = rmm::get_current_cuda_device().value();
   poolProps.location.type    = cudaMemLocationTypeDevice;
 
   cudaMemPool_t memPool{};
diff --git a/tests/mr/device/pool_mr_tests.cpp b/tests/mr/device/pool_mr_tests.cpp
index c5df1951c..6bcd90527 100644
--- a/tests/mr/device/pool_mr_tests.cpp
+++ b/tests/mr/device/pool_mr_tests.cpp
@@ -14,10 +14,12 @@
  * limitations under the License.
  */
 
+#include <rmm/cuda_device.hpp>
 #include 
 #include 
 #include 
 #include 
+#include <rmm/device_buffer.hpp>
 #include 
 #include 
 #include 
@@ -150,5 +152,42 @@ TEST(PoolTest, UpstreamDoesntSupportMemInfo)
   mr2.deallocate(ptr, 1024);
 }
 
+TEST(PoolTest, MultidevicePool)
+{
+  using MemoryResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>;
+
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::mr::cuda_memory_resource general_mr;
+
+    // initializing pool_memory_resource of multiple devices
+    int devices      = 2;
+    size_t pool_size = 1024;
+    std::vector<std::shared_ptr<MemoryResource>> mrs;
+
+    for (int i = 0; i < devices; ++i) {
+      RMM_CUDA_TRY(cudaSetDevice(i));
+      auto mr = std::make_shared<MemoryResource>(&general_mr, pool_size, pool_size);
+      rmm::mr::set_per_device_resource(rmm::cuda_device_id{i}, mr.get());
+      mrs.emplace_back(mr);
+    }
+
+    {
+      RMM_CUDA_TRY(cudaSetDevice(0));
+      rmm::device_buffer buf_a(16, rmm::cuda_stream_per_thread, mrs[0].get());
+
+      {
+        RMM_CUDA_TRY(cudaSetDevice(1));
+        rmm::device_buffer buf_b(16, rmm::cuda_stream_per_thread, mrs[1].get());
+      }
+
+      RMM_CUDA_TRY(cudaSetDevice(0));
+    }
+  }
+}
+
 }  // namespace
 }  // namespace rmm::test
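
For reference, a minimal sketch (not part of the diff) of the corrected form of the README example above, under the same assumption that the per-device `pool_memory_resource` loop has already been run; the only change from the incorrect example is the extra `cudaSetDevice(0)` call on the line of the error comment, which mirrors what the new `MultidevicePool` test does before its outer scope closes:

```c++
{
  RMM_CUDA_TRY(cudaSetDevice(0));
  rmm::device_buffer buf_a(16);

  {
    RMM_CUDA_TRY(cudaSetDevice(1));
    rmm::device_buffer buf_b(16);
    // buf_b is destroyed here while device 1 is still current, which is correct
  }

  // Make device 0 current again so buf_a is deallocated on the device it was allocated on
  RMM_CUDA_TRY(cudaSetDevice(0));
}
```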