From 9614d9c6156883a680b8a21ca97c8efa6a683691 Mon Sep 17 00:00:00 2001 From: Michael Varvarin <55709728+MichaelVarvarin@users.noreply.github.com> Date: Fri, 26 Jul 2024 15:03:46 +0300 Subject: [PATCH] Add cooperative kernel launch and grid sync support for HIP --- include/alpaka/acc/AccGpuUniformCudaHipRt.hpp | 4 +-- include/alpaka/alpaka.hpp | 2 +- include/alpaka/core/ApiCudaRt.hpp | 11 ++++++++ include/alpaka/core/ApiHipRt.hpp | 11 ++++++++ ...SyncGpuCuda.hpp => GridSyncGpuCudaHip.hpp} | 27 ++++++++++++++----- .../kernel/TaskKernelGpuUniformCudaHipRt.hpp | 8 +++--- 6 files changed, 49 insertions(+), 14 deletions(-) rename include/alpaka/grid/{GridSyncGpuCuda.hpp => GridSyncGpuCudaHip.hpp} (51%) diff --git a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp index fe2cd949c7bc..5eeb71a2eae2 100644 --- a/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp +++ b/include/alpaka/acc/AccGpuUniformCudaHipRt.hpp @@ -11,7 +11,7 @@ #include "alpaka/block/shared/st/BlockSharedMemStUniformCudaHipBuiltIn.hpp" #include "alpaka/block/sync/BlockSyncUniformCudaHipBuiltIn.hpp" #include "alpaka/core/DemangleTypeNames.hpp" -#include "alpaka/grid/GridSyncGpuCuda.hpp" +#include "alpaka/grid/GridSyncGpuCudaHip.hpp" #include "alpaka/idx/bt/IdxBtUniformCudaHipBuiltIn.hpp" #include "alpaka/idx/gb/IdxGbUniformCudaHipBuiltIn.hpp" #include "alpaka/intrinsic/IntrinsicUniformCudaHipBuiltIn.hpp" @@ -60,7 +60,7 @@ namespace alpaka , public BlockSharedMemDynUniformCudaHipBuiltIn , public BlockSharedMemStUniformCudaHipBuiltIn , public BlockSyncUniformCudaHipBuiltIn - , public GridSyncCudaBuiltIn + , public GridSyncCudaHipBuiltIn , public IntrinsicUniformCudaHipBuiltIn , public MemFenceUniformCudaHipBuiltIn # ifdef ALPAKA_DISABLE_VENDOR_RNG diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index 55070882d9b7..7118ba0857a9 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -34,7 +34,7 @@ #include "alpaka/atomic/Op.hpp" #include "alpaka/atomic/Traits.hpp" // grid -#include "alpaka/grid/GridSyncGpuCuda.hpp" +#include "alpaka/grid/GridSyncGpuCudaHip.hpp" #include "alpaka/grid/Traits.hpp" // block // shared diff --git a/include/alpaka/core/ApiCudaRt.hpp b/include/alpaka/core/ApiCudaRt.hpp index 4a847298357e..1710e7594fa5 100644 --- a/include/alpaka/core/ApiCudaRt.hpp +++ b/include/alpaka/core/ApiCudaRt.hpp @@ -246,6 +246,11 @@ namespace alpaka return ::cudaHostUnregister(ptr); } + static inline Error_t launchCooperativeKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, Stream_t stream) + { + return ::cudaLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream); + } + static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData) { # if CUDART_VERSION >= 10000 @@ -388,6 +393,12 @@ namespace alpaka { return ::make_cudaExtent(w, h, d); } + + template + static inline Error_t occupancyMaxActiveBlocksPerMultiprocessor (int* numBlocks, T func, int blockSize, size_t dynamicSMemSize) + { + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize); + } }; } // namespace alpaka diff --git a/include/alpaka/core/ApiHipRt.hpp b/include/alpaka/core/ApiHipRt.hpp index d000685e7b1b..0ea8790d63c0 100644 --- a/include/alpaka/core/ApiHipRt.hpp +++ b/include/alpaka/core/ApiHipRt.hpp @@ -271,6 +271,11 @@ namespace alpaka return ::hipHostUnregister(ptr); } + static inline Error_t launchCooperativeKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, Stream_t stream) + { + return ::hipLaunchCooperativeKernel(func, gridDim, blockDim, args, sharedMem, stream); + } + static inline Error_t launchHostFunc(Stream_t stream, HostFn_t fn, void* userData) { // hipLaunchHostFunc is implemented only in ROCm 5.4.0 and later. @@ -427,6 +432,12 @@ namespace alpaka { return ::make_hipExtent(w, h, d); } + + template + static inline Error_t occupancyMaxActiveBlocksPerMultiprocessor (int* numBlocks, T func, int blockSize, size_t dynamicSMemSize) + { + return ::hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize); + } }; } // namespace alpaka diff --git a/include/alpaka/grid/GridSyncGpuCuda.hpp b/include/alpaka/grid/GridSyncGpuCudaHip.hpp similarity index 51% rename from include/alpaka/grid/GridSyncGpuCuda.hpp rename to include/alpaka/grid/GridSyncGpuCudaHip.hpp index 54ca9093766b..cdf4fd8b7c11 100644 --- a/include/alpaka/grid/GridSyncGpuCuda.hpp +++ b/include/alpaka/grid/GridSyncGpuCudaHip.hpp @@ -8,14 +8,23 @@ #include "alpaka/core/BoostPredef.hpp" #include "alpaka/core/Concepts.hpp" -#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) -#include +#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) + +# if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) +# include +# endif + +# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) +# include +# endif + + namespace alpaka { - //! The GPU CUDA grid synchronization. - class GridSyncCudaBuiltIn - : public concepts::Implements + //! The GPU CUDA/HIP grid synchronization. + class GridSyncCudaHipBuiltIn + : public concepts::Implements { }; @@ -23,14 +32,18 @@ namespace alpaka # if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA # error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA! +# endif + +# if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP +# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP! # endif namespace trait { template<> - struct SyncGridThreads + struct SyncGridThreads { - __device__ static auto syncGridThreads(GridSyncCudaBuiltIn const& /*gridSync*/) -> void + __device__ static auto syncGridThreads(GridSyncCudaHipBuiltIn const& /*gridSync*/) -> void { cooperative_groups::this_grid().sync(); } diff --git a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp index 429376c77ef3..b5b5e20a2137 100644 --- a/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp +++ b/include/alpaka/kernel/TaskKernelGpuUniformCudaHipRt.hpp @@ -250,7 +250,7 @@ namespace alpaka if constexpr (TCooperative) { // This checks if requested number of blocks is compliant with the maxima of the accelerator. int numBlocksPerSm = 0; - cudaOccupancyMaxActiveBlocksPerMultiprocessor( + TApi::occupancyMaxActiveBlocksPerMultiprocessor( &numBlocksPerSm, kernelName, blockThreadExtent.prod(), @@ -308,7 +308,7 @@ namespace alpaka { void const* kernelArgs[] = {&threadElemExtent, &task.m_kernelFnObj, &args...}; - cudaLaunchCooperativeKernel( + TApi::launchCooperativeKernel( reinterpret_cast(kernelName), gridDim, blockDim, @@ -409,7 +409,7 @@ namespace alpaka if constexpr (TCooperative) { // This checks if requested number of blocks is compliant with the maxima of the accelerator. int numBlocksPerSm = 0; - cudaOccupancyMaxActiveBlocksPerMultiprocessor( + TApi::occupancyMaxActiveBlocksPerMultiprocessor( &numBlocksPerSm, kernelName, blockThreadExtent.prod(), @@ -463,7 +463,7 @@ namespace alpaka { void const* kernelArgs[] = {&threadElemExtent, &task.m_kernelFnObj, &args...}; - cudaLaunchCooperativeKernel( + TApi::launchCooperativeKernel( reinterpret_cast(kernelName), gridDim, blockDim,