From 337e5d3833dbc5f78ddb4a9c8d357470fb9df800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Widera?= Date: Tue, 17 Dec 2024 17:57:37 +0100 Subject: [PATCH] first draft inspired by PMacc CONST_VECTOR and https://github.com/alpaka-group/alpaka/issues/2441#issuecomment-2544817029 --- include/alpaka/alpaka.hpp | 1 + include/alpaka/core/PP.hpp | 1 + include/alpaka/mem/MdSpan.hpp | 10 +- include/alpaka/onAcc/GlobalMem.hpp | 109 +++++++++++++++++++ tests/deviceGlobalMem.cpp | 165 +++++++++++++++++++++++++++++ 5 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 include/alpaka/onAcc/GlobalMem.hpp create mode 100644 tests/deviceGlobalMem.cpp diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp index da46a68..0588720 100644 --- a/include/alpaka/alpaka.hpp +++ b/include/alpaka/alpaka.hpp @@ -21,6 +21,7 @@ #include "alpaka/mem/Iter.hpp" #include "alpaka/onAcc.hpp" #include "alpaka/onAcc/Acc.hpp" +#include "alpaka/onAcc/GlobalMem.hpp" #include "alpaka/onAcc/atomic.hpp" #include "alpaka/onHost.hpp" #include "alpaka/onHost/Device.hpp" diff --git a/include/alpaka/core/PP.hpp b/include/alpaka/core/PP.hpp index d94343f..a79af9f 100644 --- a/include/alpaka/core/PP.hpp +++ b/include/alpaka/core/PP.hpp @@ -7,3 +7,4 @@ #define ALPAKA_PP_CAT(left, right) left##right #define ALPAKA_PP_REMOVE_FIRST_COMMA_DO(ignore, ...) __VA_ARGS__ #define ALPAKA_PP_REMOVE_FIRST_COMMA(...) ALPAKA_PP_REMOVE_FIRST_COMMA_DO(0 __VA_ARGS__) +#define ALPAKA_PP_REMOVE_BRACKETS(...) __VA_ARGS__ diff --git a/include/alpaka/mem/MdSpan.hpp b/include/alpaka/mem/MdSpan.hpp index 3238672..dd47fdb 100644 --- a/include/alpaka/mem/MdSpan.hpp +++ b/include/alpaka/mem/MdSpan.hpp @@ -256,8 +256,16 @@ namespace alpaka }; template - requires(std::is_array_v) struct MdSpanArray + { + static_assert( + sizeof(T_ArrayType) && false, + "MdSpanArray can only be used if std::is_array_v is true for teh given type."); + }; + + template + requires(std::is_array_v) + struct MdSpanArray { using extentType = std::extent>; using element_type = std::remove_all_extents_t; diff --git a/include/alpaka/onAcc/GlobalMem.hpp b/include/alpaka/onAcc/GlobalMem.hpp new file mode 100644 index 0000000..7377958 --- /dev/null +++ b/include/alpaka/onAcc/GlobalMem.hpp @@ -0,0 +1,109 @@ +/* Copyright 2024 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#pragma once + +#include "alpaka/Vec.hpp" +#include "alpaka/core/PP.hpp" +#include "alpaka/core/config.hpp" +#include "alpaka/mem/MdSpan.hpp" + +#include +#include +#include + +namespace alpaka +{ + + template + consteval size_t count_arguments(Args&&...) + { + return sizeof...(Args); + } + +} // namespace alpaka + +#if defined(__CUDA_ARCH__) || (defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__)) +# define ALPAKA_DEVICE_COMPILE 1 +#else +# define ALPAKA_DEVICE_COMPILE 0 +#endif + +/* select namespace depending on __CUDA_ARCH__ compiler flag*/ +#if(ALPAKA_DEVICE_COMPILE == 1) +# define ALPAKA_DEVICE_GLOBAL_NAMESPACE(id) using namespace ALPAKA_PP_CAT(alpaka_onAcc, id) +#else +# define ALPAKA_DEVICE_GLOBAL_NAMESPACE(id) using namespace ALPAKA_PP_CAT(alpaka_onHost, id) +#endif + +#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP +# define ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, ...) \ + namespace ALPAKA_PP_CAT(alpaka_onAcc, id) \ + { \ + __device__ std::type_identity_t ALPAKA_PP_CAT(name, id) \ + = __VA_OPT__({__VA_ARGS__}); \ + } +#else +# define ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, ...) + +#endif + +#if ALPAKA_DEVICE_COMPILE +# define ALPAKA_DEVICE_GLOBAL_ACCESS(name, id) \ + [[maybe_unused]] __device__ constexpr auto name = alpaka::onAcc::GlobalDeviceMemoryWrapper< \ + ALPAKA_PP_CAT(globalVariables, id)::ALPAKA_PP_CAT(GlobalStorage, id)> \ + { \ + } +#else +# define ALPAKA_DEVICE_GLOBAL_ACCESS(name, id) \ + [[maybe_unused]] constexpr auto name = alpaka::onAcc::GlobalDeviceMemoryWrapper< \ + ALPAKA_PP_CAT(globalVariables, id)::ALPAKA_PP_CAT(GlobalStorage, id)> \ + { \ + } +#endif + +namespace alpaka::onAcc +{ + template + struct GlobalDeviceMemoryWrapper : private T_Storage + { + constexpr decltype(auto) get() const + { + return alpaka::unWrapp(T_Storage::get()); + } + + constexpr operator std::reference_wrapper() + { + return T_Storage::get(); + } + }; + +#define ALPAKA_DEVICE_GLOBAL_CREATE(location, dataType, id, name, ...) \ + ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, __VA_ARGS__) \ + namespace ALPAKA_PP_CAT(alpaka_onHost, id) \ + { \ + [[maybe_unused]] std::type_identity_t ALPAKA_PP_CAT(name, id) \ + = __VA_OPT__({__VA_ARGS__}); \ + } \ + namespace ALPAKA_PP_CAT(globalVariables, id) \ + { \ + ALPAKA_DEVICE_GLOBAL_NAMESPACE(id); \ + struct ALPAKA_PP_CAT(GlobalStorage, id) \ + { \ + using type = ALPAKA_PP_REMOVE_BRACKETS dataType; \ + ALPAKA_FN_ACC auto get() const \ + { \ + return std::conditional_t< \ + std::is_array_v, \ + alpaka::MdSpanArray, \ + std::reference_wrapper>{ALPAKA_PP_CAT(name, id)}; \ + } \ + }; \ + } \ + ALPAKA_DEVICE_GLOBAL_ACCESS(name, id) + +#define ALPAKA_DEVICE_GLOBAL(type, name, ...) \ + ALPAKA_DEVICE_GLOBAL_CREATE(__device__, type, __COUNTER__, name, __VA_ARGS__) + +} // namespace alpaka::onAcc diff --git a/tests/deviceGlobalMem.cpp b/tests/deviceGlobalMem.cpp new file mode 100644 index 0000000..8add9d6 --- /dev/null +++ b/tests/deviceGlobalMem.cpp @@ -0,0 +1,165 @@ +/* Copyright 2024 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ +#if 1 +# include +# include +# include + +# include +# include + +# include + +using namespace alpaka; +using namespace alpaka::onHost; + +using TestApis = std::decay_t; + + +ALPAKA_DEVICE_GLOBAL((alpaka::Vec), initialised_vector, 42, 43); +ALPAKA_DEVICE_GLOBAL((uint32_t), initialised_scalar, 43); +ALPAKA_DEVICE_GLOBAL((uint32_t[2]), fixed_sized_array, 44, 45); +ALPAKA_DEVICE_GLOBAL((uint32_t[2][3]), fixed_sized_array2D, {9, 5}, {6, 11, 45}); + +struct DeviceGlobalMemKernelVec +{ + template + ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const + { + for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads})) + { + out[globalTheradIdx] = initialised_vector.get().y(); + } + } +}; + +struct DeviceGlobalMemKernelScalar +{ + template + ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const + { + for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads})) + { + out[globalTheradIdx] = initialised_scalar.get(); + } + } +}; + +struct DeviceGlobalMemKernelCArray +{ + template + ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const + { + for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads})) + { + out[globalTheradIdx] = fixed_sized_array.get()[0]; + } + } +}; + +struct DeviceGlobalMemKernelCArray2D +{ + template + ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const + { + for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads})) + { + out[globalTheradIdx] = fixed_sized_array2D.get()[Vec{1, 2}]; + } + } +}; + +TEMPLATE_LIST_TEST_CASE("device global mem", "", TestApis) +{ + auto cfg = TestType::makeDict(); + auto api = cfg[object::api]; + auto exec = cfg[object::exec]; + + std::cout << api.getName() << std::endl; + + Platform platform = makePlatform(api); + Device device = platform.makeDevice(0); + + std::cout << getName(platform) << " " << device.getName() << std::endl; + + Queue queue = device.makeQueue(); + constexpr Vec numBlocks = Vec{1u}; + constexpr Vec blockExtent = Vec{4u}; + constexpr Vec dataExtent = numBlocks * blockExtent; + std::cout << "block shared iota exec=" << core::demangledName(exec) << std::endl; + auto dBuff = onHost::alloc(device, dataExtent); + + Platform cpuPlatform = makePlatform(api::cpu); + Device cpuDevice = cpuPlatform.makeDevice(0); + auto hBuff = onHost::allocMirror(cpuDevice, dBuff); + wait(queue); + { + onHost::enqueue( + queue, + exec, + FrameSpec{numBlocks, blockExtent}, + KernelBundle{DeviceGlobalMemKernelVec{}, dBuff.getMdSpan(), dataExtent}); + onHost::memcpy(queue, hBuff, dBuff); + wait(queue); + + auto* ptr = onHost::data(hBuff); + for(uint32_t i = 0u; i < dataExtent; ++i) + { + CHECK(42 == ptr[i]); + } + } + + // scalar + { + onHost::enqueue( + queue, + exec, + FrameSpec{numBlocks, blockExtent}, + KernelBundle{DeviceGlobalMemKernelScalar{}, dBuff.getMdSpan(), dataExtent}); + onHost::memcpy(queue, hBuff, dBuff); + wait(queue); + + auto* ptr = onHost::data(hBuff); + for(uint32_t i = 0u; i < dataExtent; ++i) + { + CHECK(43 == ptr[i]); + } + } + + // C array + { + onHost::enqueue( + queue, + exec, + FrameSpec{numBlocks, blockExtent}, + KernelBundle{DeviceGlobalMemKernelCArray{}, dBuff.getMdSpan(), dataExtent}); + onHost::memcpy(queue, hBuff, dBuff); + wait(queue); + + auto* ptr = onHost::data(hBuff); + for(uint32_t i = 0u; i < dataExtent; ++i) + { + CHECK(44 == ptr[i]); + } + } + + // C array 2D + { + onHost::enqueue( + queue, + exec, + FrameSpec{numBlocks, blockExtent}, + KernelBundle{DeviceGlobalMemKernelCArray2D{}, dBuff.getMdSpan(), dataExtent}); + onHost::memcpy(queue, hBuff, dBuff); + wait(queue); + + auto* ptr = onHost::data(hBuff); + for(uint32_t i = 0u; i < dataExtent; ++i) + { + CHECK(45 == ptr[i]); + } + } +} + +#endif