first draft

inspired by PMacc CONST_VECTOR and alpaka-group/alpaka#2441 (comment)
psychocoderHPC · Dec 17, 2024 · 337e5d3 · 337e5d3
1 parent a4e4b9e
commit 337e5d3
Show file tree

Hide file tree

Showing 5 changed files with 285 additions and 1 deletion.
diff --git a/include/alpaka/alpaka.hpp b/include/alpaka/alpaka.hpp
@@ -21,6 +21,7 @@
 #include "alpaka/mem/Iter.hpp"
 #include "alpaka/onAcc.hpp"
 #include "alpaka/onAcc/Acc.hpp"
+#include "alpaka/onAcc/GlobalMem.hpp"
 #include "alpaka/onAcc/atomic.hpp"
 #include "alpaka/onHost.hpp"
 #include "alpaka/onHost/Device.hpp"

diff --git a/include/alpaka/core/PP.hpp b/include/alpaka/core/PP.hpp
@@ -7,3 +7,4 @@
 #define ALPAKA_PP_CAT(left, right) left##right
 #define ALPAKA_PP_REMOVE_FIRST_COMMA_DO(ignore, ...) __VA_ARGS__
 #define ALPAKA_PP_REMOVE_FIRST_COMMA(...) ALPAKA_PP_REMOVE_FIRST_COMMA_DO(0 __VA_ARGS__)
+#define ALPAKA_PP_REMOVE_BRACKETS(...) __VA_ARGS__
diff --git a/include/alpaka/mem/MdSpan.hpp b/include/alpaka/mem/MdSpan.hpp
@@ -256,8 +256,16 @@ namespace alpaka
     };
 
     template<typename T_ArrayType>
-    requires(std::is_array_v<T_ArrayType>)
     struct MdSpanArray
+    {
+        static_assert(
+            sizeof(T_ArrayType) && false,
+            "MdSpanArray can only be used if std::is_array_v<T> is true for teh given type.");
+    };
+
+    template<typename T_ArrayType>
+    requires(std::is_array_v<T_ArrayType>)
+    struct MdSpanArray<T_ArrayType>
     {
         using extentType = std::extent<T_ArrayType, std::rank_v<T_ArrayType>>;
         using element_type = std::remove_all_extents_t<T_ArrayType>;

diff --git a/include/alpaka/onAcc/GlobalMem.hpp b/include/alpaka/onAcc/GlobalMem.hpp
@@ -0,0 +1,109 @@
+/* Copyright 2024 René Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+
+#pragma once
+
+#include "alpaka/Vec.hpp"
+#include "alpaka/core/PP.hpp"
+#include "alpaka/core/config.hpp"
+#include "alpaka/mem/MdSpan.hpp"
+
+#include <cassert>
+#include <tuple>
+#include <type_traits>
+
+namespace alpaka
+{
+
+    template<typename... Args>
+    consteval size_t count_arguments(Args&&...)
+    {
+        return sizeof...(Args);
+    }
+
+} // namespace alpaka
+
+#if defined(__CUDA_ARCH__) || (defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__))
+#    define ALPAKA_DEVICE_COMPILE 1
+#else
+#    define ALPAKA_DEVICE_COMPILE 0
+#endif
+
+/* select namespace depending on __CUDA_ARCH__ compiler flag*/
+#if(ALPAKA_DEVICE_COMPILE == 1)
+#    define ALPAKA_DEVICE_GLOBAL_NAMESPACE(id) using namespace ALPAKA_PP_CAT(alpaka_onAcc, id)
+#else
+#    define ALPAKA_DEVICE_GLOBAL_NAMESPACE(id) using namespace ALPAKA_PP_CAT(alpaka_onHost, id)
+#endif
+
+#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
+#    define ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, ...)                                                        \
+        namespace ALPAKA_PP_CAT(alpaka_onAcc, id)                                                                     \
+        {                                                                                                             \
+            __device__ std::type_identity_t<ALPAKA_PP_REMOVE_BRACKETS dataType> ALPAKA_PP_CAT(name, id)               \
+                = __VA_OPT__({__VA_ARGS__});                                                                          \
+        }
+#else
+#    define ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, ...)
+
+#endif
+
+#if ALPAKA_DEVICE_COMPILE
+#    define ALPAKA_DEVICE_GLOBAL_ACCESS(name, id)                                                                     \
+        [[maybe_unused]] __device__ constexpr auto name = alpaka::onAcc::GlobalDeviceMemoryWrapper<                   \
+            ALPAKA_PP_CAT(globalVariables, id)::ALPAKA_PP_CAT(GlobalStorage, id)>                                     \
+        {                                                                                                             \
+        }
+#else
+#    define ALPAKA_DEVICE_GLOBAL_ACCESS(name, id)                                                                     \
+        [[maybe_unused]] constexpr auto name = alpaka::onAcc::GlobalDeviceMemoryWrapper<                              \
+            ALPAKA_PP_CAT(globalVariables, id)::ALPAKA_PP_CAT(GlobalStorage, id)>                                     \
+        {                                                                                                             \
+        }
+#endif
+
+namespace alpaka::onAcc
+{
+    template<typename T_Storage>
+    struct GlobalDeviceMemoryWrapper : private T_Storage
+    {
+        constexpr decltype(auto) get() const
+        {
+            return alpaka::unWrapp(T_Storage::get());
+        }
+
+        constexpr operator std::reference_wrapper<typename T_Storage::type>()
+        {
+            return T_Storage::get();
+        }
+    };
+
+#define ALPAKA_DEVICE_GLOBAL_CREATE(location, dataType, id, name, ...)                                                \
+    ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, __VA_ARGS__)                                                        \
+    namespace ALPAKA_PP_CAT(alpaka_onHost, id)                                                                        \
+    {                                                                                                                 \
+        [[maybe_unused]] std::type_identity_t<ALPAKA_PP_REMOVE_BRACKETS dataType> ALPAKA_PP_CAT(name, id)             \
+            = __VA_OPT__({__VA_ARGS__});                                                                              \
+    }                                                                                                                 \
+    namespace ALPAKA_PP_CAT(globalVariables, id)                                                                      \
+    {                                                                                                                 \
+        ALPAKA_DEVICE_GLOBAL_NAMESPACE(id);                                                                           \
+        struct ALPAKA_PP_CAT(GlobalStorage, id)                                                                       \
+        {                                                                                                             \
+            using type = ALPAKA_PP_REMOVE_BRACKETS dataType;                                                          \
+            ALPAKA_FN_ACC auto get() const                                                                            \
+            {                                                                                                         \
+                return std::conditional_t<                                                                            \
+                    std::is_array_v<ALPAKA_PP_REMOVE_BRACKETS dataType>,                                              \
+                    alpaka::MdSpanArray<ALPAKA_PP_REMOVE_BRACKETS dataType>,                                          \
+                    std::reference_wrapper<ALPAKA_PP_REMOVE_BRACKETS dataType>>{ALPAKA_PP_CAT(name, id)};             \
+            }                                                                                                         \
+        };                                                                                                            \
+    }                                                                                                                 \
+    ALPAKA_DEVICE_GLOBAL_ACCESS(name, id)
+
+#define ALPAKA_DEVICE_GLOBAL(type, name, ...)                                                                         \
+    ALPAKA_DEVICE_GLOBAL_CREATE(__device__, type, __COUNTER__, name, __VA_ARGS__)
+
+} // namespace alpaka::onAcc
diff --git a/tests/deviceGlobalMem.cpp b/tests/deviceGlobalMem.cpp
@@ -0,0 +1,165 @@
+/* Copyright 2024 René Widera
+ * SPDX-License-Identifier: MPL-2.0
+ */
+#if 1
+#    include <alpaka/alpaka.hpp>
+#    include <alpaka/example/executeForEach.hpp>
+#    include <alpaka/example/executors.hpp>
+
+#    include <catch2/catch_template_test_macros.hpp>
+#    include <catch2/catch_test_macros.hpp>
+
+#    include <iostream>
+
+using namespace alpaka;
+using namespace alpaka::onHost;
+
+using TestApis = std::decay_t<decltype(allExecutorsAndApis(enabledApis))>;
+
+
+ALPAKA_DEVICE_GLOBAL((alpaka::Vec<uint32_t, 2u>), initialised_vector, 42, 43);
+ALPAKA_DEVICE_GLOBAL((uint32_t), initialised_scalar, 43);
+ALPAKA_DEVICE_GLOBAL((uint32_t[2]), fixed_sized_array, 44, 45);
+ALPAKA_DEVICE_GLOBAL((uint32_t[2][3]), fixed_sized_array2D, {9, 5}, {6, 11, 45});
+
+struct DeviceGlobalMemKernelVec
+{
+    template<typename T>
+    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
+    {
+        for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads}))
+        {
+            out[globalTheradIdx] = initialised_vector.get().y();
+        }
+    }
+};
+
+struct DeviceGlobalMemKernelScalar
+{
+    template<typename T>
+    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
+    {
+        for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads}))
+        {
+            out[globalTheradIdx] = initialised_scalar.get();
+        }
+    }
+};
+
+struct DeviceGlobalMemKernelCArray
+{
+    template<typename T>
+    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
+    {
+        for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads}))
+        {
+            out[globalTheradIdx] = fixed_sized_array.get()[0];
+        }
+    }
+};
+
+struct DeviceGlobalMemKernelCArray2D
+{
+    template<typename T>
+    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
+    {
+        for(auto globalTheradIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{numThreads}))
+        {
+            out[globalTheradIdx] = fixed_sized_array2D.get()[Vec{1, 2}];
+        }
+    }
+};
+
+TEMPLATE_LIST_TEST_CASE("device global mem", "", TestApis)
+{
+    auto cfg = TestType::makeDict();
+    auto api = cfg[object::api];
+    auto exec = cfg[object::exec];
+
+    std::cout << api.getName() << std::endl;
+
+    Platform platform = makePlatform(api);
+    Device device = platform.makeDevice(0);
+
+    std::cout << getName(platform) << " " << device.getName() << std::endl;
+
+    Queue queue = device.makeQueue();
+    constexpr Vec numBlocks = Vec{1u};
+    constexpr Vec blockExtent = Vec{4u};
+    constexpr Vec dataExtent = numBlocks * blockExtent;
+    std::cout << "block shared iota exec=" << core::demangledName(exec) << std::endl;
+    auto dBuff = onHost::alloc<uint32_t>(device, dataExtent);
+
+    Platform cpuPlatform = makePlatform(api::cpu);
+    Device cpuDevice = cpuPlatform.makeDevice(0);
+    auto hBuff = onHost::allocMirror(cpuDevice, dBuff);
+    wait(queue);
+    {
+        onHost::enqueue(
+            queue,
+            exec,
+            FrameSpec{numBlocks, blockExtent},
+            KernelBundle{DeviceGlobalMemKernelVec{}, dBuff.getMdSpan(), dataExtent});
+        onHost::memcpy(queue, hBuff, dBuff);
+        wait(queue);
+
+        auto* ptr = onHost::data(hBuff);
+        for(uint32_t i = 0u; i < dataExtent; ++i)
+        {
+            CHECK(42 == ptr[i]);
+        }
+    }
+
+    // scalar
+    {
+        onHost::enqueue(
+            queue,
+            exec,
+            FrameSpec{numBlocks, blockExtent},
+            KernelBundle{DeviceGlobalMemKernelScalar{}, dBuff.getMdSpan(), dataExtent});
+        onHost::memcpy(queue, hBuff, dBuff);
+        wait(queue);
+
+        auto* ptr = onHost::data(hBuff);
+        for(uint32_t i = 0u; i < dataExtent; ++i)
+        {
+            CHECK(43 == ptr[i]);
+        }
+    }
+
+    // C array
+    {
+        onHost::enqueue(
+            queue,
+            exec,
+            FrameSpec{numBlocks, blockExtent},
+            KernelBundle{DeviceGlobalMemKernelCArray{}, dBuff.getMdSpan(), dataExtent});
+        onHost::memcpy(queue, hBuff, dBuff);
+        wait(queue);
+
+        auto* ptr = onHost::data(hBuff);
+        for(uint32_t i = 0u; i < dataExtent; ++i)
+        {
+            CHECK(44 == ptr[i]);
+        }
+    }
+
+    // C array 2D
+    {
+        onHost::enqueue(
+            queue,
+            exec,
+            FrameSpec{numBlocks, blockExtent},
+            KernelBundle{DeviceGlobalMemKernelCArray2D{}, dBuff.getMdSpan(), dataExtent});
+        onHost::memcpy(queue, hBuff, dBuff);
+        wait(queue);
+
+        auto* ptr = onHost::data(hBuff);
+        for(uint32_t i = 0u; i < dataExtent; ++i)
+        {
+            CHECK(45 == ptr[i]);
+        }
+    }
+}
+
+#endif