Skip to content

Commit

Permalink
first draft
Browse files Browse the repository at this point in the history
inspired by PMacc CONST_VECTOR and alpaka-group/alpaka#2441 (comment)
  • Loading branch information
psychocoderHPC committed Dec 17, 2024
1 parent a4e4b9e commit 337e5d3
Show file tree
Hide file tree
Showing 5 changed files with 285 additions and 1 deletion.
1 change: 1 addition & 0 deletions include/alpaka/alpaka.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "alpaka/mem/Iter.hpp"
#include "alpaka/onAcc.hpp"
#include "alpaka/onAcc/Acc.hpp"
#include "alpaka/onAcc/GlobalMem.hpp"
#include "alpaka/onAcc/atomic.hpp"
#include "alpaka/onHost.hpp"
#include "alpaka/onHost/Device.hpp"
Expand Down
1 change: 1 addition & 0 deletions include/alpaka/core/PP.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
// Concatenate two preprocessor tokens into one.
#define ALPAKA_PP_CAT(left, right) left##right
// Helper: discards the first argument and yields the rest.
#define ALPAKA_PP_REMOVE_FIRST_COMMA_DO(ignore, ...) __VA_ARGS__
// Remove a possible leading comma from __VA_ARGS__ (the `0` pairs with the comma to be dropped).
#define ALPAKA_PP_REMOVE_FIRST_COMMA(...) ALPAKA_PP_REMOVE_FIRST_COMMA_DO(0 __VA_ARGS__)
// Strip one level of parentheses: ALPAKA_PP_REMOVE_BRACKETS (int[2]) -> int[2].
#define ALPAKA_PP_REMOVE_BRACKETS(...) __VA_ARGS__
10 changes: 9 additions & 1 deletion include/alpaka/mem/MdSpan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,8 +256,16 @@ namespace alpaka
};

template<typename T_ArrayType>
requires(std::is_array_v<T_ArrayType>)
struct MdSpanArray
{
static_assert(
sizeof(T_ArrayType) && false,
"MdSpanArray can only be used if std::is_array_v<T> is true for teh given type.");
};

template<typename T_ArrayType>
requires(std::is_array_v<T_ArrayType>)
struct MdSpanArray<T_ArrayType>
{
using extentType = std::extent<T_ArrayType, std::rank_v<T_ArrayType>>;
using element_type = std::remove_all_extents_t<T_ArrayType>;
Expand Down
109 changes: 109 additions & 0 deletions include/alpaka/onAcc/GlobalMem.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/

#pragma once

#include "alpaka/Vec.hpp"
#include "alpaka/core/PP.hpp"
#include "alpaka/core/config.hpp"
#include "alpaka/mem/MdSpan.hpp"

#include <cassert>
#include <cstddef>
#include <functional>
#include <tuple>
#include <type_traits>

namespace alpaka
{

template<typename... Args>
consteval size_t count_arguments(Args&&...)
{
return sizeof...(Args);
}

} // namespace alpaka

/* Detect whether the current compiler pass generates device code.
 * __CUDA_ARCH__ is only defined during the CUDA device pass; the HIP check additionally
 * requires __HIP__ so that non-HIP compilers defining __HIP_DEVICE_COMPILE__ are excluded.
 */
#if defined(__CUDA_ARCH__) || (defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__ == 1 && defined(__HIP__))
#    define ALPAKA_DEVICE_COMPILE 1
#else
#    define ALPAKA_DEVICE_COMPILE 0
#endif

/* Select the namespace holding the variable storage, depending on whether device or host
 * code is being compiled (storage namespaces are generated by ALPAKA_DEVICE_GLOBAL_CREATE).
 */
#if(ALPAKA_DEVICE_COMPILE == 1)
#    define ALPAKA_DEVICE_GLOBAL_NAMESPACE(id) using namespace ALPAKA_PP_CAT(alpaka_onAcc, id)
#else
#    define ALPAKA_DEVICE_GLOBAL_NAMESPACE(id) using namespace ALPAKA_PP_CAT(alpaka_onHost, id)
#endif

#if ALPAKA_LANG_CUDA || ALPAKA_LANG_HIP
/** Define the device-side storage for a global variable.
 *
 * @param id unique id appended to names to avoid collisions
 * @param dataType variable type, wrapped in parentheses
 * @param name user visible variable name
 * @param ... optional initializer list
 *
 * Fix: the `=` must live inside __VA_OPT__ — with an empty initializer list the previous
 * form `= __VA_OPT__({...})` expanded to the ill-formed `name = ;`.
 */
#    define ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, ...)                                                        \
        namespace ALPAKA_PP_CAT(alpaka_onAcc, id)                                                                     \
        {                                                                                                             \
            __device__ std::type_identity_t<ALPAKA_PP_REMOVE_BRACKETS dataType> ALPAKA_PP_CAT(name, id)               \
                __VA_OPT__(= {__VA_ARGS__});                                                                          \
        }
#else
/* No separate device storage for pure host compilation. */
#    define ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, ...)
#endif

/* Create the accessor object `name` that user code touches. The device pass marks it
 * __device__ constexpr, the host pass only constexpr; both wrap the generated storage
 * struct so reads are redirected to the storage selected by ALPAKA_DEVICE_GLOBAL_NAMESPACE.
 */
#if ALPAKA_DEVICE_COMPILE
#    define ALPAKA_DEVICE_GLOBAL_ACCESS(name, id)                                                                     \
        [[maybe_unused]] __device__ constexpr auto name = alpaka::onAcc::GlobalDeviceMemoryWrapper<                   \
            ALPAKA_PP_CAT(globalVariables, id)::ALPAKA_PP_CAT(GlobalStorage, id)>                                     \
        {                                                                                                             \
        }
#else
#    define ALPAKA_DEVICE_GLOBAL_ACCESS(name, id)                                                                     \
        [[maybe_unused]] constexpr auto name = alpaka::onAcc::GlobalDeviceMemoryWrapper<                              \
            ALPAKA_PP_CAT(globalVariables, id)::ALPAKA_PP_CAT(GlobalStorage, id)>                                     \
        {                                                                                                             \
        }
#endif

namespace alpaka::onAcc
{
    /** Accessor wrapper around a generated per-variable storage struct.
     *
     * Privately inherits the storage type so the wrapper itself stays an empty,
     * constexpr-constructible object; all state lives in the global variable the
     * storage's get() refers to.
     *
     * @tparam T_Storage generated GlobalStorage type providing `type` and `get()`
     */
    template<typename T_Storage>
    struct GlobalDeviceMemoryWrapper : private T_Storage
    {
        // Unwrap the storage handle (reference_wrapper or MdSpanArray) to the
        // user-facing value. NOTE(review): relies on alpaka::unWrapp, defined elsewhere.
        constexpr decltype(auto) get() const
        {
            return alpaka::unWrapp(T_Storage::get());
        }

        // Implicit conversion so the wrapper can be passed where a reference to the
        // underlying (non-array) type is expected.
        constexpr operator std::reference_wrapper<typename T_Storage::type>()
        {
            return T_Storage::get();
        }
    };

/** Define host storage, device storage and the user-facing accessor for a global variable.
 *
 * @param location placement qualifier (currently unused, forwarded for future extension)
 * @param dataType variable type wrapped in parentheses, e.g. (int) or (uint32_t[2])
 * @param id unique id appended to generated names to avoid collisions
 * @param name user visible variable name
 * @param ... optional initializer list
 *
 * Fix: the `=` must live inside __VA_OPT__ — with an empty initializer list the previous
 * form `= __VA_OPT__({...})` expanded to the ill-formed `name = ;`.
 */
#define ALPAKA_DEVICE_GLOBAL_CREATE(location, dataType, id, name, ...)                                                \
    ALPAKA_DEVICE_GLOBAL_DATA(id, dataType, name, __VA_ARGS__)                                                        \
    namespace ALPAKA_PP_CAT(alpaka_onHost, id)                                                                        \
    {                                                                                                                 \
        [[maybe_unused]] std::type_identity_t<ALPAKA_PP_REMOVE_BRACKETS dataType> ALPAKA_PP_CAT(name, id)             \
            __VA_OPT__(= {__VA_ARGS__});                                                                              \
    }                                                                                                                 \
    namespace ALPAKA_PP_CAT(globalVariables, id)                                                                      \
    {                                                                                                                 \
        ALPAKA_DEVICE_GLOBAL_NAMESPACE(id);                                                                           \
        struct ALPAKA_PP_CAT(GlobalStorage, id)                                                                       \
        {                                                                                                             \
            using type = ALPAKA_PP_REMOVE_BRACKETS dataType;                                                          \
            ALPAKA_FN_ACC auto get() const                                                                            \
            {                                                                                                         \
                /* Arrays are viewed through MdSpanArray, scalars through reference_wrapper. */                       \
                return std::conditional_t<                                                                            \
                    std::is_array_v<ALPAKA_PP_REMOVE_BRACKETS dataType>,                                              \
                    alpaka::MdSpanArray<ALPAKA_PP_REMOVE_BRACKETS dataType>,                                          \
                    std::reference_wrapper<ALPAKA_PP_REMOVE_BRACKETS dataType>>{ALPAKA_PP_CAT(name, id)};             \
            }                                                                                                         \
        };                                                                                                            \
    }                                                                                                                 \
    ALPAKA_DEVICE_GLOBAL_ACCESS(name, id)

/** Create a global variable usable on host and device.
 *
 * Usage: ALPAKA_DEVICE_GLOBAL((uint32_t[2]), myArray, 1, 2);
 * The type must always be wrapped in parentheses.
 */
#define ALPAKA_DEVICE_GLOBAL(type, name, ...)                                                                         \
    ALPAKA_DEVICE_GLOBAL_CREATE(__device__, type, __COUNTER__, name, __VA_ARGS__)

} // namespace alpaka::onAcc
165 changes: 165 additions & 0 deletions tests/deviceGlobalMem.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/* Copyright 2024 René Widera
* SPDX-License-Identifier: MPL-2.0
*/
#if 1
# include <alpaka/alpaka.hpp>
# include <alpaka/example/executeForEach.hpp>
# include <alpaka/example/executors.hpp>

# include <catch2/catch_template_test_macros.hpp>
# include <catch2/catch_test_macros.hpp>

# include <iostream>

using namespace alpaka;
using namespace alpaka::onHost;

using TestApis = std::decay_t<decltype(allExecutorsAndApis(enabledApis))>;


ALPAKA_DEVICE_GLOBAL((alpaka::Vec<uint32_t, 2u>), initialised_vector, 42, 43);
ALPAKA_DEVICE_GLOBAL((uint32_t), initialised_scalar, 43);
ALPAKA_DEVICE_GLOBAL((uint32_t[2]), fixed_sized_array, 44, 45);
ALPAKA_DEVICE_GLOBAL((uint32_t[2][3]), fixed_sized_array2D, {9, 5}, {6, 11, 45});

//! Writes the y component of the globally initialised vector into each output cell.
struct DeviceGlobalMemKernelVec
{
    template<typename T>
    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
    {
        auto const idxRange = IdxRange{numThreads};
        for(auto linearIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, idxRange))
            out[linearIdx] = initialised_vector.get().y();
    }
};

//! Writes the globally initialised scalar into each output cell.
struct DeviceGlobalMemKernelScalar
{
    template<typename T>
    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
    {
        auto const idxRange = IdxRange{numThreads};
        for(auto linearIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, idxRange))
            out[linearIdx] = initialised_scalar.get();
    }
};

//! Writes element 0 of the globally initialised C array into each output cell.
struct DeviceGlobalMemKernelCArray
{
    template<typename T>
    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
    {
        auto const idxRange = IdxRange{numThreads};
        for(auto linearIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, idxRange))
            out[linearIdx] = fixed_sized_array.get()[0];
    }
};

//! Writes element [1][2] of the globally initialised 2D C array into each output cell.
struct DeviceGlobalMemKernelCArray2D
{
    template<typename T>
    ALPAKA_FN_ACC void operator()(T const& acc, auto out, auto numThreads) const
    {
        auto const idxRange = IdxRange{numThreads};
        for(auto linearIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, idxRange))
            out[linearIdx] = fixed_sized_array2D.get()[Vec{1, 2}];
    }
};

//! Verifies that ALPAKA_DEVICE_GLOBAL variables (vector, scalar, 1D and 2D C arrays)
//! are correctly initialised and readable from kernels on every enabled API/executor.
TEMPLATE_LIST_TEST_CASE("device global mem", "", TestApis)
{
    auto cfg = TestType::makeDict();
    auto api = cfg[object::api];
    auto exec = cfg[object::exec];

    std::cout << api.getName() << std::endl;

    Platform platform = makePlatform(api);
    Device device = platform.makeDevice(0);

    std::cout << getName(platform) << " " << device.getName() << std::endl;

    Queue queue = device.makeQueue();
    constexpr Vec numBlocks = Vec{1u};
    constexpr Vec blockExtent = Vec{4u};
    constexpr Vec dataExtent = numBlocks * blockExtent;
    // fixed log message: the old text "block shared iota" was a copy-paste leftover
    // from another test and did not describe this test case
    std::cout << "device global mem exec=" << core::demangledName(exec) << std::endl;
    auto dBuff = onHost::alloc<uint32_t>(device, dataExtent);

    Platform cpuPlatform = makePlatform(api::cpu);
    Device cpuDevice = cpuPlatform.makeDevice(0);
    auto hBuff = onHost::allocMirror(cpuDevice, dBuff);
    wait(queue);

    // vector: every thread reads initialised_vector.get().y()
    {
        onHost::enqueue(
            queue,
            exec,
            FrameSpec{numBlocks, blockExtent},
            KernelBundle{DeviceGlobalMemKernelVec{}, dBuff.getMdSpan(), dataExtent});
        onHost::memcpy(queue, hBuff, dBuff);
        wait(queue);

        auto* ptr = onHost::data(hBuff);
        for(uint32_t i = 0u; i < dataExtent; ++i)
        {
            CHECK(42 == ptr[i]);
        }
    }

    // scalar
    {
        onHost::enqueue(
            queue,
            exec,
            FrameSpec{numBlocks, blockExtent},
            KernelBundle{DeviceGlobalMemKernelScalar{}, dBuff.getMdSpan(), dataExtent});
        onHost::memcpy(queue, hBuff, dBuff);
        wait(queue);

        auto* ptr = onHost::data(hBuff);
        for(uint32_t i = 0u; i < dataExtent; ++i)
        {
            CHECK(43 == ptr[i]);
        }
    }

    // C array: element 0 of fixed_sized_array
    {
        onHost::enqueue(
            queue,
            exec,
            FrameSpec{numBlocks, blockExtent},
            KernelBundle{DeviceGlobalMemKernelCArray{}, dBuff.getMdSpan(), dataExtent});
        onHost::memcpy(queue, hBuff, dBuff);
        wait(queue);

        auto* ptr = onHost::data(hBuff);
        for(uint32_t i = 0u; i < dataExtent; ++i)
        {
            CHECK(44 == ptr[i]);
        }
    }

    // C array 2D: element [1][2] of fixed_sized_array2D
    {
        onHost::enqueue(
            queue,
            exec,
            FrameSpec{numBlocks, blockExtent},
            KernelBundle{DeviceGlobalMemKernelCArray2D{}, dBuff.getMdSpan(), dataExtent});
        onHost::memcpy(queue, hBuff, dBuff);
        wait(queue);

        auto* ptr = onHost::data(hBuff);
        for(uint32_t i = 0u; i < dataExtent; ++i)
        {
            CHECK(45 == ptr[i]);
        }
    }
}

#endif

0 comments on commit 337e5d3

Please sign in to comment.