Skip to content

Commit

Permalink
Rewrite example to use executeForEachAccTag
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelVarvarin committed Nov 12, 2024
1 parent 23d6dfc commit b3f7021
Showing 1 changed file with 47 additions and 24 deletions.
71 changes: 47 additions & 24 deletions example/helloWorldGridSync/src/helloWorldGridSync.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExecuteForEachAccTag.hpp>

#include <cstdint>
#include <iostream>
Expand Down Expand Up @@ -51,14 +52,22 @@ struct HelloWorldKernel
}
};

auto main() -> int
// In standard projects, you typically do not execute the code with any available accelerator.
// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the
// selected accelerator only. If you use the example as the starting point for your project, you can rename the
// example() function to main() and move the accelerator tag to the function body.
template<typename TAccTag>
auto example(TAccTag const&) -> int
{
// Define the accelerator
// For simplicity this example always uses one-dimensional indexing and the index type size_t
using Acc = alpaka::TagToAcc<TAccTag, alpaka::DimInt<1>, std::size_t>;
std::cout << "Using alpaka accelerator: " << alpaka::getAccName<Acc>() << std::endl;

// Define dimensionality and type of indices to be used in kernels
using Dim = alpaka::DimInt<1>;
using Idx = uint32_t;
using Idx = size_t;

// Define alpaka accelerator type, which corresponds to the underlying programming model
using Acc = alpaka::AccGpuSyclIntel<Dim, Idx>;

// Select the first device available on a system, for the chosen accelerator
auto const platformAcc = alpaka::Platform<Acc>{};
Expand All @@ -71,45 +80,59 @@ auto main() -> int

// Define kernel execution configuration of blocks,
// threads per block, and elements per thread.
Idx blocksPerGrid = 10;
Idx blocksPerGrid = 1000;
Idx threadsPerBlock = 1;
Idx threadsPerBlock2 = 1024;
Idx elementsPerThread = 1;

using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;
auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock2, elementsPerThread};

// Allocate memory on the device.
alpaka::Vec<Dim, Idx> bufferExtent{blocksPerGrid * threadsPerBlock};
auto deviceMemory = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent);

alpaka::Vec<Dim, Idx> bufferExtent2{blocksPerGrid * threadsPerBlock2};
auto deviceMemory2 = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent2);

// Instantiate the kernel object.
HelloWorldKernel helloWorldKernel;

// int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
// devAcc,
// helloWorldKernel,
// threadsPerBlock,
// elementsPerThread,
// getPtrNative(deviceMemory));
// std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;
// Query the maximum number of blocks allowed for the device
int maxBlocks = alpaka::getMaxActiveBlocks<Acc>(
devAcc,
helloWorldKernel,
threadsPerBlock,
elementsPerThread,
getPtrNative(deviceMemory));
std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl;

// Create a workdiv according to the limitations
blocksPerGrid = std::min(static_cast<Idx>(maxBlocks), blocksPerGrid);
auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread};
alpaka::Vec<Dim, Idx> bufferExtent2{blocksPerGrid * threadsPerBlock};
auto deviceMemory2 = alpaka::allocBuf<uint32_t, Idx>(devAcc, bufferExtent2);

// Create a task to run the kernel.
// Note the cooperative kernel specification.
// Only cooperative kernels can perform grid synchronization.
auto taskRunKernel = alpaka::createTaskKernel<Acc>(workDiv, helloWorldKernel, getPtrNative(deviceMemory));

auto taskRunKernel2
auto taskRunKernel
= alpaka::createTaskCooperativeKernel<Acc>(workDiv2, helloWorldKernel, getPtrNative(deviceMemory2));

// Enqueue the kernel execution task.
alpaka::enqueue(queue, taskRunKernel);
alpaka::wait(queue);
printf("launching kernel 2\n");
alpaka::enqueue(queue, taskRunKernel2);

return 0;
return EXIT_SUCCESS;
}

auto main() -> int
{
    // Execute the example once for each enabled accelerator. The int returned by
    // alpaka::executeForEachAccTag is used as the exit code of the program.
    //
    // If you would like to execute it for a single accelerator only, construct the
    // corresponding tag object yourself and pass it to example():
    //  \code{.cpp}
    //  auto tag = alpaka::TagCpuSerial{};
    //  return example(tag);
    //  \endcode
    //
    // valid tags (all in namespace alpaka):
    //   TagCpuSerial, TagCpuThreads, TagCpuTbbBlocks, TagCpuOmp2Blocks, TagCpuOmp2Threads,
    //   TagGpuCudaRt, TagGpuHipRt,
    //   TagCpuSycl, TagFpgaSyclIntel, TagGpuSyclIntel, TagGenericSycl
    //
    // The lambda needs no captures: example() is a free function template and the tag is
    // supplied as the argument of each invocation.
    return alpaka::executeForEachAccTag([](auto const& tag) { return example(tag); });
}

0 comments on commit b3f7021

Please sign in to comment.