From b3f70217da9e2b91ff9515d284fd42f3aae121e4 Mon Sep 17 00:00:00 2001 From: Michael Varvarin <55709728+MichaelVarvarin@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:17:01 +0200 Subject: [PATCH] Rewrite example to use executeForEachAccTag --- .../src/helloWorldGridSync.cpp | 71 ++++++++++++------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/example/helloWorldGridSync/src/helloWorldGridSync.cpp b/example/helloWorldGridSync/src/helloWorldGridSync.cpp index ef2afc48a52..3944da8a876 100644 --- a/example/helloWorldGridSync/src/helloWorldGridSync.cpp +++ b/example/helloWorldGridSync/src/helloWorldGridSync.cpp @@ -3,6 +3,7 @@ */ #include +#include #include #include @@ -51,14 +52,22 @@ struct HelloWorldKernel } }; -auto main() -> int +// In standard projects, you typically do not execute the code with any available accelerator. +// Instead, a single accelerator is selected once from the active accelerators and the kernels are executed with the +// selected accelerator only. If you use the example as the starting point for your project, you can rename the +// example() function to main() and move the accelerator tag to the function body. 
+template +auto example(TAccTag const&) -> int { + // Define the accelerator + // For simplicity, this example always uses 1-dimensional indexing and the index type size_t + using Acc = alpaka::TagToAcc, std::size_t>; + std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; + // Define dimensionality and type of indices to be used in kernels using Dim = alpaka::DimInt<1>; - using Idx = uint32_t; + using Idx = size_t; - // Define alpaka accelerator type, which corresponds to the underlying programming model - using Acc = alpaka::AccGpuSyclIntel; // Select the first device available on a system, for the chosen accelerator auto const platformAcc = alpaka::Platform{}; @@ -71,45 +80,59 @@ auto main() -> int // Define kernel execution configuration of blocks, // threads per block, and elements per thread. - Idx blocksPerGrid = 10; + Idx blocksPerGrid = 1000; Idx threadsPerBlock = 1; - Idx threadsPerBlock2 = 1024; Idx elementsPerThread = 1; using WorkDiv = alpaka::WorkDivMembers; - auto workDiv = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread}; - auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock2, elementsPerThread}; // Allocate memory on the device. alpaka::Vec bufferExtent{blocksPerGrid * threadsPerBlock}; auto deviceMemory = alpaka::allocBuf(devAcc, bufferExtent); - alpaka::Vec bufferExtent2{blocksPerGrid * threadsPerBlock2}; - auto deviceMemory2 = alpaka::allocBuf(devAcc, bufferExtent2); + // Instantiate the kernel object. 
HelloWorldKernel helloWorldKernel; - // int maxBlocks = alpaka::getMaxActiveBlocks( - // devAcc, - // helloWorldKernel, - // threadsPerBlock, - // elementsPerThread, - // getPtrNative(deviceMemory)); - // std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl; + // Query the maximum number of blocks allowed for the device + int maxBlocks = alpaka::getMaxActiveBlocks( + devAcc, + helloWorldKernel, + threadsPerBlock, + elementsPerThread, + getPtrNative(deviceMemory)); + std::cout << "Maximum blocks for the kernel: " << maxBlocks << std::endl; + + // Create a workdiv according to the limitations + blocksPerGrid = std::min(static_cast(maxBlocks), blocksPerGrid); + auto workDiv2 = WorkDiv{blocksPerGrid, threadsPerBlock, elementsPerThread}; + alpaka::Vec bufferExtent2{blocksPerGrid * threadsPerBlock}; + auto deviceMemory2 = alpaka::allocBuf(devAcc, bufferExtent2); // Create a task to run the kernel. // Note the cooperative kernel specification. // Only cooperative kernels can perform grid synchronization. - auto taskRunKernel = alpaka::createTaskKernel(workDiv, helloWorldKernel, getPtrNative(deviceMemory)); - - auto taskRunKernel2 + auto taskRunKernel = alpaka::createTaskCooperativeKernel(workDiv2, helloWorldKernel, getPtrNative(deviceMemory2)); // Enqueue the kernel execution task.. alpaka::enqueue(queue, taskRunKernel); - alpaka::wait(queue); - printf("launching kernel 2\n"); - alpaka::enqueue(queue, taskRunKernel2); - return 0; + return EXIT_SUCCESS; +} + +auto main() -> int +{ + // Execute the example once for each enabled accelerator. + // If you would like to execute it for a single accelerator only you can use the following code. 
+ // \code{.cpp} + // auto tag = alpaka::TagCpuSerial{}; + // return example(tag); + // \endcode + // + // valid tags: + // TagCpuSerial, TagGpuHipRt, TagGpuCudaRt, TagCpuOmp2Blocks, TagCpuTbbBlocks, + // TagCpuOmp2Threads, TagCpuSycl, TagCpuThreads, + // TagFpgaSyclIntel, TagGenericSycl, TagGpuSyclIntel + return alpaka::executeForEachAccTag([=](auto const& tag) { return example(tag); }); }