From a3392e530e7bce70c05b640b18d8831af666223d Mon Sep 17 00:00:00 2001 From: mehmet yusufoglu Date: Mon, 22 Jul 2024 18:55:10 +0200 Subject: [PATCH] refactor a little --- benchmarks/cloverleaf/CMakeLists.txt | 4 + .../cloverleaf/src/cloverLeafKernels.hpp | 80 +++++- .../cloverleaf/src/cloverLeafMainTest.cpp | 260 ++++++++++++------ 3 files changed, 250 insertions(+), 94 deletions(-) diff --git a/benchmarks/cloverleaf/CMakeLists.txt b/benchmarks/cloverleaf/CMakeLists.txt index 24bbbdd7351..23531714e3d 100644 --- a/benchmarks/cloverleaf/CMakeLists.txt +++ b/benchmarks/cloverleaf/CMakeLists.txt @@ -19,6 +19,10 @@ if(NOT TARGET alpaka::alpaka) endif() endif() +if (alpaka_USE_MDSPAN STREQUAL "OFF") + message(STATUS "The conv2DWithMdspan example requires mdspan. Please set alpaka_USE_MDSPAN accordingly. Example disabled.") + return() +endif () set(_TARGET_NAME "cloverleaf") append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE) diff --git a/benchmarks/cloverleaf/src/cloverLeafKernels.hpp b/benchmarks/cloverleaf/src/cloverLeafKernels.hpp index 8e8819f82ca..bd7faba3d53 100644 --- a/benchmarks/cloverleaf/src/cloverLeafKernels.hpp +++ b/benchmarks/cloverleaf/src/cloverLeafKernels.hpp @@ -12,6 +12,62 @@ const Idx nx = 512; // Number of cells in x direction const Idx ny = 512; // Number of cells in y direction const Idx nz = 512; // Number of cells in z direction +// Kernel to update the halo regions +struct UpdateHaloKernel +{ + template + ALPAKA_FN_ACC auto operator()(TAcc const& acc, MdSpan density) const -> void + { + auto const i = alpaka::getIdx(acc)[0]; + auto const j = alpaka::getIdx(acc)[1]; + auto const k = alpaka::getIdx(acc)[2]; + + if(i < nx && j < ny && k < nz) + { + // Update halo cells for density (simplified example) + // Assuming a single layer halo, and periodic boundary conditions + if(i == 0) + density(i, j, k) = density(nx - 2, j, k); + if(i == nx - 1) + density(i, j, k) = density(1, j, k); + + if(j == 0) + density(i, j, k) = density(i, ny - 2, k); + if(j == ny - 1) + density(i, j, k) = density(i, 1, k); + + if(k == 0) + density(i, j, k) = density(i, j, nz - 2); + if(k == nz - 1) + density(i, j, k) = density(i, j, 1); + } + } +}; + +// Kernel to compute the ideal gas equation of state +struct IdealGasKernel +{ + template + ALPAKA_FN_ACC auto operator()( + TAcc const& acc, + MdSpan density, + MdSpan energy, + MdSpan pressure, + MdSpan soundspeed, + float gamma) const -> void + { + auto const i = alpaka::getIdx(acc)[0]; + auto const j = alpaka::getIdx(acc)[1]; + auto const k = alpaka::getIdx(acc)[2]; + + if(i < nx && j < ny && k < nz) + { + pressure(i, j, k) = (gamma - 1.0f) * density(i, j, k) * energy(i, j, k); + soundspeed(i, j, k) = sqrt(gamma * pressure(i, j, k) / density(i, j, k)); + } + } +}; + // Kernel to initialize the simulation variables struct InitializerKernel { @@ -25,7 +81,6 @@ struct InitializerKernel MdSpan velocityY, MdSpan velocityZ) const -> void { - // Get thread index, the center of filter-matrix is positioned to the item on this index. auto const i = alpaka::getIdx(acc)[0]; auto const j = alpaka::getIdx(acc)[1]; auto const k = alpaka::getIdx(acc)[2]; @@ -33,11 +88,15 @@ struct InitializerKernel if(i < nx && j < ny && k < nz) { density(i, j, k) = 1.0f; // Initial density - energy(i, j, k) = 1.0f; // Initial energy + energy(i, j, k) = 2.5f; // Initial energy pressure(i, j, k) = 1.0f; // Initial pressure - velocityX(i, j, k) = 0.0f; // Initial velocity in x direction - velocityY(i, j, k) = 0.0f; // Initial velocity in y direction - velocityZ(i, j, k) = 0.0f; // Initial velocity in z direction + velocityX(i, j, k) = 0.0f; // Initial velocity + + if(i < nx && j < ny && k < nz) + { + // Simple advection calculation (this is a simplified example) + density(i, j, k) += (velocityX(i, j, k) + velocityY(i, j, k) + velocityZ(i, j, k)) * 0.01f; + } } } }; @@ -130,6 +189,7 @@ struct AdvectionKernel } }; +// Kernel for the Lagrangian step struct LagrangianKernel { template @@ -162,6 +222,7 @@ struct LagrangianKernel } }; +// Kernel for viscosity calculations struct ViscosityKernel { template @@ -178,9 +239,8 @@ struct ViscosityKernel auto const j = alpaka::getIdx(acc)[1]; auto const k = alpaka::getIdx(acc)[2]; - if(i < nx && j < ny && k < nz) + if(i > 0 && i < nx - 1 && j > 0 && j < ny - 1 && k > 0 && k < nz - 1) { - // Calculate artificial viscosity (this is a simplified example) float gradVx = (velocityX(i + 1, j, k) - velocityX(i - 1, j, k)) * 0.5f; float gradVy = (velocityY(i, j + 1, k) - velocityY(i, j - 1, k)) * 0.5f; float gradVz = (velocityZ(i, j, k + 1) - velocityZ(i, j, k - 1)) * 0.5f; @@ -193,6 +253,7 @@ struct ViscosityKernel } }; +// Kernel to find the maximum velocity struct MaxVelocityKernel { template @@ -201,7 +262,7 @@ struct MaxVelocityKernel MdSpan velocityX, MdSpan velocityY, MdSpan velocityZ, - float* maxVelocity) const -> void + Data* maxVelocity) const -> void { auto const i = alpaka::getIdx(acc)[0]; auto const j = alpaka::getIdx(acc)[1]; @@ -215,7 +276,8 @@ struct MaxVelocityKernel float v = alpaka::math::sqrt(acc, (vx * vx + vy * vy + vz * vz)); // Atomic operation to find the maximum velocity - alpaka::atomicMax(acc, maxVelocity, v); + float val = alpaka::atomicMax(acc, maxVelocity, v); + maxVelocity[0] = val; } } }; diff --git a/benchmarks/cloverleaf/src/cloverLeafMainTest.cpp b/benchmarks/cloverleaf/src/cloverLeafMainTest.cpp index ee169e1015b..7f551c7e349 100644 --- a/benchmarks/cloverleaf/src/cloverLeafMainTest.cpp +++ b/benchmarks/cloverleaf/src/cloverLeafMainTest.cpp @@ -7,17 +7,21 @@ #include #include +#include +#include #include - - /** - * Cloverleaf benchmark - */ - - //! \brief The Function for testing cloverleaf kernels for given Acc type and data type. - //! \tparam TAcc the accelerator type - //! \tparam DataType The data type to differentiate single or double data type based tests. - template - void testKernels() +#include +#include + +/** + * Cloverleaf benchmark + */ + +//! \brief The Function for testing cloverleaf kernels for given Acc type and data type. +//! \tparam Acc the accelerator type +//! \tparam DataType The data type to differentiate single or double data type based tests. +template +void testKernels() { // Define device and queue using Queue = alpaka::Queue; @@ -27,47 +31,51 @@ auto const platformAcc = alpaka::Platform{}; auto const devAcc = alpaka::getDevByIdx(platformAcc, 0); + std::cout << "Using alpaka accelerator: " << alpaka::getAccName() << std::endl; + Queue queue(devAcc); + // Define the 3D extent (dimensions) alpaka::Vec const extent3D(static_cast(nx), static_cast(ny), static_cast(nz)); - - // Allocate host memory - auto hDensity = alpaka::allocBuf(devHost, extent3D); - auto hEnergy = alpaka::allocBuf(devHost, extent3D); - auto hPressure = alpaka::allocBuf(devHost, extent3D); - auto hVelocityX = alpaka::allocBuf(devHost, extent3D); - auto hVelocityY = alpaka::allocBuf(devHost, extent3D); - auto hVelocityZ = alpaka::allocBuf(devHost, extent3D); - - // Allocate device memory - auto dDensity = alpaka::allocBuf(devAcc, extent3D); - auto dEnergy = alpaka::allocBuf(devAcc, extent3D); - auto dPressure = alpaka::allocBuf(devAcc, extent3D); - auto dVelocityX = alpaka::allocBuf(devAcc, extent3D); - auto dVelocityY = alpaka::allocBuf(devAcc, extent3D); - auto dVelocityZ = alpaka::allocBuf(devAcc, extent3D); - - // Convert directly to mdspan + // Allocate host memory + auto hDensity = alpaka::allocBuf(devHost, extent3D); + auto hEnergy = alpaka::allocBuf(devHost, extent3D); + auto hPressure = alpaka::allocBuf(devHost, extent3D); + auto hVelocityX = alpaka::allocBuf(devHost, extent3D); + auto hVelocityY = alpaka::allocBuf(devHost, extent3D); + auto hVelocityZ = alpaka::allocBuf(devHost, extent3D); + auto hSoundspeed = alpaka::allocBuf(devHost, extent3D); // Additional buffer for soundspeed + + // Allocate device memory + auto dDensity = alpaka::allocBuf(devAcc, extent3D); + auto dEnergy = alpaka::allocBuf(devAcc, extent3D); + auto dPressure = alpaka::allocBuf(devAcc, extent3D); + auto dVelocityX = alpaka::allocBuf(devAcc, extent3D); + auto dVelocityY = alpaka::allocBuf(devAcc, extent3D); + auto dVelocityZ = alpaka::allocBuf(devAcc, extent3D); + auto dSoundspeed = alpaka::allocBuf(devAcc, extent3D); // Additional buffer for soundspeed + + // Convert directly to mdspan auto mdDensity = alpaka::experimental::getMdSpan(dDensity); auto mdEnergy = alpaka::experimental::getMdSpan(dEnergy); auto mdPressure = alpaka::experimental::getMdSpan(dPressure); auto mdVelocityX = alpaka::experimental::getMdSpan(dVelocityX); auto mdVelocityY = alpaka::experimental::getMdSpan(dVelocityY); auto mdVelocityZ = alpaka::experimental::getMdSpan(dVelocityZ); + auto mdSoundspeed = alpaka::experimental::getMdSpan(dSoundspeed); // Additional mdspan for soundspeed + auto hFluxDensity = alpaka::allocBuf(devHost, extent3D); + auto hFluxEnergy = alpaka::allocBuf(devHost, extent3D); + auto hFluxVelocityX = alpaka::allocBuf(devHost, extent3D); + auto hFluxVelocityY = alpaka::allocBuf(devHost, extent3D); + auto hFluxVelocityZ = alpaka::allocBuf(devHost, extent3D); - auto hFluxDensity = alpaka::allocBuf(devHost, extent3D); - auto hFluxEnergy = alpaka::allocBuf(devHost, extent3D); - auto hFluxVelocityX = alpaka::allocBuf(devHost, extent3D); - auto hFluxVelocityY = alpaka::allocBuf(devHost, extent3D); - auto hFluxVelocityZ = alpaka::allocBuf(devHost, extent3D); - - auto dFluxDensity = alpaka::allocBuf(devAcc, extent3D); - auto dFluxEnergy = alpaka::allocBuf(devAcc, extent3D); - auto dFluxVelocityX = alpaka::allocBuf(devAcc, extent3D); - auto dFluxVelocityY = alpaka::allocBuf(devAcc, extent3D); - auto dFluxVelocityZ = alpaka::allocBuf(devAcc, extent3D); + auto dFluxDensity = alpaka::allocBuf(devAcc, extent3D); + auto dFluxEnergy = alpaka::allocBuf(devAcc, extent3D); + auto dFluxVelocityX = alpaka::allocBuf(devAcc, extent3D); + auto dFluxVelocityY = alpaka::allocBuf(devAcc, extent3D); + auto dFluxVelocityZ = alpaka::allocBuf(devAcc, extent3D); auto mdFluxDensity = alpaka::experimental::getMdSpan(dFluxDensity); auto mdFluxEnergy = alpaka::experimental::getMdSpan(dFluxEnergy); @@ -75,26 +83,31 @@ auto mdFluxVelocityY = alpaka::experimental::getMdSpan(dFluxVelocityY); auto mdFluxVelocityZ = alpaka::experimental::getMdSpan(dFluxVelocityZ); - // Allocate host memory for viscosity - auto hViscosity = alpaka::allocBuf(devHost, extent3D); + // Allocate host memory for viscosity + auto hViscosity = alpaka::allocBuf(devHost, extent3D); - // Allocate device memory for viscosity - auto dViscosity = alpaka::allocBuf(devAcc, extent3D); + // Allocate device memory for viscosity + auto dViscosity = alpaka::allocBuf(devAcc, extent3D); - // Create mdspan view for viscosity using alpaka::experimental::getMdSpan + // Create mdspan view for viscosity using alpaka::experimental::getMdSpan auto mdViscosity = alpaka::experimental::getMdSpan(dViscosity); - // Define work divisions + // Define work divisions const alpaka::Vec size{nx, ny, nz}; - const alpaka::Vec blockSize{8, 8, 8}; - auto workDiv = alpaka::WorkDivMembers{size, blockSize, alpaka::Vec::all(1)}; + // Define work divisions - // Simulation parameters + auto workDiv = alpaka::getValidWorkDiv( + devAcc, + size, + alpaka::Vec::ones(), + false, + alpaka::GridBlockExtentSubDivRestrictions::Unrestricted); + // Simulation parameters float totalTime = 0.0f; // Total simulation time float endTime = 1.0f; // End time of the simulation - // Exec the initialization kernel + // Exec the initialization kernel alpaka::exec( queue, workDiv, @@ -105,21 +118,25 @@ mdVelocityX, mdVelocityY, mdVelocityZ); - - // Wait for the kernel to finish + // std::cout << "InitializerKernel finished" << std::endl; + // Wait for the kernel to finish alpaka::wait(queue); - // Variables to calculate the dt at each step + // Variables to calculate the dt at each step float dx = 0.01f; float dy = 0.01f; float dz = 0.01f; float cMax = 0.5f; float dt = 0.01f; - // initial max_velocity to calculate time step - float maxVelocity = 1.0f; // This should be computed from your velocity fields - float dMaxVelocity = 1.0f; + alpaka::Vec, Idx> const extent1D = alpaka::Vec, Idx>::all(1u); + auto resultGpuHost = alpaka::allocBuf(devHost, extent1D); + resultGpuHost[0] = static_cast(1.0f); + auto outputDeviceMemory = alpaka::allocBuf(devAcc, extent1D); + alpaka::memcpy(queue, outputDeviceMemory, resultGpuHost); + std::cout << "Workdiv:" << workDiv << std::endl; + std::cout << "Running cloverleaf loop." << std::endl; while(totalTime < endTime) { // Calculate the time step (dt), max velocity is needed. @@ -131,14 +148,42 @@ mdVelocityX, mdVelocityY, mdVelocityZ, - &dMaxVelocity); + std::data(outputDeviceMemory)); + + // Copy result from device memory to host + alpaka::memcpy(queue, resultGpuHost, outputDeviceMemory, extent1D); + + // std::cout << "MaxVelocityKernel finished" << std::endl; alpaka::wait(queue); - maxVelocity = dMaxVelocity; + // Now use maxVelocity to compute the time step - dt = calculateTimeStep(dx, dy, dz, maxVelocity, cMax); + dt = calculateTimeStep(dx, dy, dz, resultGpuHost[0], cMax); + + // Exec the halo update kernel + alpaka::exec(queue, workDiv, UpdateHaloKernel{}, mdDensity); - // Exec the EOS kernel + // Wait for the halo update kernel to finish + alpaka::wait(queue); + // std::cout << "Halo kernel finished" << std::endl; + + // Exec the ideal gas kernel + alpaka::exec( + queue, + workDiv, + IdealGasKernel{}, + mdDensity, + mdEnergy, + mdPressure, + mdSoundspeed, + 1.4f); // gamma + + // std::cout << "IdealGasKernel finished" << std::endl; + + // Wait for the ideal gas kernel to finish + alpaka::wait(queue); + + // Exec the EOS kernel float gamma = 1.4f; // Specific heat ratio alpaka::exec( queue, @@ -152,10 +197,12 @@ mdVelocityZ, gamma); - // Wait for the EOS kernel to finish + // std::cout << "EOSKernel finished" << std::endl; + + // Wait for the EOS kernel to finish alpaka::wait(queue); - // Exec the Flux kernel + // Exec the Flux kernel alpaka::exec( queue, workDiv, @@ -172,20 +219,20 @@ mdFluxVelocityY, mdFluxVelocityZ); - // Wait for the Flux kernel to finish + // std::cout << "FluxKernel finished" << std::endl; + + // Wait for the Flux kernel to finish alpaka::wait(queue); - // Exec the advection kernel (fourth step) - alpaka::exec( - queue, - workDiv, - AdvectionKernel{}, - mdDensity, - mdVelocityX, - mdVelocityY, - mdVelocityZ); + // Exec the advection kernel (fourth step) + alpaka::exec(queue, workDiv, AdvectionKernel{}, mdDensity, mdVelocityX, mdVelocityY, mdVelocityZ); - // Copy data back to host + // std::cout << "AdvectionKernel finished" << std::endl; + + // Wait for the Advection kernel to finish + alpaka::wait(queue); + + // Copy data back to host for verification (optional) alpaka::memcpy(queue, hDensity, dDensity); alpaka::memcpy(queue, hEnergy, dEnergy); alpaka::memcpy(queue, hPressure, dPressure); @@ -193,15 +240,17 @@ alpaka::memcpy(queue, hVelocityY, dVelocityY); alpaka::memcpy(queue, hVelocityZ, dVelocityZ); - // Flux densities + // Flux densities alpaka::memcpy(queue, hFluxDensity, dFluxDensity); alpaka::memcpy(queue, hFluxEnergy, dFluxEnergy); alpaka::memcpy(queue, hFluxVelocityX, dFluxVelocityX); alpaka::memcpy(queue, hFluxVelocityY, dFluxVelocityY); - alpaka::memcpy(queue + alpaka::memcpy(queue, hFluxVelocityZ, dFluxVelocityZ); - , hFluxVelocityZ, dFluxVelocityZ); + // Wait for data transfer to complete + alpaka::wait(queue); + // Exec the Lagrangian kernel alpaka::exec( queue, workDiv, @@ -217,10 +266,12 @@ mdFluxVelocityY, mdFluxVelocityZ); - // Wait for the Lagrangian kernel to finish + // std::cout << "LagrangianKernel finished" << std::endl; + + // Wait for the Lagrangian kernel to finish alpaka::wait(queue); - // Exec the Viscosity kernel + // Exec the Viscosity kernel alpaka::exec( queue, workDiv, @@ -232,13 +283,16 @@ mdPressure, mdViscosity); - // Update the simulation time - totalTime += dt; + // std::cout << "ViscosityKernel finished" << std::endl; - // Optionally: Adjust dt based on CFL condition or other criteria - } + // Wait for the Viscosity kernel to finish + alpaka::wait(queue); - // Copy data back to host + // Update the simulation time + totalTime += dt; + } + std::cout << "Copying results back." << std::endl; + // Copy final data back to host for verification alpaka::memcpy(queue, hDensity, dDensity); alpaka::memcpy(queue, hEnergy, dEnergy); alpaka::memcpy(queue, hPressure, dPressure); @@ -252,11 +306,47 @@ alpaka::memcpy(queue, hFluxVelocityZ, dFluxVelocityZ); alpaka::memcpy(queue, hViscosity, dViscosity); - // Wait for the Viscosity kernel to finish + std::cout << "Data copy to host finished" << std::endl; + // Wait for all data transfers to complete alpaka::wait(queue); - // Copy viscosity data back to host - alpaka::memcpy(queue, hViscosity, dViscosity); + // Verification (check for NaNs and Infs) + auto mdHostDensity = alpaka::experimental::getMdSpan(hDensity); + auto mdHostEnergy = alpaka::experimental::getMdSpan(hEnergy); + auto mdHostPressure = alpaka::experimental::getMdSpan(hPressure); + auto mdHostVelocityX = alpaka::experimental::getMdSpan(hVelocityX); + auto mdHostVelocityY = alpaka::experimental::getMdSpan(hVelocityY); + auto mdHostVelocityZ = alpaka::experimental::getMdSpan(hVelocityZ); + + bool success = true; + for(Idx i = 0; i < nx; ++i) + { + for(Idx j = 0; j < ny; ++j) + { + for(Idx k = 0; k < nz; ++k) + { + if(std::isnan(mdHostDensity(i, j, k)) || std::isinf(mdHostDensity(i, j, k)) + || std::isnan(mdHostEnergy(i, j, k)) || std::isinf(mdHostEnergy(i, j, k)) + || std::isnan(mdHostPressure(i, j, k)) || std::isinf(mdHostPressure(i, j, k)) + || std::isnan(mdHostVelocityX(i, j, k)) || std::isinf(mdHostVelocityX(i, j, k)) + || std::isnan(mdHostVelocityY(i, j, k)) || std::isinf(mdHostVelocityY(i, j, k)) + || std::isnan(mdHostVelocityZ(i, j, k)) || std::isinf(mdHostVelocityZ(i, j, k))) + { + success = false; + break; + } + } + } + } + + if(success) + { + std::cout << "Simulation succeeded!" << std::endl; + } + else + { + std::cout << "Simulation failed due to NaN or Inf values!" << std::endl; + } } using TestAccs3D = alpaka::test::EnabledAccs, std::uint32_t>;