From 8d81455d27c384d49d03d1544d44a4ea68bc79eb Mon Sep 17 00:00:00 2001 From: Rahulkumar Gayatri Date: Thu, 22 Aug 2024 15:54:05 -0700 Subject: [PATCH] OpenMPTarget: Kernel mode implementation for collapse clause trials. --- containers/src/Kokkos_Bitset.hpp | 2 +- ...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 45 +++++++++++++++++++ core/src/impl/Kokkos_HostThreadTeam.cpp | 6 +-- 3 files changed, 49 insertions(+), 4 deletions(-) diff --git a/containers/src/Kokkos_Bitset.hpp b/containers/src/Kokkos_Bitset.hpp index f50ab0a0f7e..409260f0218 100644 --- a/containers/src/Kokkos_Bitset.hpp +++ b/containers/src/Kokkos_Bitset.hpp @@ -271,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? Impl::bit_scan_forward(block) : Impl::int_log2(block)) + diff --git a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp index bd7d3eef5d7..f52fd0b747e 100644 --- a/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ b/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -246,6 +246,49 @@ class ParallelFor, const Index end_1 = policy.m_upper[1]; const Index end_2 = policy.m_upper[2]; + +#if defined(KOKKOS_IMPL_OPENMPTARGET_KERNEL_MODE) + const Index tot = (end_2-begin_2) * (end_1-begin_1) * (end_0-begin_0); + + auto tot_inner = (end_1 - begin_1) * (end_0 - begin_0); + auto tot_outer = (end_2 - begin_2) * tot_inner; + +//#pragma omp target teams distribute parallel for map(to : functor) + //for (auto iter2 = 0; iter2 < tot_outer; ++iter2) { +// if(omp_get_team_num() == 0 && omp_get_num_threads() == 0) +// printf("num_teams = %d, team_size = %d, thread)id = %d\n", omp_get_num_teams(), omp_get_num_threads(), omp_get_num_threads()); + + constexpr const int team_size = 1; + const int num_teams = (tot_outer + team_size - 1) / team_size * team_size; + + printf("tot_outer = %d\n", tot_outer); + + for (auto tmp = 0; tmp < tot_outer; ++tmp) { +#pragma omp target teams ompx_bare thread_limit(1) num_teams(1) map(to:functor) firstprivate(tot_outer) +{ + const Index blockDimx = ompx::block_dim(ompx::dim_x); + const Index blockIdx = ompx::block_id(ompx::dim_x); + const Index threadIdx = ompx::thread_id(ompx::dim_x); + + auto iter2 = tmp; //+ blockDimx * blockIdx + threadIdx; + if (iter2 < tot_outer) { + auto i2 = iter2 / tot_inner; + auto iter = iter2 % tot_inner; + + auto i1 = iter / (end_0 - begin_0); + auto i0 = iter % (end_0 - begin_0); + +// printf("(i0,i1,i2) = (%d,%d,%d)\n", i0,i1,i2); +// printf("blockIdx = %d, iter2 = %d\n", blockIdx, iter2); + + if constexpr (std::is_void::value) + functor(i0, i1, i2); + else + functor(typename Policy::work_tag(), i0, i1, i2); + } + } +} +#else #pragma omp target teams distribute parallel for collapse(3) map(to : functor) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { @@ -257,6 +300,8 @@ class ParallelFor, } } } +#endif + } template diff --git a/core/src/impl/Kokkos_HostThreadTeam.cpp b/core/src/impl/Kokkos_HostThreadTeam.cpp index 11bf701b57a..4bb7f639b73 100644 --- a/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -130,10 +130,10 @@ int HostThreadTeamData::organize_team(const int team_size) { // zombi team around (for example m_pool_size = 5 and team_size = 2 // (ii) if team_alloc > team_size then the last team might have less // threads than the others - m_team_rank = (team_base_rank + team_size <= m_pool_size) && + m_team_rank = (team_base_rank + team_size <= m_pool_size) && (team_alloc_rank < team_size) - ? team_alloc_rank - : -1; + ? team_alloc_rank + : -1; m_team_size = team_size; m_team_alloc = team_alloc_size; m_league_rank = league_rank;