From 06bf5e99ee993ffbd9262567c6275dd709ba1e29 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 6 Jun 2025 14:08:52 -0700 Subject: [PATCH 01/10] intial commit for high-dim thread teams --- examples/CMakeLists.txt | 4 + examples/launch-high-dim-threads.cpp | 194 ++++++++++++++++++++ include/RAJA/pattern/launch/launch_core.hpp | 33 ++-- include/RAJA/policy/hip/launch.hpp | 33 +++- include/RAJA/policy/openmp/launch.hpp | 8 +- include/RAJA/policy/sequential/launch.hpp | 8 +- scripts/lc-builds/toss4_amdclang.sh | 2 +- 7 files changed, 252 insertions(+), 30 deletions(-) create mode 100644 examples/launch-high-dim-threads.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 654f7a45ae..235e71be25 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -67,6 +67,10 @@ raja_add_executable( NAME pi-reduce_vs_atomic SOURCES pi-reduce_vs_atomic.cpp) +raja_add_executable( + NAME launch-high-dim-threads + SOURCES launch-high-dim-threads.cpp) + raja_add_executable( NAME raja-launch SOURCES raja-launch.cpp) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp new file mode 100644 index 0000000000..91ffd02b5c --- /dev/null +++ b/examples/launch-high-dim-threads.cpp @@ -0,0 +1,194 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + + +/* + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory + * + * Launch introduces hierarchical parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_launch_t +#else + RAJA::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using teams_x = RAJA::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_parallel_for_exec +#else + RAJA::seq_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif + >; +/* + * Define thread policies. 
+ * Up to 3 dimension are supported: x,y,z + */ +using threads_x = RAJA::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // Resource object for host + camp::resources::Host host_res; + + // Resource objects for CUDA or HIP +#if defined(RAJA_ENABLE_CUDA) + camp::resources::Cuda device_res; +#endif + +#if defined(RAJA_ENABLE_HIP) + camp::resources::Hip device_res; +#endif + + std::cout << "\n Running RAJA-Launch examples...\n"; + int num_of_backends = 1; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + num_of_backends++; +#endif + + // RAJA teams may switch between host and device policies at run time. + // The loop below will execute through the available backends. + + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + + auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place; + + // Allocate memory for either host or device + int N_tri = 5; + + int* Ddat = nullptr; + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + Ddat = host_res.allocate(N_tri * N_tri); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + Ddat = device_res.allocate(N_tri * N_tri); + } +#endif + + /* + * RAJA::launch just starts a "kernel" and doesn't provide any looping. + * + * The first argument determines which policy should be executed, + * + * The second argument is the number of teams+threads needed for each of the + * policies. + * + * Third argument is the lambda. + * + * The lambda takes a "resource" object, which has the teams+threads + * and is used to perform thread synchronizations within a team. + */ + + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ + std::cout << "\n Running upper triangular pattern example on the host...\n"; + } else { + std::cout << "\n Running upper triangular pattern example on the device...\n"; + } + + //Example of high-dimensional thread-team + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); + + RAJA::View> D(Ddat, N_tri, N_tri); + + RAJA::launch + (select_cpu_or_gpu, + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + printf("in kernel \n"); + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + s_A[c] = r; + }); // loop c + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + + }); // loop r + + }); // outer lambda + + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + host_res.deallocate(Ddat); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + device_res.deallocate(Ddat); + } +#endif + + } // Execution places loop + + +} // Main diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index bfce94057c..92b0deea4a 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -107,10 +107,11 @@ struct Teams constexpr Teams(int i, int j, int k) : value {i, j, k} {} }; +template struct Threads { - int value[3]; - + int value[DIM]; +#if 1 RAJA_INLINE RAJA_HOST_DEVICE @@ -130,6 +131,10 @@ struct Threads RAJA_HOST_DEVICE constexpr Threads(int 
i, int j, int k) : value {i, j, k} {} + //#else + template + constexpr Threads(Args...args) : value{args...} {}; +#endif }; struct Lanes @@ -147,18 +152,20 @@ struct Lanes constexpr Lanes(int i) : value(i) {} }; +template struct LaunchParams { public: Teams teams; - Threads threads; + Threads threads; size_t shared_mem_size; RAJA_INLINE LaunchParams() = default; + //template LaunchParams(Teams in_teams, - Threads in_threads, + Threads in_threads, size_t in_shared_mem_size = 0) : teams(in_teams), threads(in_threads), @@ -170,10 +177,12 @@ struct LaunchParams RAJA_INLINE Teams apply(Teams const& a) { return (teams = a); } + /* RAJA_HOST_DEVICE RAJA_INLINE Threads apply(Threads const& a) { return (threads = a); } + */ }; class LaunchContext @@ -246,8 +255,8 @@ struct LaunchExecute; // Duplicate of code above on account that we need to support the case in which // a kernel_name is not given -template -void launch(LaunchParams const& launch_params, +template +void launch(LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { // Get reducers @@ -288,17 +297,17 @@ void launch(LaunchParams const& launch_params, //================================================= // Run time based policy launch //================================================= -template -void launch(ExecPlace place, LaunchParams const& params, BODY const& body) +template +void launch(ExecPlace place, LaunchParams const& params, BODY const& body) { launch(place, params, body); } // Run-time API for new reducer interface with support of the case without a new // kernel name -template +template void launch(ExecPlace place, - const LaunchParams& launch_params, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) // BODY const &body) { @@ -367,10 +376,10 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) // Duplicate of API above on account that we need to handle the case that a // kernel name is not provided -template +template resources::EventProxy launch( RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index ab27e9b456..0062ce50cc 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -70,13 +70,13 @@ struct LaunchExecute< RAJA::policy::hip::hip_launch_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams & params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -86,6 +86,18 @@ struct LaunchExecute< resources::Hip hip_res = res.get(); + + for(int k=0; k<3;++k) { + std::cout<<"params.threads.value "<< + params.threads.value[k]<(res); } // Version with explicit reduction parameters.. 
- template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -255,13 +270,13 @@ template struct LaunchExecute> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -318,14 +333,14 @@ struct LaunchExecute> } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2092c87bb3..2d8499145f 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,13 +28,13 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - LaunchParams const& params, + LaunchParams const& params, BODY const& body, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -55,14 +55,14 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, BODY const& body, ReduceParams& f_params) { diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..f5642274b0 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -40,13 +40,13 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - LaunchParams const& params, + LaunchParams const& params, BODY const& body, ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) { @@ -64,14 +64,14 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, BODY const& body, ReduceParams& launch_reducers) { diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index cfb47678bc..43e00de64f 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -68,7 +68,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ 
-DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++14 \ + -DBLT_CXX_STD=c++17 \ -C "../host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake" \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ From 143a82151b036d906822fe494eb7292cfe163a79 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 17 Jun 2025 14:21:57 -0700 Subject: [PATCH 02/10] build fixes for high dimensional threading --- examples/launch-high-dim-threads.cpp | 2 ++ include/RAJA/pattern/launch/launch_core.hpp | 37 +++++++-------------- include/RAJA/policy/hip/launch.hpp | 31 +++++++++-------- include/RAJA/policy/openmp/launch.hpp | 4 +-- include/RAJA/policy/sequential/launch.hpp | 4 +-- 5 files changed, 35 insertions(+), 43 deletions(-) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index 91ffd02b5c..fd804e9239 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -147,6 +147,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running upper triangular pattern example on the device...\n"; } + RAJA::LaunchParams(); + //Example of high-dimensional thread-team RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 92b0deea4a..841d9eff75 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -107,7 +107,7 @@ struct Teams constexpr Teams(int i, int j, int k) : value {i, j, k} {} }; -template +template struct Threads { int value[DIM]; @@ -131,28 +131,14 @@ struct Threads RAJA_HOST_DEVICE constexpr Threads(int i, int j, int k) : value {i, j, k} {} + //#else - template - constexpr Threads(Args...args) : value{args...} {}; + template + constexpr Threads(Args... args) : value {args...} {}; #endif }; -struct Lanes -{ - int value; - - RAJA_INLINE - - RAJA_HOST_DEVICE - constexpr Lanes() : value(0) {} - - RAJA_INLINE - - RAJA_HOST_DEVICE - constexpr Lanes(int i) : value(i) {} -}; - -template +template struct LaunchParams { public: @@ -163,7 +149,6 @@ struct LaunchParams RAJA_INLINE LaunchParams() = default; - //template LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0) @@ -255,7 +240,7 @@ struct LaunchExecute; // Duplicate of code above on account that we need to support the case in which // a kernel_name is not given -template +template void launch(LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { @@ -297,15 +282,17 @@ void launch(LaunchParams const& launch_params, //================================================= // Run time based policy launch //================================================= -template -void launch(ExecPlace place, LaunchParams const& params, BODY const& body) +template +void launch(ExecPlace place, + LaunchParams const& params, + BODY const& body) { launch(place, params, body); } // Run-time API for new reducer interface with support of the case without a new // kernel name -template +template void launch(ExecPlace place, LaunchParams const& launch_params, ReduceParams&&... 
rest_of_launch_args) @@ -376,7 +363,7 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) // Duplicate of API above on account that we need to handle the case that a // kernel name is not provided -template +template resources::EventProxy launch( RAJA::resources::Resource res, LaunchParams const& launch_params, diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 0062ce50cc..ba1cbfd267 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -70,13 +70,13 @@ struct LaunchExecute< RAJA::policy::hip::hip_launch_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams & params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -87,14 +87,15 @@ struct LaunchExecute< resources::Hip hip_res = res.get(); - for(int k=0; k<3;++k) { - std::cout<<"params.threads.value "<< - params.threads.value[k]<(res); } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -270,7 +273,7 @@ template struct LaunchExecute> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -333,7 +336,7 @@ struct LaunchExecute> } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2d8499145f..a7cebc31c9 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,7 +28,7 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -55,7 +55,7 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index f5642274b0..1026e78af2 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -40,7 +40,7 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -64,7 +64,7 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, From a26063497c8c3aecc10c399bd37e9b06850f227a Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 3 Jul 2025 11:46:26 -0700 Subject: [PATCH 03/10] build fixes for cuda --- examples/launch-high-dim-threads.cpp | 2 +- include/RAJA/policy/cuda/launch.hpp | 32 +++++++++++++++++++++------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index fd804e9239..95af2d1d2a 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -147,7 +147,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running upper triangular 
pattern example on the device...\n"; } - RAJA::LaunchParams(); + RAJA::LaunchParams; //Example of high-dimensional thread-team RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index f0b7d0be98..a7dc2a43ae 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -72,13 +72,13 @@ struct LaunchExecute< named_usage::unspecified>> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -88,6 +88,18 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); + for (int k = 0; k < 3; ++k) + { + std::cout << "params.threads.value " << params.threads.value[k] + << std::endl; + } + + for (int k = 0; k < 3; ++k) + { + std::cout << "params.teams.value " << params.teams.value[k] << std::endl; + } + + // // Compute the number of blocks and threads // @@ -129,19 +141,23 @@ struct LaunchExecute< RAJA_FT_END; } + else + { + std::cout << "did not launch kernel " << std::endl; + } return resources::EventProxy(res); } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -260,13 +276,13 @@ struct LaunchExecute< RAJA::policy::cuda::cuda_launch_explicit_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -323,14 +339,14 @@ struct LaunchExecute< } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { From ae7c6cec33e2eba8fd5511b473bc41ef45127c9c Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 7 Jul 2025 13:22:00 -0700 Subject: [PATCH 04/10] intial 4D contraction example --- examples/4D_tensor_contraction.cpp | 121 ++++++++++++++++++++ examples/CMakeLists.txt | 4 + examples/launch-high-dim-threads.cpp | 4 +- include/RAJA/pattern/launch/launch_core.hpp | 5 +- include/RAJA/policy/cuda/launch.hpp | 44 +++++-- 5 files changed, 166 insertions(+), 12 deletions(-) create mode 100644 examples/4D_tensor_contraction.cpp diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp new file mode 100644 index 0000000000..918303dfc8 --- /dev/null +++ b/examples/4D_tensor_contraction.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. 
See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" +#include "memoryManager.hpp" + + +/* + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory + * + * Launch introduces hierarchical parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::LaunchPolicy; + +using teams_x = RAJA::LoopPolicy; + +using threads_x = RAJA::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + constexpr int TotalMats = 100; + + constexpr int I = 2; + constexpr int J = 2; + constexpr int L = 2; + constexpr int K = 2; + constexpr int M = 2; + constexpr int N = 2; + constexpr int O = 2; + + double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); + double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); + double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + + auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); + auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); + auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, N, O); + + // Initialize A and B with some values + for(int mat = 0; mat < TotalMats; ++mat) { + + for (int i = 0; i < I; i++) { + for (int j = 0; j < J; j++) { + for (int k = 0; k < K; k++) { + for (int l = 0; l < L; l++) { + A(mat, i, j, k, l) = 1.0; + } + } + } + } + + for (int l = 0; l < L; l++) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + for (int o = 0; o < O; o++) { + B(mat, l, m, n, o) = 1.0; + } + } + } + } + + } + + + +#if 0 + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + printf("in kernel \n"); + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + s_A[c] = r; + }); // loop c + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + + }); // loop r + + }); // outer lambda +#endif + +} // Main diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 235e71be25..0c4dcd20ca 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -71,6 +71,10 @@ raja_add_executable( NAME launch-high-dim-threads SOURCES launch-high-dim-threads.cpp) +raja_add_executable( + NAME 4D_tensor_contraction + SOURCES 4D_tensor_contraction.cpp) + raja_add_executable( NAME raja-launch SOURCES raja-launch.cpp) diff --git 
a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index 95af2d1d2a..abedd1b621 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -156,8 +156,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (select_cpu_or_gpu, - //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), - RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { printf("in kernel \n"); RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 841d9eff75..af3d66560c 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -110,7 +110,8 @@ struct Teams template struct Threads { - int value[DIM]; + std::array value; + //int value[DIM]; #if 1 RAJA_INLINE @@ -134,7 +135,7 @@ struct Threads //#else template - constexpr Threads(Args... args) : value {args...} {}; + constexpr Threads(Args... args) : value {static_cast(args)...} {}; #endif }; diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index a7dc2a43ae..cd6479230f 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,6 +28,22 @@ namespace RAJA { +//internal helper function +namespace detail +{ + +template +constexpr T multiply_impl(const std::array& arr, std::index_sequence) { + return (arr[I] * ...); +} + +template +constexpr T multiplyArray(const std::array& arr) { + return multiply_impl(arr, std::make_index_sequence{}); +} + +} + template __global__ void launch_global_fcn(BODY body_in) { @@ -88,15 +104,9 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); - for (int k = 0; k < 3; ++k) - { - std::cout << "params.threads.value " << params.threads.value[k] - << std::endl; - } - for (int k = 0; k < 3; ++k) - { - std::cout << "params.teams.value " << params.teams.value[k] << std::endl; + if(params.threads.value.size() > 3) { + std::cout<<"threads container is larger than 3 : "<(params.teams.value[1]), static_cast(params.teams.value[2])}; + cuda_dim_t blockSize; + + if(params.threads.value.size() < 4) + { + blockSize = cuda_dim_t{static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } else { + + int total_threads = detail::multiplyArray(params.threads.value); + std::cout<<"Total threads"<(detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; + } + + /* cuda_dim_t blockSize { static_cast(params.threads.value[0]), static_cast(params.threads.value[1]), static_cast(params.threads.value[2])}; + */ // Only launch kernel if we have something to iterate over constexpr cuda_dim_member_t zero = 0; From 48e133e2fbcd45e41bb35a7b00b66cb9c0c50574 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 7 Jul 2025 14:17:51 -0700 Subject: [PATCH 05/10] fix compile issue --- examples/launch-high-dim-threads.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index abedd1b621..a3c41c51ff 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -147,7 +147,7 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running upper triangular pattern example on the device...\n"; } - RAJA::LaunchParams; + RAJA::LaunchParams{}; //Example of high-dimensional thread-team RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); From e25abbe3c627963bd75aabfee1548009821d53dc Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 7 Jul 2025 14:39:09 -0700 Subject: [PATCH 06/10] fix build issues with example --- examples/4D_tensor_contraction.cpp | 79 ++++++++++++++++-------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp index 918303dfc8..384366c8ab 100644 --- a/examples/4D_tensor_contraction.cpp +++ b/examples/4D_tensor_contraction.cpp @@ -36,32 +36,37 @@ * Define host/device launch policies */ using launch_policy = RAJA::LaunchPolicy; - -using teams_x = RAJA::LoopPolicy; - -using threads_x = RAJA::LoopPolicy; + +using teams = RAJA::LoopPolicy; + +using loop_0 = RAJA::LoopPolicy; +using loop_1 = RAJA::LoopPolicy; +using loop_2 = RAJA::LoopPolicy; +using loop_3 = RAJA::LoopPolicy; +using loop_4 = RAJA::LoopPolicy; +using loop_5 = RAJA::LoopPolicy; int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { constexpr int TotalMats = 100; - + constexpr int I = 2; constexpr int J = 2; constexpr int L = 2; constexpr int K = 2; constexpr int M = 2; constexpr int N = 2; - constexpr int O = 2; - + constexpr int O = 2; + double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); - auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, N, O); + auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, M, N, O); // Initialize A and B with some values for(int mat = 0; mat < TotalMats; ++mat) { @@ -74,9 +79,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } } - } - - for (int l = 0; l < L; l++) { + } + + for (int l = 0; l < L; l++) { for (int m = 0; m < M; m++) { for (int n = 0; n < N; n++) { for (int o = 0; o < O; o++) { @@ -85,37 +90,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } } - + } - - -#if 0 + RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), - //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - printf("in kernel \n"); - RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { - - // Array shared within threads of the same team - RAJA_TEAM_SHARED int s_A[1]; - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { - s_A[c] = r; - }); // loop c - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { - D(r, c) = r * N_tri + c; - printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); - }); // loop c - - }); // loop r + RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, 
RAJA::RangeSegment(0, O), [&](int o) { + + double dot = 0.0; + for(int l = 0; l < L; ++l) { + dot += A(r, i,j,k,l) * B(r, l,m,n,o); + } + C(r, i,j,k,m,n,o) = dot; + + }); + }); + }); + }); + }); + }); + }); + }); // outer lambda -#endif } // Main From 699bce1905e60338deb5f31f23e151043e819aa6 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Jul 2025 10:32:38 -0700 Subject: [PATCH 07/10] build fixes for sycl --- include/RAJA/policy/sycl/launch.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index f69e2c4424..6cdc48185e 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -33,7 +33,7 @@ struct LaunchExecute> { // If the launch lambda is trivially copyable - template {}, bool>::type = true> @@ -42,7 +42,7 @@ struct LaunchExecute> RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -102,7 +102,7 @@ struct LaunchExecute> // If the launch lambda is trivially copyable and we have explcit reduction // parameters - template {}, bool>::type = true> @@ -112,7 +112,7 @@ struct LaunchExecute> concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams launch_reducers) { @@ -194,7 +194,7 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -203,7 +203,7 @@ struct LaunchExecute> RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -269,7 +269,7 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -279,7 +279,7 @@ struct LaunchExecute> concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams launch_reducers) { From bf9f98d14af076f220377d1ef6184f6297aa0ad7 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Jul 2025 11:18:31 -0700 Subject: [PATCH 08/10] TODO - add device policy --- examples/4D_tensor_contraction.cpp | 140 +++++++++++++++++++---------- 1 file changed, 94 insertions(+), 46 deletions(-) diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp index 384366c8ab..b87daa65fa 100644 --- a/examples/4D_tensor_contraction.cpp +++ b/examples/4D_tensor_contraction.cpp @@ -32,41 +32,118 @@ * */ +// Define problem setup +constexpr int TotalMats = 100; + +constexpr int I = 2; +constexpr int J = 2; +constexpr int L = 2; +constexpr int K = 2; +constexpr int M = 2; +constexpr int N = 2; +constexpr int O = 2; + /* * Define host/device launch policies */ -using launch_policy = RAJA::LaunchPolicy; +const bool async = false; +using launch_policy = RAJA::LaunchPolicy +#endif + >; + +using teams = RAJA::LoopPolicy; + +using loop_0 = RAJA::LoopPolicy; +using loop_1 = RAJA::LoopPolicy; +using loop_2 = RAJA::LoopPolicy; +using loop_3 = RAJA::LoopPolicy; + +using loop_4 = 
RAJA::LoopPolicy; +using loop_5 = RAJA::LoopPolicy; + + +template +void tensor_contraction(AVIEW A, BVIEW B, CVIEW C, RAJA::ExecPlace platform) +{ + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { + + double dot = 0.0; + for(int l = 0; l < L; ++l) { + dot += A(r, i,j,k,l) * B(r, l,m,n,o); + } + C(r, i,j,k,m,n,o) = dot; + + }); + }); + }); + }); + }); + }); + }); + + }); // outer lambda + -using teams = RAJA::LoopPolicy; +} -using loop_0 = RAJA::LoopPolicy; -using loop_1 = RAJA::LoopPolicy; -using loop_2 = RAJA::LoopPolicy; -using loop_3 = RAJA::LoopPolicy; -using loop_4 = RAJA::LoopPolicy; -using loop_5 = RAJA::LoopPolicy; int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - constexpr int TotalMats = 100; - - constexpr int I = 2; - constexpr int J = 2; - constexpr int L = 2; - constexpr int K = 2; - constexpr int M = 2; - constexpr int N = 2; - constexpr int O = 2; double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + double *test_Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, M, N, O); + auto test_C = RAJA::make_permuted_view(test_Cptr, TotalMats, I, J, K, M, N, O); // Initialize A and B with some values for(int mat = 0; mat < TotalMats; ++mat) { @@ -94,33 +171,4 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { - RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { - RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { - RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { - - double dot = 0.0; - for(int l = 0; l < L; ++l) { - dot += A(r, i,j,k,l) * B(r, l,m,n,o); - } - C(r, i,j,k,m,n,o) = dot; - - }); - }); - }); - }); - }); - }); - }); - - }); // outer lambda - } // Main From 289fcc0a1b97431830d595a533f44b418579a82d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Jul 2025 18:36:58 -0700 Subject: [PATCH 09/10] make style --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- include/RAJA/policy/cuda/launch.hpp | 45 ++++++++++++--------- include/RAJA/policy/sycl/launch.hpp | 12 ++++-- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index af3d66560c..4424259057 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -111,7 +111,7 @@ 
template struct Threads { std::array value; - //int value[DIM]; + // int value[DIM]; #if 1 RAJA_INLINE diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index cd6479230f..d39690675d 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,21 +28,24 @@ namespace RAJA { -//internal helper function +// internal helper function namespace detail { template -constexpr T multiply_impl(const std::array& arr, std::index_sequence) { - return (arr[I] * ...); +constexpr T multiply_impl(const std::array& arr, + std::index_sequence) +{ + return (arr[I] * ...); } template -constexpr T multiplyArray(const std::array& arr) { - return multiply_impl(arr, std::make_index_sequence{}); +constexpr T multiplyArray(const std::array& arr) +{ + return multiply_impl(arr, std::make_index_sequence {}); } -} +} // namespace detail template __global__ void launch_global_fcn(BODY body_in) @@ -105,8 +108,10 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); - if(params.threads.value.size() > 3) { - std::cout<<"threads container is larger than 3 : "< 3) + { + std::cout << "threads container is larger than 3 : " + << params.threads.value.size() << std::endl; } @@ -120,18 +125,22 @@ struct LaunchExecute< cuda_dim_t blockSize; - if(params.threads.value.size() < 4) + if (params.threads.value.size() < 4) + { + blockSize = + cuda_dim_t {static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } + else { - blockSize = cuda_dim_t{static_cast(params.threads.value[0]), - static_cast(params.threads.value[1]), - static_cast(params.threads.value[2])}; - } else { int total_threads = detail::multiplyArray(params.threads.value); - std::cout<<"Total threads"<(detail::multiplyArray(params.threads.value)), - static_cast(1), - static_cast(1)}; + std::cout << "Total threads" << std::endl; + blockSize = cuda_dim_t {static_cast( + detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; } /* @@ -367,7 +376,7 @@ struct LaunchExecute< } // Version with explicit reduction parameters.. 
- template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 6cdc48185e..7dacc5105d 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -33,7 +33,8 @@ struct LaunchExecute> { // If the launch lambda is trivially copyable - template {}, bool>::type = true> @@ -102,7 +103,8 @@ struct LaunchExecute> // If the launch lambda is trivially copyable and we have explcit reduction // parameters - template {}, bool>::type = true> @@ -194,7 +196,8 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -269,7 +272,8 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> From 4bf066a05e392712f0be95b5c5cfcd76e34d540c Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 11 Jul 2025 09:23:53 -0700 Subject: [PATCH 10/10] WIP -- need to rework code so it does not break the build --- examples/4D_tensor_contraction.cpp | 43 ++++--- include/RAJA/pattern/launch/launch_core.hpp | 7 +- include/RAJA/policy/cuda/launch.hpp | 19 ---- include/RAJA/policy/hip/launch.hpp | 117 ++++++++++++++++++-- 4 files changed, 142 insertions(+), 44 deletions(-) diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp index b87daa65fa..51ffea53a6 100644 --- a/examples/4D_tensor_contraction.cpp +++ b/examples/4D_tensor_contraction.cpp @@ -61,33 +61,33 @@ using teams = RAJA::LoopPolicy #endif >; using loop_1 = RAJA::LoopPolicy #endif >; using loop_2 = RAJA::LoopPolicy #endif >; using loop_3 = RAJA::LoopPolicy #endif >; using loop_4 = RAJA::LoopPolicy #endif >; using loop_5 = RAJA::LoopPolicy #endif >; @@ -97,17 +97,17 @@ void tensor_contraction(AVIEW A, BVIEW B, CVIEW C, RAJA::ExecPlace platform) { RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), + (platform, RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { - RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { - RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { - RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { - RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { double dot = 0.0; for(int l = 0; l < L; ++l) { @@ -171,4 +171,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } + + tensor_contraction(A, B, C, RAJA::ExecPlace::HOST); + + tensor_contraction(A, B, test_C, RAJA::ExecPlace::DEVICE); + + + //test correctness + double diff = 0.0; + for(int i = 0; i < I * J * K * M * N * O; ++i) { + diff += fabs(Cptr[i] - test_Cptr[i]); + } + + std::cout<<"diff = "< +template struct Threads { + std::array value; // int value[DIM]; #if 1 @@ -180,6 +181,10 @@ class LaunchContext void* shared_mem_ptr; + //hardcoded for now... 
+ std::array thread_dim; + std::array thread_id; + #if defined(RAJA_ENABLE_SYCL) mutable ::sycl::nd_item<3>* itm; #endif diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index d39690675d..6a54205ded 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,25 +28,6 @@ namespace RAJA { -// internal helper function -namespace detail -{ - -template -constexpr T multiply_impl(const std::array& arr, - std::index_sequence) -{ - return (arr[I] * ...); -} - -template -constexpr T multiplyArray(const std::array& arr) -{ - return multiply_impl(arr, std::make_index_sequence {}); -} - -} // namespace detail - template __global__ void launch_global_fcn(BODY body_in) { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index ba1cbfd267..32e5a6595b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -28,6 +28,25 @@ namespace RAJA { +// internal helper function +namespace detail +{ + +template +constexpr T multiply_impl(const std::array& arr, + std::index_sequence) +{ + return (arr[I] * ...); +} + +template +constexpr T multiplyArray(const std::array& arr) +{ + return multiply_impl(arr, std::make_index_sequence {}); +} + +} // namespace detail + template __global__ void launch_global_fcn(BODY body_in) { @@ -44,6 +63,37 @@ __global__ void launch_global_fcn(BODY body_in) body(ctx); } +template +__global__ void launch_global_fcn_ctx(BODY body_in, LaunchContext ctx) +{ + //LaunchContext ctx; + + //unravel index + int tid = threadIdx.x; + for (int d = ctx.thread_dim.size()-1; d >= 0; --d) { + ctx.thread_id[d] = tid % ctx.thread_dim[d]; + tid /= ctx.thread_dim[d]; + } + + /* + if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0){ + for(int i=0; i __global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params) @@ -82,20 +132,14 @@ struct LaunchExecute< { using BODY = camp::decay; - auto func = reinterpret_cast(&launch_global_fcn); + auto func = reinterpret_cast(&launch_global_fcn_ctx); resources::Hip hip_res = res.get(); - - for (int k = 0; k < 3; ++k) - { - std::cout << "params.threads.value " << params.threads.value[k] - << std::endl; - } - - for (int k = 0; k < 3; ++k) + if (params.threads.value.size() > 3) { - std::cout << "params.teams.value " << params.teams.value[k] << std::endl; + std::cout << "threads container is larger than 3 : " + << params.threads.value.size() << std::endl; } @@ -107,10 +151,33 @@ struct LaunchExecute< static_cast(params.teams.value[1]), static_cast(params.teams.value[2])}; + /* hip_dim_t blockSize { static_cast(params.threads.value[0]), static_cast(params.threads.value[1]), static_cast(params.threads.value[2])}; + */ + + hip_dim_t blockSize; + + if (params.threads.value.size() < 4) + { + blockSize = + hip_dim_t {static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } + else + { + + int total_threads = detail::multiplyArray(params.threads.value); + std::cout << "Total threads" << std::endl; + blockSize = hip_dim_t {static_cast( + detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; + } + // Only launch kernel if we have something to iterate over constexpr hip_dim_member_t zero = 0; @@ -130,10 +197,14 @@ struct LaunchExecute< shared_mem_size, hip_res, std::forward(body_in)); + //Copy threads over + LaunchContext ctx; + ctx.thread_dim = params.threads.value; + // // Launch the kernel // - void* args[] = 
{(void*)&body};
+    void* args[] = {(void*)&body, (void*)&ctx};
 
     RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
                       hip_res, async);
   }
@@ -419,6 +490,30 @@ struct LaunchExecute>
   }
 };
 
+
+/*
+  Arbitrary dimension thread indexing
+*/
+template<int DIM>
+struct hip_loop_dim_exec;
+
+template<typename SEGMENT, int DIM>
+struct LoopExecute<hip_loop_dim_exec<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const &ctx,
+      SEGMENT const& segment,
+      BODY const& body)
+  {
+    const int i = ctx.thread_id[DIM];
+
+    body(*(segment.begin() + i));
+  }
+};
+
+
 /*
   HIP generic loop implementations
 */
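
A note for readers following these WIP patches: the high-dimensional Threads<DIM> values are collapsed into a one-dimensional GPU block (detail::multiplyArray computes the product of the per-dimension extents), and each flat thread id is then unraveled back into per-dimension indices (launch_global_fcn_ctx fills ctx.thread_id, which the hip_loop_dim_exec<DIM> loop policy reads). The standalone host-side sketch below mirrors that flatten/unravel round trip. It is illustrative only; the simplified multiplyArray and the dimension sizes are assumptions for the example, not code taken from the patches.

    #include <array>
    #include <cstdio>

    // Simplified stand-in for detail::multiplyArray (the patch uses a fold
    // expression over an index_sequence; a plain loop gives the same product).
    template<typename T, std::size_t N>
    constexpr T multiplyArray(const std::array<T, N>& arr)
    {
      T prod = 1;
      for (std::size_t i = 0; i < N; ++i) { prod *= arr[i]; }
      return prod;
    }

    int main()
    {
      // Illustrative sizes, e.g. Threads<6>(I, J, K, M, N, O) with every extent equal to 2.
      constexpr std::array<int, 6> dims {2, 2, 2, 2, 2, 2};
      const int total = multiplyArray(dims);  // becomes the 1-D blockSize.x

      for (int tid = 0; tid < total; ++tid)
      {
        // Same mixed-radix unravel as launch_global_fcn_ctx: the last
        // dimension varies fastest as the flat thread id increments.
        std::array<int, 6> id {};
        int t = tid;
        for (int d = static_cast<int>(dims.size()) - 1; d >= 0; --d)
        {
          id[d] = t % dims[d];
          t /= dims[d];
        }
        std::printf("tid=%2d -> (%d,%d,%d,%d,%d,%d)\n",
                    tid, id[0], id[1], id[2], id[3], id[4], id[5]);
      }
      return 0;
    }

Inside a kernel, RAJA::loop with hip_loop_dim_exec<D> then simply reads ctx.thread_id[D], so each of the six nested thread loops in the 4D tensor contraction example maps onto one recovered coordinate.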