From 06bf5e99ee993ffbd9262567c6275dd709ba1e29 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 6 Jun 2025 14:08:52 -0700 Subject: [PATCH 01/10] intial commit for high-dim thread teams --- examples/CMakeLists.txt | 4 + examples/launch-high-dim-threads.cpp | 194 ++++++++++++++++++++ include/RAJA/pattern/launch/launch_core.hpp | 33 ++-- include/RAJA/policy/hip/launch.hpp | 33 +++- include/RAJA/policy/openmp/launch.hpp | 8 +- include/RAJA/policy/sequential/launch.hpp | 8 +- scripts/lc-builds/toss4_amdclang.sh | 2 +- 7 files changed, 252 insertions(+), 30 deletions(-) create mode 100644 examples/launch-high-dim-threads.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 654f7a45ae..235e71be25 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -67,6 +67,10 @@ raja_add_executable( NAME pi-reduce_vs_atomic SOURCES pi-reduce_vs_atomic.cpp) +raja_add_executable( + NAME launch-high-dim-threads + SOURCES launch-high-dim-threads.cpp) + raja_add_executable( NAME raja-launch SOURCES raja-launch.cpp) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp new file mode 100644 index 0000000000..91ffd02b5c --- /dev/null +++ b/examples/launch-high-dim-threads.cpp @@ -0,0 +1,194 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" + + +/* + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory + * + * Launch introduces hierarchical parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::LaunchPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_launch_t +#else + RAJA::seq_launch_t +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_launch_t +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_launch_t +#endif + >; + +/* + * Define team policies. + * Up to 3 dimension are supported: x,y,z + */ +using teams_x = RAJA::LoopPolicy< +#if defined(RAJA_ENABLE_OPENMP) + RAJA::omp_parallel_for_exec +#else + RAJA::seq_exec +#endif +#if defined(RAJA_ENABLE_CUDA) + , + RAJA::cuda_block_x_direct +#endif +#if defined(RAJA_ENABLE_HIP) + , + RAJA::hip_block_x_direct +#endif + >; +/* + * Define thread policies. 
+ * Up to 3 dimension are supported: x,y,z + */ +using threads_x = RAJA::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + // Resource object for host + camp::resources::Host host_res; + + // Resource objects for CUDA or HIP +#if defined(RAJA_ENABLE_CUDA) + camp::resources::Cuda device_res; +#endif + +#if defined(RAJA_ENABLE_HIP) + camp::resources::Hip device_res; +#endif + + std::cout << "\n Running RAJA-Launch examples...\n"; + int num_of_backends = 1; +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + num_of_backends++; +#endif + + // RAJA teams may switch between host and device policies at run time. + // The loop below will execute through the available backends. + + for (int exec_place = 0; exec_place < num_of_backends; ++exec_place) { + + auto select_cpu_or_gpu = (RAJA::ExecPlace)exec_place; + + // Allocate memory for either host or device + int N_tri = 5; + + int* Ddat = nullptr; + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + Ddat = host_res.allocate(N_tri * N_tri); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + Ddat = device_res.allocate(N_tri * N_tri); + } +#endif + + /* + * RAJA::launch just starts a "kernel" and doesn't provide any looping. + * + * The first argument determines which policy should be executed, + * + * The second argument is the number of teams+threads needed for each of the + * policies. + * + * Third argument is the lambda. + * + * The lambda takes a "resource" object, which has the teams+threads + * and is used to perform thread synchronizations within a team. + */ + + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST){ + std::cout << "\n Running upper triangular pattern example on the host...\n"; + } else { + std::cout << "\n Running upper triangular pattern example on the device...\n"; + } + + //Example of high-dimensional thread-team + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); + + RAJA::View> D(Ddat, N_tri, N_tri); + + RAJA::launch + (select_cpu_or_gpu, + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + printf("in kernel \n"); + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + s_A[c] = r; + }); // loop c + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + + }); // loop r + + }); // outer lambda + + if (select_cpu_or_gpu == RAJA::ExecPlace::HOST) { + host_res.deallocate(Ddat); + } + +#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) + if (select_cpu_or_gpu == RAJA::ExecPlace::DEVICE) { + device_res.deallocate(Ddat); + } +#endif + + } // Execution places loop + + +} // Main diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index bfce94057c..92b0deea4a 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -107,10 +107,11 @@ struct Teams constexpr Teams(int i, int j, int k) : value {i, j, k} {} }; +template struct Threads { - int value[3]; - + int value[DIM]; +#if 1 RAJA_INLINE RAJA_HOST_DEVICE @@ -130,6 +131,10 @@ struct Threads RAJA_HOST_DEVICE constexpr Threads(int 
i, int j, int k) : value {i, j, k} {} + //#else + template + constexpr Threads(Args...args) : value{args...} {}; +#endif }; struct Lanes @@ -147,18 +152,20 @@ struct Lanes constexpr Lanes(int i) : value(i) {} }; +template struct LaunchParams { public: Teams teams; - Threads threads; + Threads threads; size_t shared_mem_size; RAJA_INLINE LaunchParams() = default; + //template LaunchParams(Teams in_teams, - Threads in_threads, + Threads in_threads, size_t in_shared_mem_size = 0) : teams(in_teams), threads(in_threads), @@ -170,10 +177,12 @@ struct LaunchParams RAJA_INLINE Teams apply(Teams const& a) { return (teams = a); } + /* RAJA_HOST_DEVICE RAJA_INLINE Threads apply(Threads const& a) { return (threads = a); } + */ }; class LaunchContext @@ -246,8 +255,8 @@ struct LaunchExecute; // Duplicate of code above on account that we need to support the case in which // a kernel_name is not given -template -void launch(LaunchParams const& launch_params, +template +void launch(LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { // Get reducers @@ -288,17 +297,17 @@ void launch(LaunchParams const& launch_params, //================================================= // Run time based policy launch //================================================= -template -void launch(ExecPlace place, LaunchParams const& params, BODY const& body) +template +void launch(ExecPlace place, LaunchParams const& params, BODY const& body) { launch(place, params, body); } // Run-time API for new reducer interface with support of the case without a new // kernel name -template +template void launch(ExecPlace place, - const LaunchParams& launch_params, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) // BODY const &body) { @@ -367,10 +376,10 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) // Duplicate of API above on account that we need to handle the case that a // kernel name is not provided -template +template resources::EventProxy launch( RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index ab27e9b456..0062ce50cc 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -70,13 +70,13 @@ struct LaunchExecute< RAJA::policy::hip::hip_launch_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams & params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -86,6 +86,18 @@ struct LaunchExecute< resources::Hip hip_res = res.get(); + + for(int k=0; k<3;++k) { + std::cout<<"params.threads.value "<< + params.threads.value[k]<(res); } // Version with explicit reduction parameters.. 
- template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -255,13 +270,13 @@ template struct LaunchExecute> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -318,14 +333,14 @@ struct LaunchExecute> } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2092c87bb3..2d8499145f 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,13 +28,13 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - LaunchParams const& params, + LaunchParams const& params, BODY const& body, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -55,14 +55,14 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, BODY const& body, ReduceParams& f_params) { diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index ee98804ecf..f5642274b0 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -40,13 +40,13 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - LaunchParams const& params, + LaunchParams const& params, BODY const& body, ReduceParams& RAJA_UNUSED_ARG(ReduceParams)) { @@ -64,14 +64,14 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - LaunchParams const& launch_params, + LaunchParams const& launch_params, BODY const& body, ReduceParams& launch_reducers) { diff --git a/scripts/lc-builds/toss4_amdclang.sh b/scripts/lc-builds/toss4_amdclang.sh index cfb47678bc..43e00de64f 100755 --- a/scripts/lc-builds/toss4_amdclang.sh +++ b/scripts/lc-builds/toss4_amdclang.sh @@ -68,7 +68,7 @@ cmake \ -DCMAKE_HIP_ARCHITECTURES="${COMP_ARCH}" \ 
-DGPU_TARGETS="${COMP_ARCH}" \ -DAMDGPU_TARGETS="${COMP_ARCH}" \ - -DBLT_CXX_STD=c++14 \ + -DBLT_CXX_STD=c++17 \ -C "../host-configs/lc-builds/toss4/${HOSTCONFIG}.cmake" \ -DENABLE_HIP=ON \ -DENABLE_OPENMP=ON \ From 143a82151b036d906822fe494eb7292cfe163a79 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Tue, 17 Jun 2025 14:21:57 -0700 Subject: [PATCH 02/10] build fixes for high dimensional threading --- examples/launch-high-dim-threads.cpp | 2 ++ include/RAJA/pattern/launch/launch_core.hpp | 37 +++++++-------------- include/RAJA/policy/hip/launch.hpp | 31 +++++++++-------- include/RAJA/policy/openmp/launch.hpp | 4 +-- include/RAJA/policy/sequential/launch.hpp | 4 +-- 5 files changed, 35 insertions(+), 43 deletions(-) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index 91ffd02b5c..fd804e9239 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -147,6 +147,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running upper triangular pattern example on the device...\n"; } + RAJA::LaunchParams(); + //Example of high-dimensional thread-team RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 92b0deea4a..841d9eff75 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -107,7 +107,7 @@ struct Teams constexpr Teams(int i, int j, int k) : value {i, j, k} {} }; -template +template struct Threads { int value[DIM]; @@ -131,28 +131,14 @@ struct Threads RAJA_HOST_DEVICE constexpr Threads(int i, int j, int k) : value {i, j, k} {} + //#else - template - constexpr Threads(Args...args) : value{args...} {}; + template + constexpr Threads(Args... args) : value {args...} {}; #endif }; -struct Lanes -{ - int value; - - RAJA_INLINE - - RAJA_HOST_DEVICE - constexpr Lanes() : value(0) {} - - RAJA_INLINE - - RAJA_HOST_DEVICE - constexpr Lanes(int i) : value(i) {} -}; - -template +template struct LaunchParams { public: @@ -163,7 +149,6 @@ struct LaunchParams RAJA_INLINE LaunchParams() = default; - //template LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0) @@ -255,7 +240,7 @@ struct LaunchExecute; // Duplicate of code above on account that we need to support the case in which // a kernel_name is not given -template +template void launch(LaunchParams const& launch_params, ReduceParams&&... rest_of_launch_args) { @@ -297,15 +282,17 @@ void launch(LaunchParams const& launch_params, //================================================= // Run time based policy launch //================================================= -template -void launch(ExecPlace place, LaunchParams const& params, BODY const& body) +template +void launch(ExecPlace place, + LaunchParams const& params, + BODY const& body) { launch(place, params, body); } // Run-time API for new reducer interface with support of the case without a new // kernel name -template +template void launch(ExecPlace place, LaunchParams const& launch_params, ReduceParams&&... 
rest_of_launch_args) @@ -376,7 +363,7 @@ RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device) // Duplicate of API above on account that we need to handle the case that a // kernel name is not provided -template +template resources::EventProxy launch( RAJA::resources::Resource res, LaunchParams const& launch_params, diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index 0062ce50cc..ba1cbfd267 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -70,13 +70,13 @@ struct LaunchExecute< RAJA::policy::hip::hip_launch_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams & params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -87,14 +87,15 @@ struct LaunchExecute< resources::Hip hip_res = res.get(); - for(int k=0; k<3;++k) { - std::cout<<"params.threads.value "<< - params.threads.value[k]<(res); } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -270,7 +273,7 @@ template struct LaunchExecute> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -333,7 +336,7 @@ struct LaunchExecute> } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp index 2d8499145f..a7cebc31c9 100644 --- a/include/RAJA/policy/openmp/launch.hpp +++ b/include/RAJA/policy/openmp/launch.hpp @@ -28,7 +28,7 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -55,7 +55,7 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp index f5642274b0..1026e78af2 100644 --- a/include/RAJA/policy/sequential/launch.hpp +++ b/include/RAJA/policy/sequential/launch.hpp @@ -40,7 +40,7 @@ template<> struct LaunchExecute { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, @@ -64,7 +64,7 @@ struct LaunchExecute return resources::EventProxy(res); } - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, From a26063497c8c3aecc10c399bd37e9b06850f227a Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Thu, 3 Jul 2025 11:46:26 -0700 Subject: [PATCH 03/10] build fixes for cuda --- examples/launch-high-dim-threads.cpp | 2 +- include/RAJA/policy/cuda/launch.hpp | 32 +++++++++++++++++++++------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index fd804e9239..95af2d1d2a 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -147,7 +147,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running upper triangular 
pattern example on the device...\n"; } - RAJA::LaunchParams(); + RAJA::LaunchParams; //Example of high-dimensional thread-team RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index f0b7d0be98..a7dc2a43ae 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -72,13 +72,13 @@ struct LaunchExecute< named_usage::unspecified>> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -88,6 +88,18 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); + for (int k = 0; k < 3; ++k) + { + std::cout << "params.threads.value " << params.threads.value[k] + << std::endl; + } + + for (int k = 0; k < 3; ++k) + { + std::cout << "params.teams.value " << params.teams.value[k] << std::endl; + } + + // // Compute the number of blocks and threads // @@ -129,19 +141,23 @@ struct LaunchExecute< RAJA_FT_END; } + else + { + std::cout << "did not launch kernel " << std::endl; + } return resources::EventProxy(res); } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { @@ -260,13 +276,13 @@ struct LaunchExecute< RAJA::policy::cuda::cuda_launch_explicit_t> { - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -323,14 +339,14 @@ struct LaunchExecute< } // Version with explicit reduction parameters.. - template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams& launch_reducers) { From ae7c6cec33e2eba8fd5511b473bc41ef45127c9c Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 7 Jul 2025 13:22:00 -0700 Subject: [PATCH 04/10] intial 4D contraction example --- examples/4D_tensor_contraction.cpp | 121 ++++++++++++++++++++ examples/CMakeLists.txt | 4 + examples/launch-high-dim-threads.cpp | 4 +- include/RAJA/pattern/launch/launch_core.hpp | 5 +- include/RAJA/policy/cuda/launch.hpp | 44 +++++-- 5 files changed, 166 insertions(+), 12 deletions(-) create mode 100644 examples/4D_tensor_contraction.cpp diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp new file mode 100644 index 0000000000..918303dfc8 --- /dev/null +++ b/examples/4D_tensor_contraction.cpp @@ -0,0 +1,121 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC +// and RAJA project contributors. 
See the RAJA/LICENSE file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +#include +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "camp/resource.hpp" +#include "memoryManager.hpp" + + +/* + * RAJA Launch Example: Upper Triangular Pattern + Shared Memory + * + * Launch introduces hierarchical parallelism through the concept of + * teams and threads. Computation is executed in a pre-defined grid + * composed of threads and grouped into teams. The teams model enables + * developers to express parallelism through loops over teams, and inner loops + * over threads. Team loops are executed in parallel and + * threads within a team should be treated as sub-parallel regions. + * + * Team shared memory is allocated between team and thread loops. + * Memory allocated within thread loops are thread private. + * The example below demonstrates composing an upper triangular + * loop pattern, and using shared memory. + * + */ + +/* + * Define host/device launch policies + */ +using launch_policy = RAJA::LaunchPolicy; + +using teams_x = RAJA::LoopPolicy; + +using threads_x = RAJA::LoopPolicy; + + +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) +{ + + constexpr int TotalMats = 100; + + constexpr int I = 2; + constexpr int J = 2; + constexpr int L = 2; + constexpr int K = 2; + constexpr int M = 2; + constexpr int N = 2; + constexpr int O = 2; + + double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); + double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); + double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + + auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); + auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); + auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, N, O); + + // Initialize A and B with some values + for(int mat = 0; mat < TotalMats; ++mat) { + + for (int i = 0; i < I; i++) { + for (int j = 0; j < J; j++) { + for (int k = 0; k < K; k++) { + for (int l = 0; l < L; l++) { + A(mat, i, j, k, l) = 1.0; + } + } + } + } + + for (int l = 0; l < L; l++) { + for (int m = 0; m < M; m++) { + for (int n = 0; n < N; n++) { + for (int o = 0; o < O; o++) { + B(mat, l, m, n, o) = 1.0; + } + } + } + } + + } + + + +#if 0 + RAJA::launch + (select_cpu_or_gpu, + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + printf("in kernel \n"); + RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { + + // Array shared within threads of the same team + RAJA_TEAM_SHARED int s_A[1]; + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { + s_A[c] = r; + }); // loop c + + ctx.teamSync(); + + RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { + D(r, c) = r * N_tri + c; + printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); + }); // loop c + + }); // loop r + + }); // outer lambda +#endif + +} // Main diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 235e71be25..0c4dcd20ca 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -71,6 +71,10 @@ raja_add_executable( NAME launch-high-dim-threads SOURCES launch-high-dim-threads.cpp) +raja_add_executable( + NAME 4D_tensor_contraction + SOURCES 4D_tensor_contraction.cpp) + raja_add_executable( NAME raja-launch SOURCES raja-launch.cpp) diff --git 
a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index 95af2d1d2a..abedd1b621 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -156,8 +156,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) RAJA::launch (select_cpu_or_gpu, - //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), - RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), + //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { printf("in kernel \n"); RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index 841d9eff75..af3d66560c 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -110,7 +110,8 @@ struct Teams template struct Threads { - int value[DIM]; + std::array value; + //int value[DIM]; #if 1 RAJA_INLINE @@ -134,7 +135,7 @@ struct Threads //#else template - constexpr Threads(Args... args) : value {args...} {}; + constexpr Threads(Args... args) : value {static_cast(args)...} {}; #endif }; diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index a7dc2a43ae..cd6479230f 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,6 +28,22 @@ namespace RAJA { +//internal helper function +namespace detail +{ + +template +constexpr T multiply_impl(const std::array& arr, std::index_sequence) { + return (arr[I] * ...); +} + +template +constexpr T multiplyArray(const std::array& arr) { + return multiply_impl(arr, std::make_index_sequence{}); +} + +} + template __global__ void launch_global_fcn(BODY body_in) { @@ -88,15 +104,9 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); - for (int k = 0; k < 3; ++k) - { - std::cout << "params.threads.value " << params.threads.value[k] - << std::endl; - } - for (int k = 0; k < 3; ++k) - { - std::cout << "params.teams.value " << params.teams.value[k] << std::endl; + if(params.threads.value.size() > 3) { + std::cout<<"threads container is larger than 3 : "<(params.teams.value[1]), static_cast(params.teams.value[2])}; + cuda_dim_t blockSize; + + if(params.threads.value.size() < 4) + { + blockSize = cuda_dim_t{static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } else { + + int total_threads = detail::multiplyArray(params.threads.value); + std::cout<<"Total threads"<(detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; + } + + /* cuda_dim_t blockSize { static_cast(params.threads.value[0]), static_cast(params.threads.value[1]), static_cast(params.threads.value[2])}; + */ // Only launch kernel if we have something to iterate over constexpr cuda_dim_member_t zero = 0; From 48e133e2fbcd45e41bb35a7b00b66cb9c0c50574 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 7 Jul 2025 14:17:51 -0700 Subject: [PATCH 05/10] fix compile issue --- examples/launch-high-dim-threads.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/launch-high-dim-threads.cpp b/examples/launch-high-dim-threads.cpp index abedd1b621..a3c41c51ff 100644 --- a/examples/launch-high-dim-threads.cpp +++ b/examples/launch-high-dim-threads.cpp @@ -147,7 +147,7 @@ int main(int RAJA_UNUSED_ARG(argc), char 
**RAJA_UNUSED_ARG(argv[])) std::cout << "\n Running upper triangular pattern example on the device...\n"; } - RAJA::LaunchParams; + RAJA::LaunchParams{}; //Example of high-dimensional thread-team RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)); From e25abbe3c627963bd75aabfee1548009821d53dc Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Mon, 7 Jul 2025 14:39:09 -0700 Subject: [PATCH 06/10] fix build issues with example --- examples/4D_tensor_contraction.cpp | 79 ++++++++++++++++-------------- 1 file changed, 42 insertions(+), 37 deletions(-) diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp index 918303dfc8..384366c8ab 100644 --- a/examples/4D_tensor_contraction.cpp +++ b/examples/4D_tensor_contraction.cpp @@ -36,32 +36,37 @@ * Define host/device launch policies */ using launch_policy = RAJA::LaunchPolicy; - -using teams_x = RAJA::LoopPolicy; - -using threads_x = RAJA::LoopPolicy; + +using teams = RAJA::LoopPolicy; + +using loop_0 = RAJA::LoopPolicy; +using loop_1 = RAJA::LoopPolicy; +using loop_2 = RAJA::LoopPolicy; +using loop_3 = RAJA::LoopPolicy; +using loop_4 = RAJA::LoopPolicy; +using loop_5 = RAJA::LoopPolicy; int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { constexpr int TotalMats = 100; - + constexpr int I = 2; constexpr int J = 2; constexpr int L = 2; constexpr int K = 2; constexpr int M = 2; constexpr int N = 2; - constexpr int O = 2; - + constexpr int O = 2; + double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); - auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, N, O); + auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, M, N, O); // Initialize A and B with some values for(int mat = 0; mat < TotalMats; ++mat) { @@ -74,9 +79,9 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } } - } - - for (int l = 0; l < L; l++) { + } + + for (int l = 0; l < L; l++) { for (int m = 0; m < M; m++) { for (int n = 0; n < N; n++) { for (int o = 0; o < O; o++) { @@ -85,37 +90,37 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } } } - + } - - -#if 0 + RAJA::launch - (select_cpu_or_gpu, - RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads<4>(1,2,3,4)), - //RAJA::LaunchParams(RAJA::Teams(N_tri), RAJA::Threads(N_tri)), + (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - printf("in kernel \n"); - RAJA::loop(ctx, RAJA::RangeSegment(0, N_tri), [&](int r) { - - // Array shared within threads of the same team - RAJA_TEAM_SHARED int s_A[1]; - - RAJA::loop(ctx, RAJA::RangeSegment(0, 1), [&](int c) { - s_A[c] = r; - }); // loop c - - ctx.teamSync(); - - RAJA::loop(ctx, RAJA::RangeSegment(r, N_tri), [&](int c) { - D(r, c) = r * N_tri + c; - printf("r=%d, c=%d : D=%d : s_A = %d \n", r, c, D(r, c), s_A[0]); - }); // loop c - - }); // loop r + RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, 
RAJA::RangeSegment(0, O), [&](int o) { + + double dot = 0.0; + for(int l = 0; l < L; ++l) { + dot += A(r, i,j,k,l) * B(r, l,m,n,o); + } + C(r, i,j,k,m,n,o) = dot; + + }); + }); + }); + }); + }); + }); + }); + }); // outer lambda -#endif } // Main From 699bce1905e60338deb5f31f23e151043e819aa6 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Jul 2025 10:32:38 -0700 Subject: [PATCH 07/10] build fixes for sycl --- include/RAJA/policy/sycl/launch.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index f69e2c4424..6cdc48185e 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -33,7 +33,7 @@ struct LaunchExecute> { // If the launch lambda is trivially copyable - template {}, bool>::type = true> @@ -42,7 +42,7 @@ struct LaunchExecute> RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -102,7 +102,7 @@ struct LaunchExecute> // If the launch lambda is trivially copyable and we have explcit reduction // parameters - template {}, bool>::type = true> @@ -112,7 +112,7 @@ struct LaunchExecute> concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams launch_reducers) { @@ -194,7 +194,7 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -203,7 +203,7 @@ struct LaunchExecute> RAJA::expt::type_traits::is_ForallParamPack, RAJA::expt::type_traits::is_ForallParamPack_empty> exec(RAJA::resources::Resource res, - const LaunchParams& params, + const LaunchParams& params, BODY_IN&& body_in, ReduceParams& RAJA_UNUSED_ARG(launch_reducers)) { @@ -269,7 +269,7 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -279,7 +279,7 @@ struct LaunchExecute> concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty>> exec(RAJA::resources::Resource res, - const LaunchParams& launch_params, + const LaunchParams& launch_params, BODY_IN&& body_in, ReduceParams launch_reducers) { From bf9f98d14af076f220377d1ef6184f6297aa0ad7 Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Jul 2025 11:18:31 -0700 Subject: [PATCH 08/10] TODO - add device policy --- examples/4D_tensor_contraction.cpp | 140 +++++++++++++++++++---------- 1 file changed, 94 insertions(+), 46 deletions(-) diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp index 384366c8ab..b87daa65fa 100644 --- a/examples/4D_tensor_contraction.cpp +++ b/examples/4D_tensor_contraction.cpp @@ -32,41 +32,118 @@ * */ +// Define problem setup +constexpr int TotalMats = 100; + +constexpr int I = 2; +constexpr int J = 2; +constexpr int L = 2; +constexpr int K = 2; +constexpr int M = 2; +constexpr int N = 2; +constexpr int O = 2; + /* * Define host/device launch policies */ -using launch_policy = RAJA::LaunchPolicy; +const bool async = false; +using launch_policy = RAJA::LaunchPolicy +#endif + >; + +using teams = RAJA::LoopPolicy; + +using loop_0 = RAJA::LoopPolicy; +using loop_1 = RAJA::LoopPolicy; +using loop_2 = RAJA::LoopPolicy; +using loop_3 = RAJA::LoopPolicy; + +using loop_4 = 
RAJA::LoopPolicy; +using loop_5 = RAJA::LoopPolicy; + + +template +void tensor_contraction(AVIEW A, BVIEW B, CVIEW C, RAJA::ExecPlace platform) +{ + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), + [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { + + double dot = 0.0; + for(int l = 0; l < L; ++l) { + dot += A(r, i,j,k,l) * B(r, l,m,n,o); + } + C(r, i,j,k,m,n,o) = dot; + + }); + }); + }); + }); + }); + }); + }); + + }); // outer lambda + -using teams = RAJA::LoopPolicy; +} -using loop_0 = RAJA::LoopPolicy; -using loop_1 = RAJA::LoopPolicy; -using loop_2 = RAJA::LoopPolicy; -using loop_3 = RAJA::LoopPolicy; -using loop_4 = RAJA::LoopPolicy; -using loop_5 = RAJA::LoopPolicy; int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) { - constexpr int TotalMats = 100; - - constexpr int I = 2; - constexpr int J = 2; - constexpr int L = 2; - constexpr int K = 2; - constexpr int M = 2; - constexpr int N = 2; - constexpr int O = 2; double *Aptr = memoryManager::allocate(TotalMats * I * J * K * L); double *Bptr = memoryManager::allocate(TotalMats * L * M * N * O); double *Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + double *test_Cptr = memoryManager::allocate(TotalMats * I * J * K * M * N * O); + auto A = RAJA::make_permuted_view(Aptr, TotalMats, I, J, K, L); auto B = RAJA::make_permuted_view(Bptr, TotalMats, L, M, N, O); auto C = RAJA::make_permuted_view(Cptr, TotalMats, I, J, K, M, N, O); + auto test_C = RAJA::make_permuted_view(test_Cptr, TotalMats, I, J, K, M, N, O); // Initialize A and B with some values for(int mat = 0; mat < TotalMats; ++mat) { @@ -94,33 +171,4 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } - RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), - [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { - - RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { - RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { - RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { - RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { - - double dot = 0.0; - for(int l = 0; l < L; ++l) { - dot += A(r, i,j,k,l) * B(r, l,m,n,o); - } - C(r, i,j,k,m,n,o) = dot; - - }); - }); - }); - }); - }); - }); - }); - - }); // outer lambda - } // Main From 289fcc0a1b97431830d595a533f44b418579a82d Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 9 Jul 2025 18:36:58 -0700 Subject: [PATCH 09/10] make style --- include/RAJA/pattern/launch/launch_core.hpp | 2 +- include/RAJA/policy/cuda/launch.hpp | 45 ++++++++++++--------- include/RAJA/policy/sycl/launch.hpp | 12 ++++-- 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp index af3d66560c..4424259057 100644 --- a/include/RAJA/pattern/launch/launch_core.hpp +++ b/include/RAJA/pattern/launch/launch_core.hpp @@ -111,7 +111,7 @@ 
template struct Threads { std::array value; - //int value[DIM]; + // int value[DIM]; #if 1 RAJA_INLINE diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index cd6479230f..d39690675d 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,21 +28,24 @@ namespace RAJA { -//internal helper function +// internal helper function namespace detail { template -constexpr T multiply_impl(const std::array& arr, std::index_sequence) { - return (arr[I] * ...); +constexpr T multiply_impl(const std::array& arr, + std::index_sequence) +{ + return (arr[I] * ...); } template -constexpr T multiplyArray(const std::array& arr) { - return multiply_impl(arr, std::make_index_sequence{}); +constexpr T multiplyArray(const std::array& arr) +{ + return multiply_impl(arr, std::make_index_sequence {}); } -} +} // namespace detail template __global__ void launch_global_fcn(BODY body_in) @@ -105,8 +108,10 @@ struct LaunchExecute< resources::Cuda cuda_res = res.get(); - if(params.threads.value.size() > 3) { - std::cout<<"threads container is larger than 3 : "< 3) + { + std::cout << "threads container is larger than 3 : " + << params.threads.value.size() << std::endl; } @@ -120,18 +125,22 @@ struct LaunchExecute< cuda_dim_t blockSize; - if(params.threads.value.size() < 4) + if (params.threads.value.size() < 4) + { + blockSize = + cuda_dim_t {static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } + else { - blockSize = cuda_dim_t{static_cast(params.threads.value[0]), - static_cast(params.threads.value[1]), - static_cast(params.threads.value[2])}; - } else { int total_threads = detail::multiplyArray(params.threads.value); - std::cout<<"Total threads"<(detail::multiplyArray(params.threads.value)), - static_cast(1), - static_cast(1)}; + std::cout << "Total threads" << std::endl; + blockSize = cuda_dim_t {static_cast( + detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; } /* @@ -367,7 +376,7 @@ struct LaunchExecute< } // Version with explicit reduction parameters.. 
- template + template static concepts::enable_if_t< resources::EventProxy, RAJA::expt::type_traits::is_ForallParamPack, diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp index 6cdc48185e..7dacc5105d 100644 --- a/include/RAJA/policy/sycl/launch.hpp +++ b/include/RAJA/policy/sycl/launch.hpp @@ -33,7 +33,8 @@ struct LaunchExecute> { // If the launch lambda is trivially copyable - template {}, bool>::type = true> @@ -102,7 +103,8 @@ struct LaunchExecute> // If the launch lambda is trivially copyable and we have explcit reduction // parameters - template {}, bool>::type = true> @@ -194,7 +196,8 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> @@ -269,7 +272,8 @@ struct LaunchExecute> } // If the launch lambda is not trivially copyable - template {}, bool>::type = true> From 4bf066a05e392712f0be95b5c5cfcd76e34d540c Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Fri, 11 Jul 2025 09:23:53 -0700 Subject: [PATCH 10/10] WIP -- need to rework code so it does not break the build --- examples/4D_tensor_contraction.cpp | 43 ++++--- include/RAJA/pattern/launch/launch_core.hpp | 7 +- include/RAJA/policy/cuda/launch.hpp | 19 ---- include/RAJA/policy/hip/launch.hpp | 117 ++++++++++++++++++-- 4 files changed, 142 insertions(+), 44 deletions(-) diff --git a/examples/4D_tensor_contraction.cpp b/examples/4D_tensor_contraction.cpp index b87daa65fa..51ffea53a6 100644 --- a/examples/4D_tensor_contraction.cpp +++ b/examples/4D_tensor_contraction.cpp @@ -61,33 +61,33 @@ using teams = RAJA::LoopPolicy #endif >; using loop_1 = RAJA::LoopPolicy #endif >; using loop_2 = RAJA::LoopPolicy #endif >; using loop_3 = RAJA::LoopPolicy #endif >; using loop_4 = RAJA::LoopPolicy #endif >; using loop_5 = RAJA::LoopPolicy #endif >; @@ -97,17 +97,17 @@ void tensor_contraction(AVIEW A, BVIEW B, CVIEW C, RAJA::ExecPlace platform) { RAJA::launch - (RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), + (platform, RAJA::LaunchParams(RAJA::Teams(TotalMats), RAJA::Threads<6>(I, J, K, M, N, O)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, TotalMats), [&](int r) { - RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { - RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { - RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { - RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { - RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { + RAJA::loop(ctx, RAJA::RangeSegment(0, I), [&](int i) { + RAJA::loop(ctx, RAJA::RangeSegment(0, J), [&](int j) { + RAJA::loop(ctx, RAJA::RangeSegment(0, K), [&](int k) { + RAJA::loop(ctx, RAJA::RangeSegment(0, M), [&](int m) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N), [&](int n) { + RAJA::loop(ctx, RAJA::RangeSegment(0, O), [&](int o) { double dot = 0.0; for(int l = 0; l < L; ++l) { @@ -171,4 +171,21 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[])) } + + tensor_contraction(A, B, C, RAJA::ExecPlace::HOST); + + tensor_contraction(A, B, test_C, RAJA::ExecPlace::DEVICE); + + + //test correctness + double diff = 0.0; + for(int i = 0; i < I * J * K * M * N * O; ++i) { + diff += fabs(Cptr[i] - test_Cptr[i]); + } + + std::cout<<"diff = "< +template struct Threads { + std::array value; // int value[DIM]; #if 1 @@ -180,6 +181,10 @@ class LaunchContext void* shared_mem_ptr; + //hardcoded for now... 
+ std::array thread_dim; + std::array thread_id; + #if defined(RAJA_ENABLE_SYCL) mutable ::sycl::nd_item<3>* itm; #endif diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp index d39690675d..6a54205ded 100644 --- a/include/RAJA/policy/cuda/launch.hpp +++ b/include/RAJA/policy/cuda/launch.hpp @@ -28,25 +28,6 @@ namespace RAJA { -// internal helper function -namespace detail -{ - -template -constexpr T multiply_impl(const std::array& arr, - std::index_sequence) -{ - return (arr[I] * ...); -} - -template -constexpr T multiplyArray(const std::array& arr) -{ - return multiply_impl(arr, std::make_index_sequence {}); -} - -} // namespace detail - template __global__ void launch_global_fcn(BODY body_in) { diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp index ba1cbfd267..32e5a6595b 100644 --- a/include/RAJA/policy/hip/launch.hpp +++ b/include/RAJA/policy/hip/launch.hpp @@ -28,6 +28,25 @@ namespace RAJA { +// internal helper function +namespace detail +{ + +template +constexpr T multiply_impl(const std::array& arr, + std::index_sequence) +{ + return (arr[I] * ...); +} + +template +constexpr T multiplyArray(const std::array& arr) +{ + return multiply_impl(arr, std::make_index_sequence {}); +} + +} // namespace detail + template __global__ void launch_global_fcn(BODY body_in) { @@ -44,6 +63,37 @@ __global__ void launch_global_fcn(BODY body_in) body(ctx); } +template +__global__ void launch_global_fcn_ctx(BODY body_in, LaunchContext ctx) +{ + //LaunchContext ctx; + + //unravel index + int tid = threadIdx.x; + for (int d = ctx.thread_dim.size()-1; d >= 0; --d) { + ctx.thread_id[d] = tid % ctx.thread_dim[d]; + tid /= ctx.thread_dim[d]; + } + + /* + if(threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0){ + for(int i=0; i __global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params) @@ -82,20 +132,14 @@ struct LaunchExecute< { using BODY = camp::decay; - auto func = reinterpret_cast(&launch_global_fcn); + auto func = reinterpret_cast(&launch_global_fcn_ctx); resources::Hip hip_res = res.get(); - - for (int k = 0; k < 3; ++k) - { - std::cout << "params.threads.value " << params.threads.value[k] - << std::endl; - } - - for (int k = 0; k < 3; ++k) + if (params.threads.value.size() > 3) { - std::cout << "params.teams.value " << params.teams.value[k] << std::endl; + std::cout << "threads container is larger than 3 : " + << params.threads.value.size() << std::endl; } @@ -107,10 +151,33 @@ struct LaunchExecute< static_cast(params.teams.value[1]), static_cast(params.teams.value[2])}; + /* hip_dim_t blockSize { static_cast(params.threads.value[0]), static_cast(params.threads.value[1]), static_cast(params.threads.value[2])}; + */ + + hip_dim_t blockSize; + + if (params.threads.value.size() < 4) + { + blockSize = + hip_dim_t {static_cast(params.threads.value[0]), + static_cast(params.threads.value[1]), + static_cast(params.threads.value[2])}; + } + else + { + + int total_threads = detail::multiplyArray(params.threads.value); + std::cout << "Total threads" << std::endl; + blockSize = hip_dim_t {static_cast( + detail::multiplyArray(params.threads.value)), + static_cast(1), + static_cast(1)}; + } + // Only launch kernel if we have something to iterate over constexpr hip_dim_member_t zero = 0; @@ -130,10 +197,14 @@ struct LaunchExecute< shared_mem_size, hip_res, std::forward(body_in)); + //Copy threads over + LaunchContext ctx; + ctx.thread_dim = params.threads.value; + // // Launch the kernel // - void* args[] = 
{(void*)&body};
+    void* args[] = {(void*)&body, (void*)&ctx};
 
     RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
                       hip_res, async);
   }
@@ -419,6 +490,30 @@ struct LaunchExecute>
   }
 };
 
+
+/*
+  Arbitrary dimension thread indexing
+*/
+template<int DIM>
+struct hip_loop_dim_exec;
+
+template<typename SEGMENT, int DIM>
+struct LoopExecute<hip_loop_dim_exec<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const &ctx,
+      SEGMENT const& segment,
+      BODY const& body)
+  {
+    const int i = ctx.thread_id[DIM];
+
+    body(*(segment.begin() + i));
+  }
+};
+
+
 /*
   HIP generic loop implementations
 */
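
A note for readers following these WIP patches: the high-dimensional Threads<DIM> values are collapsed into a one-dimensional GPU block (detail::multiplyArray computes the product of the per-dimension extents), and each flat thread id is then unraveled back into per-dimension indices (launch_global_fcn_ctx fills ctx.thread_id, which the hip_loop_dim_exec<DIM> loop policy reads). The standalone host-side sketch below mirrors that flatten/unravel round trip. It is illustrative only; the simplified multiplyArray and the dimension sizes are assumptions for the example, not code taken from the patches.

    #include <array>
    #include <cstdio>

    // Simplified stand-in for detail::multiplyArray (the patch uses a fold
    // expression over an index_sequence; a plain loop gives the same product).
    template<typename T, std::size_t N>
    constexpr T multiplyArray(const std::array<T, N>& arr)
    {
      T prod = 1;
      for (std::size_t i = 0; i < N; ++i) { prod *= arr[i]; }
      return prod;
    }

    int main()
    {
      // Illustrative sizes, e.g. Threads<6>(I, J, K, M, N, O) with every extent equal to 2.
      constexpr std::array<int, 6> dims {2, 2, 2, 2, 2, 2};
      const int total = multiplyArray(dims);  // becomes the 1-D blockSize.x

      for (int tid = 0; tid < total; ++tid)
      {
        // Same mixed-radix unravel as launch_global_fcn_ctx: the last
        // dimension varies fastest as the flat thread id increments.
        std::array<int, 6> id {};
        int t = tid;
        for (int d = static_cast<int>(dims.size()) - 1; d >= 0; --d)
        {
          id[d] = t % dims[d];
          t /= dims[d];
        }
        std::printf("tid=%2d -> (%d,%d,%d,%d,%d,%d)\n",
                    tid, id[0], id[1], id[2], id[3], id[4], id[5]);
      }
      return 0;
    }

Inside a kernel, RAJA::loop with hip_loop_dim_exec<D> then simply reads ctx.thread_id[D], so each of the six nested thread loops in the 4D tensor contraction example maps onto one recovered coordinate.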