19 changes: 7 additions & 12 deletions exercises/kernel-matrix-transpose-local-array_solution.cpp
@@ -399,8 +399,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// These loops iterate over the number of
// tiles needed to carry out the transpose
//
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_loop,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_loop,
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_direct_unchecked,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_direct_unchecked,
// This statement will initialize local array memory inside a
// kernel. The cpu_tile_mem policy specifies that memory should be
// allocated on the stack. The entries in the RAJA::ParamList
@@ -431,10 +431,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
RAJA::statement::Lambda<1>
>
>,
// Synchronize threads to ensure all reads
// from the local array are complete
RAJA::statement::CudaSyncThreads
>
>
>
>
@@ -494,8 +491,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// These loops iterate over the number of
// tiles needed to carry out the transpose
//
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_loop,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_loop,
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_direct_unchecked,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_direct_unchecked,
// This statement will initialize local array memory inside a
// kernel. The cpu_tile_mem policy specifies that memory should be
// allocated on the stack. The entries in the RAJA::ParamList
@@ -526,10 +523,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct,
RAJA::statement::Lambda<1>
>
>,
// Synchronize threads to ensure all reads
// from the local array are complete
RAJA::statement::HipSyncThreads
>
>
>
>
@@ -556,6 +550,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
);

CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
checkResult<int>(Atview, N_c, N_r);
// printResult<int>(Atview, N_c, N_r);
#endif
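Editorial note on the two hunks above: the block-tiling policies change from `cuda/hip_block_*_loop` to `cuda/hip_block_*_direct_unchecked`, so each thread block now handles exactly one tile and the launch configuration is assumed to cover the tile space exactly. With the old `*_loop` policies a block could revisit the shared tile on a later tile iteration, which is why a trailing `CudaSyncThreads`/`HipSyncThreads` was needed after the read-out phase; with one tile per block, only the synchronization between the load and store phases remains. A condensed sketch of the resulting CUDA policy is below; the `InitLocalMem` memory policy and the `Param`/`ParamList` indices are assumptions made to keep the sketch self-contained, and the solution file itself is authoritative.

```cpp
// Condensed sketch of the updated CUDA kernel policy (not copied verbatim from
// the solution file). RAJA::cuda_shared_mem and the ParamList<2>/Param<0>/Param<1>
// indices are assumptions for illustration.
using transpose_policy_sketch = RAJA::KernelPolicy<
  RAJA::statement::CudaKernel<
    // One tile per (block_y, block_x): *_direct_unchecked assumes the launch
    // configuration covers the tile space exactly, with no per-block loop.
    RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_direct_unchecked,
      RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_direct_unchecked,
        RAJA::statement::InitLocalMem<RAJA::cuda_shared_mem, RAJA::ParamList<2>,

          // Phase 1: Lambda<0> loads a tile of A into the local array.
          RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct,
            RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
              RAJA::statement::Lambda<0>
            >
          >,

          // Sync so every load completes before any thread reads the tile.
          RAJA::statement::CudaSyncThreads,

          // Phase 2: Lambda<1> writes the tile out transposed. Each block
          // touches the tile exactly once, so no trailing sync is needed,
          // which is why the CudaSyncThreads after this nest was removed above.
          RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct,
            RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
              RAJA::statement::Lambda<1>
            >
          >
        >
      >
    >
  >
>;
```

The HIP hunk mirrors this structure with `hip_block_*_direct_unchecked` policies and `HipSyncThreads`.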
45 changes: 24 additions & 21 deletions exercises/launch-matrix-transpose-local-array.cpp
@@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
//
// (2) Inner loops to write array data into output array tile
//
// Note: loop order is swapped from above so that output matrix
// data access is stride-1.
//
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col = by * TILE_DIM + tx; // Matrix column index
int row = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row < N_c && col < N_r) {
Atview(row, col) = Tile[tx][ty];
}
}
}
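As a sanity check on the updated index arithmetic in this hunk, the following self-contained host-only sketch (plain C++, no RAJA; the sizes and row-major accessors are assumptions for illustration) applies the same tile read/write pattern and asserts that At(row, col) == A(col, row):

```cpp
#include <cassert>
#include <vector>

int main()
{
  constexpr int TILE_DIM = 16;
  const int N_r = 267, N_c = 251;   // deliberately not multiples of TILE_DIM

  std::vector<int> A(N_r * N_c), At(N_c * N_r);
  for (int i = 0; i < N_r * N_c; ++i) A[i] = i;

  // Row-major accessors standing in for the RAJA Views.
  auto Aview  = [&](int r, int c) -> int& { return A[r * N_c + c];  };  // N_r x N_c
  auto Atview = [&](int r, int c) -> int& { return At[r * N_r + c]; };  // N_c x N_r

  for (int by = 0; by < (N_r + TILE_DIM - 1) / TILE_DIM; ++by) {
    for (int bx = 0; bx < (N_c + TILE_DIM - 1) / TILE_DIM; ++bx) {

      int Tile[TILE_DIM][TILE_DIM];

      // (1) Read a tile of A into the local array.
      for (int ty = 0; ty < TILE_DIM; ++ty) {
        for (int tx = 0; tx < TILE_DIM; ++tx) {
          int row = by * TILE_DIM + ty;
          int col = bx * TILE_DIM + tx;
          if (row < N_r && col < N_c) Tile[ty][tx] = Aview(row, col);
        }
      }

      // (2) Write the tile out transposed, using the updated index pattern.
      for (int ty = 0; ty < TILE_DIM; ++ty) {
        for (int tx = 0; tx < TILE_DIM; ++tx) {
          int col = by * TILE_DIM + tx;   // column index of At (spans N_r)
          int row = bx * TILE_DIM + ty;   // row index of At (spans N_c)
          if (row < N_c && col < N_r) Atview(row, col) = Tile[tx][ty];
        }
      }
    }
  }

  for (int r = 0; r < N_c; ++r)
    for (int c = 0; c < N_r; ++c)
      assert(Atview(r, c) == Aview(c, r));

  return 0;
}
```

Note that the bounds check in the write phase mirrors the read phase exactly, so edge tiles never read tile entries that were left unwritten.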
@@ -195,10 +194,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
/// input matrix into the RAJA_TEAM_SHARED memory array
///

RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -244,18 +243,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {

Tile_Array[ty][tx] = Aview(row, col);
Tile_Array[ty][tx] = Aview(row, col);

});
});

RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -307,10 +306,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

Atview(col, row) = Tile_Array[ty][tx];
RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col, int tx) {

Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -379,10 +380,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col, int tx) {

d_Atview(col, row) = Tile_Array[ty][tx];
d_Atview(row, col) = Tile_Array[tx][ty];

});
});
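A note on the (index, count) lambda arguments used throughout these files: `RAJA::loop_icount` hands the lambda both the global index from the segment and the zero-based offset within the current tile, which is what lets one lambda address the global Views (row/col) and the local tile array (ty/tx) at the same time. A minimal sketch of the convention, reusing the names from the file above:

```cpp
// Sketch of the (global index, tile-local count) convention; loop_pol_1,
// TILE_DIM, N_r, and ctx are assumed to be the ones defined in the exercise.
RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
  [&] (RAJA::TypedRangeSegment<int> const &row_tile) {

    RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
      // row : global index in [0, N_r)       -> used with Aview / Atview
      // ty  : offset within this tile, in [0, TILE_DIM) -> used with Tile_Array
    });
});
```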
97 changes: 55 additions & 42 deletions exercises/launch-matrix-transpose-local-array_solution.cpp
@@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
//
// (2) Inner loops to write array data into output array tile
//
// Note: loop order is swapped from above so that output matrix
// data access is stride-1.
//
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {
for (int ty = 0; ty < TILE_DIM; ++ty) {
for (int tx = 0; tx < TILE_DIM; ++tx) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col = by * TILE_DIM + tx; // Matrix column index
int row = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row < N_c && col < N_r) {
Atview(row, col) = Tile[tx][ty];
}
}
}
@@ -182,9 +181,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

Expand All @@ -196,10 +198,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -232,9 +234,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

@@ -246,10 +250,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -274,8 +278,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);

using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct_unchecked>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct_unchecked>;

using cuda_threads_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>;
using cuda_threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
@@ -285,12 +289,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::launch<cuda_launch_policy>(
RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
RAJA::Threads(c_block_sz, r_block_sz)),
RAJA::Threads(c_block_sz, r_block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

@@ -302,16 +308,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

Atview(col, row) = Tile_Array[ty][tx];
RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col, int tx) {

});
});
Atview(row, col) = Tile_Array[tx][ty];

});
});

});
});
});
});

});
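For readers more familiar with raw CUDA, the launch-based loops above follow the classic shared-memory tile transpose: load a tile, synchronize the team, then store it transposed with thread x walking the contiguous dimension of At so the global writes coalesce. A hand-written CUDA analogue is sketched below; it assumes blockDim is TILE_DIM x TILE_DIM, the grid covers the matrix, and both matrices are row-major int arrays as in this exercise. `ctx.teamSync()` in the RAJA::launch version plays the role of `__syncthreads()` here.

```cpp
// Hand-written CUDA analogue of the launch-based transpose (sketch only).
// A is N_r x N_c row-major, At is N_c x N_r row-major.
template <int TILE_DIM>
__global__ void transpose_local_tile(const int* A, int* At, int N_r, int N_c)
{
  __shared__ int tile[TILE_DIM][TILE_DIM];

  int col = blockIdx.x * TILE_DIM + threadIdx.x;   // column of A
  int row = blockIdx.y * TILE_DIM + threadIdx.y;   // row of A

  if (row < N_r && col < N_c) {
    tile[threadIdx.y][threadIdx.x] = A[row * N_c + col];   // phase 1: load tile
  }

  __syncthreads();   // same role as ctx.teamSync(): tile fully written before reads

  int rowt = blockIdx.x * TILE_DIM + threadIdx.y;  // row of At (spans N_c)
  int colt = blockIdx.y * TILE_DIM + threadIdx.x;  // column of At (spans N_r)

  if (rowt < N_c && colt < N_r) {
    // phase 2: transposed store; threadIdx.x varies colt, so writes are coalesced
    At[rowt * N_r + colt] = tile[threadIdx.x][threadIdx.y];
  }
}
```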

@@ -346,8 +354,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);

using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct_unchecked>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct_unchecked>;

using hip_threads_y = RAJA::LoopPolicy<RAJA::hip_thread_y_direct>;
using hip_threads_x = RAJA::LoopPolicy<RAJA::hip_thread_x_direct>;
@@ -357,12 +365,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::launch<hip_launch_policy>
(RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
RAJA::Threads(c_block_sz, r_block_sz)),
RAJA::Threads(c_block_sz, r_block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

@@ -374,20 +384,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col, int tx) {

d_Atview(col, row) = Tile_Array[ty][tx];
d_Atview(row, col) = Tile_Array[tx][ty];
Review thread on this line:

Member: @MrBurmark, I switched it around so it's clear that the x and y threads have been transposed in shared memory. I'm not too sure how to express that in Kernel.

Member (author): Would it be more clear to call row and col here rowt and colt?

Member: Good idea!

Member: Using row_t and col_t was the source of the compilation error. I switched it back so the code compiles. Please feel free to undo my change and fix the compile differently.

});
});
});
});

});
});
});
});

});

CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
checkResult<int>(Atview, N_c, N_r);
// printResult<int>(Atview, N_c, N_r);
#endif
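Following up on the rename suggested in the review thread above: one illustrative way to make the transposed mapping explicit is to rename the write-phase lambda arguments so they clearly index the output matrix. This is a sketch only; the names rowt and colt are hypothetical here, and the merged code keeps row and col because the row_t/col_t variant did not compile in this PR.

```cpp
// Sketch of the HIP write-out phase with output-oriented names (hypothetical
// rename; policies, segments, views, and Tile_Array are the ones defined above).
RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int rowt, int ty) {
  RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int colt, int tx) {

    // rowt spans the rows of At (0..N_c), colt its columns (0..N_r);
    // thread x varies colt, so the global-memory writes are stride-1.
    d_Atview(rowt, colt) = Tile_Array[tx][ty];

  });
});
```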