19 changes: 7 additions & 12 deletions exercises/kernel-matrix-transpose-local-array_solution.cpp
@@ -399,8 +399,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// These loops iterate over the number of
// tiles needed to carry out the transpose
//
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_loop,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_loop,
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_direct_unchecked,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_direct_unchecked,
// This statement will initialize local array memory inside a
// kernel. The cpu_tile_mem policy specifies that memory should be
// allocated on the stack. The entries in the RAJA::ParamList
@@ -431,10 +431,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
RAJA::statement::Lambda<1>
>
>,
// Synchronize threads to ensure all reads
// from the local array are complete
RAJA::statement::CudaSyncThreads
>
>
>
>
@@ -494,8 +491,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
// These loops iterate over the number of
// tiles needed to carry out the transpose
//
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_loop,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_loop,
RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_y_direct_unchecked,
RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::hip_block_x_direct_unchecked,
// This statement will initialize local array memory inside a
// kernel. The cpu_tile_mem policy specifies that memory should be
// allocated on the stack. The entries in the RAJA::ParamList
@@ -526,10 +523,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::hip_thread_x_direct,
RAJA::statement::Lambda<1>
>
>,
// Synchronize threads to ensure all reads
// from the local array are complete
RAJA::statement::HipSyncThreads
>
>
>
>
@@ -556,6 +550,7 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
);

CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
checkResult<int>(Atview, N_c, N_r);
// printResult<int>(Atview, N_c, N_r);
#endif
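Editorial note on the two hunks above: the block-tiling policies change from `cuda/hip_block_*_loop` to `cuda/hip_block_*_direct_unchecked`, so each thread block now handles exactly one tile and the launch configuration is assumed to cover the tile space exactly. With the old `*_loop` policies a block could revisit the shared tile on a later tile iteration, which is why a trailing `CudaSyncThreads`/`HipSyncThreads` was needed after the read-out phase; with one tile per block, only the synchronization between the load and store phases remains. A condensed sketch of the resulting CUDA policy is below; the `InitLocalMem` memory policy and the `Param`/`ParamList` indices are assumptions made to keep the sketch self-contained, and the solution file itself is authoritative.

```cpp
// Condensed sketch of the updated CUDA kernel policy (not copied verbatim from
// the solution file). RAJA::cuda_shared_mem and the ParamList<2>/Param<0>/Param<1>
// indices are assumptions for illustration.
using transpose_policy_sketch = RAJA::KernelPolicy<
  RAJA::statement::CudaKernel<
    // One tile per (block_y, block_x): *_direct_unchecked assumes the launch
    // configuration covers the tile space exactly, with no per-block loop.
    RAJA::statement::Tile<1, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_y_direct_unchecked,
      RAJA::statement::Tile<0, RAJA::tile_fixed<TILE_DIM>, RAJA::cuda_block_x_direct_unchecked,
        RAJA::statement::InitLocalMem<RAJA::cuda_shared_mem, RAJA::ParamList<2>,

          // Phase 1: Lambda<0> loads a tile of A into the local array.
          RAJA::statement::ForICount<1, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct,
            RAJA::statement::ForICount<0, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
              RAJA::statement::Lambda<0>
            >
          >,

          // Sync so every load completes before any thread reads the tile.
          RAJA::statement::CudaSyncThreads,

          // Phase 2: Lambda<1> writes the tile out transposed. Each block
          // touches the tile exactly once, so no trailing sync is needed,
          // which is why the CudaSyncThreads after this nest was removed above.
          RAJA::statement::ForICount<0, RAJA::statement::Param<1>, RAJA::cuda_thread_y_direct,
            RAJA::statement::ForICount<1, RAJA::statement::Param<0>, RAJA::cuda_thread_x_direct,
              RAJA::statement::Lambda<1>
            >
          >
        >
      >
    >
  >
>;
```

The HIP hunk mirrors this structure with `hip_block_*_direct_unchecked` policies and `HipSyncThreads`.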
45 changes: 24 additions & 21 deletions exercises/launch-matrix-transpose-local-array.cpp
@@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
//
// (2) Inner loops to write array data into output array tile
//
// Note: loop order is swapped from above so that output matrix
// data access is stride-1.
//
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col = by * TILE_DIM + tx; // Matrix column index
int row = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row < N_c && col < N_r) {
Atview(row, col) = Tile[tx][ty];
}
}
}
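As a sanity check on the updated index arithmetic in this hunk, the following self-contained host-only sketch (plain C++, no RAJA; the sizes and row-major accessors are assumptions for illustration) applies the same tile read/write pattern and asserts that At(row, col) == A(col, row):

```cpp
#include <cassert>
#include <vector>

int main()
{
  constexpr int TILE_DIM = 16;
  const int N_r = 267, N_c = 251;   // deliberately not multiples of TILE_DIM

  std::vector<int> A(N_r * N_c), At(N_c * N_r);
  for (int i = 0; i < N_r * N_c; ++i) A[i] = i;

  // Row-major accessors standing in for the RAJA Views.
  auto Aview  = [&](int r, int c) -> int& { return A[r * N_c + c];  };  // N_r x N_c
  auto Atview = [&](int r, int c) -> int& { return At[r * N_r + c]; };  // N_c x N_r

  for (int by = 0; by < (N_r + TILE_DIM - 1) / TILE_DIM; ++by) {
    for (int bx = 0; bx < (N_c + TILE_DIM - 1) / TILE_DIM; ++bx) {

      int Tile[TILE_DIM][TILE_DIM];

      // (1) Read a tile of A into the local array.
      for (int ty = 0; ty < TILE_DIM; ++ty) {
        for (int tx = 0; tx < TILE_DIM; ++tx) {
          int row = by * TILE_DIM + ty;
          int col = bx * TILE_DIM + tx;
          if (row < N_r && col < N_c) Tile[ty][tx] = Aview(row, col);
        }
      }

      // (2) Write the tile out transposed, using the updated index pattern.
      for (int ty = 0; ty < TILE_DIM; ++ty) {
        for (int tx = 0; tx < TILE_DIM; ++tx) {
          int col = by * TILE_DIM + tx;   // column index of At (spans N_r)
          int row = bx * TILE_DIM + ty;   // row index of At (spans N_c)
          if (row < N_c && col < N_r) Atview(row, col) = Tile[tx][ty];
        }
      }
    }
  }

  for (int r = 0; r < N_c; ++r)
    for (int c = 0; c < N_r; ++c)
      assert(Atview(r, c) == Aview(c, r));

  return 0;
}
```

Note that the bounds check in the write phase mirrors the read phase exactly, so edge tiles never read tile entries that were left unwritten.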
@@ -195,10 +194,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
/// input matrix into the RAJA_TEAM_SHARED memory array
///

RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -244,18 +243,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {

Tile_Array[ty][tx] = Aview(row, col);
Tile_Array[ty][tx] = Aview(row, col);

});
});

RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -307,10 +306,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

Atview(col, row) = Tile_Array[ty][tx];
RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col, int tx) {

Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -379,10 +380,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col, int tx) {

d_Atview(col, row) = Tile_Array[ty][tx];
d_Atview(row, col) = Tile_Array[tx][ty];

});
});
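A note on the (index, count) lambda arguments used throughout these files: `RAJA::loop_icount` hands the lambda both the global index from the segment and the zero-based offset within the current tile, which is what lets one lambda address the global Views (row/col) and the local tile array (ty/tx) at the same time. A minimal sketch of the convention, reusing the names from the file above:

```cpp
// Sketch of the (global index, tile-local count) convention; loop_pol_1,
// TILE_DIM, N_r, and ctx are assumed to be the ones defined in the exercise.
RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
  [&] (RAJA::TypedRangeSegment<int> const &row_tile) {

    RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
      // row : global index in [0, N_r)       -> used with Aview / Atview
      // ty  : offset within this tile, in [0, TILE_DIM) -> used with Tile_Array
    });
});
```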
97 changes: 55 additions & 42 deletions exercises/launch-matrix-transpose-local-array_solution.cpp
@@ -145,18 +145,17 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
//
// (2) Inner loops to write array data into output array tile
//
// Note: loop order is swapped from above so that output matrix
// data access is stride-1.
//
for (int tx = 0; tx < TILE_DIM; ++tx) {
for (int ty = 0; ty < TILE_DIM; ++ty) {
for (int ty = 0; ty < TILE_DIM; ++ty) {
for (int tx = 0; tx < TILE_DIM; ++tx) {

int col = bx * TILE_DIM + tx; // Matrix column index
int row = by * TILE_DIM + ty; // Matrix row index
// Transpose tile offset
int col = by * TILE_DIM + tx; // Matrix column index
int row = bx * TILE_DIM + ty; // Matrix row index

// Bounds check
if (row < N_r && col < N_c) {
Atview(col, row) = Tile[ty][tx];
if (row < N_c && col < N_r) {
Atview(row, col) = Tile[tx][ty];
}
}
}
@@ -182,9 +181,12 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_1>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

Expand All @@ -196,10 +198,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_1>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -232,9 +234,11 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
RAJA::LaunchParams(), //LaunchParams may be empty when only running on the cpu
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<omp_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<loop_pol_2>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

@@ -246,10 +250,10 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<loop_pol_2>(ctx, row_tile, [&] (int col, int tx) {

Atview(col, row) = Tile_Array[ty][tx];
Atview(row, col) = Tile_Array[tx][ty];

});
});
@@ -274,8 +278,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);

using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using cuda_teams_y = RAJA::LoopPolicy<RAJA::cuda_block_y_direct_unchecked>;
using cuda_teams_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct_unchecked>;

using cuda_threads_y = RAJA::LoopPolicy<RAJA::cuda_thread_y_direct>;
using cuda_threads_x = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;
@@ -285,12 +289,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::launch<cuda_launch_policy>(
RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
RAJA::Threads(c_block_sz, r_block_sz)),
RAJA::Threads(c_block_sz, r_block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<cuda_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<cuda_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

@@ -302,16 +308,18 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<cuda_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<cuda_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

Atview(col, row) = Tile_Array[ty][tx];
RAJA::loop_icount<cuda_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<cuda_threads_x>(ctx, row_tile, [&] (int col, int tx) {

});
});
Atview(row, col) = Tile_Array[tx][ty];

});
});

});
});
});
});

});
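For readers more familiar with raw CUDA, the launch-based loops above follow the classic shared-memory tile transpose: load a tile, synchronize the team, then store it transposed with thread x walking the contiguous dimension of At so the global writes coalesce. A hand-written CUDA analogue is sketched below; it assumes blockDim is TILE_DIM x TILE_DIM, the grid covers the matrix, and both matrices are row-major int arrays as in this exercise. `ctx.teamSync()` in the RAJA::launch version plays the role of `__syncthreads()` here.

```cpp
// Hand-written CUDA analogue of the launch-based transpose (sketch only).
// A is N_r x N_c row-major, At is N_c x N_r row-major.
template <int TILE_DIM>
__global__ void transpose_local_tile(const int* A, int* At, int N_r, int N_c)
{
  __shared__ int tile[TILE_DIM][TILE_DIM];

  int col = blockIdx.x * TILE_DIM + threadIdx.x;   // column of A
  int row = blockIdx.y * TILE_DIM + threadIdx.y;   // row of A

  if (row < N_r && col < N_c) {
    tile[threadIdx.y][threadIdx.x] = A[row * N_c + col];   // phase 1: load tile
  }

  __syncthreads();   // same role as ctx.teamSync(): tile fully written before reads

  int rowt = blockIdx.x * TILE_DIM + threadIdx.y;  // row of At (spans N_c)
  int colt = blockIdx.y * TILE_DIM + threadIdx.x;  // column of At (spans N_r)

  if (rowt < N_c && colt < N_r) {
    // phase 2: transposed store; threadIdx.x varies colt, so writes are coalesced
    At[rowt * N_r + colt] = tile[threadIdx.x][threadIdx.y];
  }
}
```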

@@ -346,8 +354,8 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
const int n_blocks_c = RAJA_DIVIDE_CEILING_INT(N_c, c_block_sz);
const int n_blocks_r = RAJA_DIVIDE_CEILING_INT(N_r, r_block_sz);

using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct>;
using hip_teams_y = RAJA::LoopPolicy<RAJA::hip_block_y_direct_unchecked>;
using hip_teams_x = RAJA::LoopPolicy<RAJA::hip_block_x_direct_unchecked>;

using hip_threads_y = RAJA::LoopPolicy<RAJA::hip_thread_y_direct>;
using hip_threads_x = RAJA::LoopPolicy<RAJA::hip_thread_x_direct>;
@@ -357,12 +365,14 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))

RAJA::launch<hip_launch_policy>
(RAJA::LaunchParams(RAJA::Teams(n_blocks_c, n_blocks_r),
RAJA::Threads(c_block_sz, r_block_sz)),
RAJA::Threads(c_block_sz, r_block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r), [&] (RAJA::TypedRangeSegment<int> const &row_tile) {
RAJA::tile<hip_teams_y>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_r),
[&] (RAJA::TypedRangeSegment<int> const &row_tile) {

RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c), [&] (RAJA::TypedRangeSegment<int> const &col_tile) {
RAJA::tile<hip_teams_x>(ctx, TILE_DIM, RAJA::TypedRangeSegment<int>(0, N_c),
[&] (RAJA::TypedRangeSegment<int> const &col_tile) {

RAJA_TEAM_SHARED double Tile_Array[TILE_DIM][TILE_DIM];

@@ -374,20 +384,23 @@ int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
});
});

RAJA::loop_icount<hip_threads_x>(ctx, col_tile, [&] (int col, int tx) {
RAJA::loop_icount<hip_threads_y>(ctx, row_tile, [&] (int row, int ty) {
ctx.teamSync();

RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int row, int ty) {
RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int col, int tx) {

d_Atview(col, row) = Tile_Array[ty][tx];
d_Atview(row, col) = Tile_Array[tx][ty];
Review thread on this line:

Member: @MrBurmark, I switched it around so it's clear that the x and y threads have been transposed in shared memory. I'm not too sure how to express that in Kernel.

Member (author): Would it be more clear to call row and col here rowt and colt?

Member: Good idea!

Member: Using row_t and col_t was the source of the compilation error. I switched it back so the code compiles. Please feel free to undo my change and fix the compile differently.

});
});
});
});

});
});
});
});

});

CAMP_HIP_API_INVOKE_AND_CHECK(hipMemcpy, At, d_At, N_r * N_c * sizeof(int), hipMemcpyDeviceToHost);
CAMP_HIP_API_INVOKE_AND_CHECK(hipDeviceSynchronize);
checkResult<int>(Atview, N_c, N_r);
// printResult<int>(Atview, N_c, N_r);
#endif
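Following up on the rename suggested in the review thread above: one illustrative way to make the transposed mapping explicit is to rename the write-phase lambda arguments so they clearly index the output matrix. This is a sketch only; the names rowt and colt are hypothetical here, and the merged code keeps row and col because the row_t/col_t variant did not compile in this PR.

```cpp
// Sketch of the HIP write-out phase with output-oriented names (hypothetical
// rename; policies, segments, views, and Tile_Array are the ones defined above).
RAJA::loop_icount<hip_threads_y>(ctx, col_tile, [&] (int rowt, int ty) {
  RAJA::loop_icount<hip_threads_x>(ctx, row_tile, [&] (int colt, int tx) {

    // rowt spans the rows of At (0..N_c), colt its columns (0..N_r);
    // thread x varies colt, so the global-memory writes are stride-1.
    d_Atview(rowt, colt) = Tile_Array[tx][ty];

  });
});
```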