From 840e3e737647870741950a6fc7a719c679670c3d Mon Sep 17 00:00:00 2001
From: Sourabh Betigeri
Date: Thu, 11 Jul 2024 17:22:16 -0700
Subject: [PATCH] SWDEV-462007 - Fixes and strengthens multi-grid sync on
 mgpus

- Replaces the logic that relied on clock64() with an atomic counter that
  the last work-group in the grid and multi-grid groups waits on, while the
  non-last work-groups increment the counter to "done"
- Implements new logic to verify the multi-grid sync()

Change-Id: I7780d8124e5f144b124e5d191d0f412483a3b565
---
 .../hipCGMultiGridGroupType_old.cc | 222 ++++++------------
 1 file changed, 73 insertions(+), 149 deletions(-)

diff --git a/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc b/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc
index 33bf5b784..d8b37c546 100644
--- a/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc
+++ b/catch/unit/cooperativeGrps/hipCGMultiGridGroupType_old.cc
@@ -158,148 +158,58 @@ static __global__ void kernel_cg_multi_grid_group_type_via_public_api(
   }
 }
 
-static __global__ void test_kernel(unsigned int* atomic_val, unsigned int* global_array,
-                                   unsigned int* array, uint32_t loops) {
+__global__ void test_kernel(unsigned int* atomic_val, unsigned int* array, uint32_t loops,
+                            unsigned int* per_loop_atomic, unsigned int* grid_counters,
+                            unsigned int* recorded_values, unsigned int* grid_values) {
   cg::grid_group grid = cg::this_grid();
   cg::multi_grid_group mgrid = cg::this_multi_grid();
   unsigned rank = grid.thread_rank();
-  unsigned global_rank = mgrid.thread_rank();
-
+  int grid_id = mgrid.grid_rank();
+  int num_grids = mgrid.num_grids();
+  int blocks_seen = 0;
+  int grid_blocks = gridDim.x * gridDim.y * gridDim.z;
   int offset = blockIdx.x;
   for (int i = 0; i < loops; i++) {
     // Make the last thread run way behind everyone else.
-    // If the grid barrier below fails, then the other threads may hit the
-    // atomicInc instruction many times before the last thread ever gets
-    // to it.
-    // As such, without the barrier, the last array entry will eventually
-    // contain a very large value, defined by however many times the other
-    // wavefronts make it through this loop.
-    // If the barrier works, then it will likely contain some number
-    // near "total number of blocks". It will be the last wavefront to
-    // reach the atomicInc, but everyone will have only hit the atomic once.
+    // If the grid sync below fails, then the other threads may hit the
+    // atomicInc instruction many times before the last thread ever gets to it.
+    // If the sync works, then it will likely contain "total number of blocks" * iter.
     if (rank == (grid.size() - 1)) {
-      long long time_diff = 0;
-      long long last_clock = clock64();
-      do {
-        long long cur_clock = clock64();
-        if (cur_clock > last_clock) {
-          time_diff += (cur_clock - last_clock);
-        }
-        // If it rolls over, we don't know how much to add to catch up.
-        // So just ignore those slipped cycles.
-        last_clock = cur_clock;
-      } while (time_diff < 1000000);
-    }
-    if (threadIdx.x == 0) {
-      array[offset] = atomicInc(atomic_val, UINT_MAX);
-    }
-    grid.sync();
+      // The last wavefront should spin on this loop's atomic value
+      // until all of the other wavefronts have incremented the
+      // per-loop atomic and hit the grid.sync()
+#if HT_AMD
+      while (__hip_atomic_load(&per_loop_atomic[i], __ATOMIC_RELAXED,
+                               __HIP_MEMORY_SCOPE_AGENT) < (grid_blocks - 1)) {
+        __builtin_amdgcn_s_sleep(127);
+      }
 
-    // Make the last thread in the entire multi-grid run way behind
-    // everyone else.
- // If the mgrid barrier below fails, then the two global_array entries - // will end up being out of sync, because the intermingling of adds - // and multiplies will not be aligned between to the two GPUs. - if (global_rank == (mgrid.size() - 1)) { - long long time_diff = 0; - long long last_clock = clock64(); - do { - long long cur_clock = clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < 1000000); - } - // During even iterations, add into your own array entry - // During odd iterations, add into your partner's array entry - unsigned grid_rank = mgrid.grid_rank(); - unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); - if (rank == (grid.size() - 1)) { - if (i % 2 == 0) { - global_array[grid_rank] += 2; - } else { - global_array[inter_gpu_offset] *= 2; + // Give the other waves time to maybe go around the loop again + // if the barrier has failed + __builtin_amdgcn_s_sleep(127); +#else // CUDA does not seem to need an ordered atomic load + while(per_loop_atomic[i] < (grid_blocks - 1)) { } +#endif } - mgrid.sync(); - offset += gridDim.x; - } -} - -__global__ void test_kernel_gfx11(unsigned int* atomic_val, unsigned int* global_array, - unsigned int* array, uint32_t loops) { -#if HT_AMD - cg::grid_group grid = cg::this_grid(); - cg::multi_grid_group mgrid = cg::this_multi_grid(); - unsigned rank = grid.thread_rank(); - unsigned global_rank = mgrid.thread_rank(); - int offset = blockIdx.x; - for (int i = 0; i < loops; i++) { - // Make the last thread run way behind everyone else. - // If the grid barrier below fails, then the other threads may hit the - // atomicInc instruction many times before the last thread ever gets - // to it. - // As such, without the barrier, the last array entry will eventually - // contain a very large value, defined by however many times the other - // wavefronts make it through this loop. - // If the barrier works, then it will likely contain some number - // near "total number of blocks". It will be the last wavefront to - // reach the atomicInc, but everyone will have only hit the atomic once. - if (rank == (grid.size() - 1)) { - long long time_diff = 0; - long long last_clock = wall_clock64(); - do { - long long cur_clock = wall_clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. - last_clock = cur_clock; - } while (time_diff < 1000000); - } if (threadIdx.x == 0) { - array[offset] = atomicInc(atomic_val, UINT_MAX); + atomicInc(&per_loop_atomic[i], UINT_MAX); + array[offset + blocks_seen] = atomicInc(atomic_val, UINT_MAX); } grid.sync(); + blocks_seen += grid_blocks; - // Make the last thread in the entire multi-grid run way behind - // everyone else. - // If the mgrid barrier below fails, then the two global_array entries - // will end up being out of sync, because the intermingling of adds - // and multiplies will not be aligned between to the two GPUs. - if (global_rank == (mgrid.size() - 1)) { - long long time_diff = 0; - long long last_clock = wall_clock64(); - do { - long long cur_clock = wall_clock64(); - if (cur_clock > last_clock) { - time_diff += (cur_clock - last_clock); - } - // If it rolls over, we don't know how much to add to catch up. - // So just ignore those slipped cycles. 
- last_clock = cur_clock; - } while (time_diff < 1000000); - } - // During even iterations, add into your own array entry - // During odd iterations, add into your partner's array entry - unsigned grid_rank = mgrid.grid_rank(); - unsigned inter_gpu_offset = (grid_rank + i) % mgrid.num_grids(); + // Each grid updates its own counter if (rank == (grid.size() - 1)) { - if (i % mgrid.num_grids() == 0) { - global_array[grid_rank] += 2; - } else { - global_array[inter_gpu_offset] *= 2; - } + grid_values[grid_id] = atomicAdd(&grid_counters[grid_id], grid_id + 1); } mgrid.sync(); - offset += gridDim.x; + + // After mgrid sync, read the next grid's counter + recorded_values[(grid_id * loops + i)] = grid_values[(grid_id + 1) % num_grids]; + mgrid.sync(); } -#endif } static void verify_barrier_buffer(unsigned int loops, unsigned int warps, unsigned int* host_buffer, @@ -313,17 +223,21 @@ static void verify_barrier_buffer(unsigned int loops, unsigned int warps, unsign } } -static void verify_multi_gpu_buffer(unsigned int loops, unsigned int array_val) { - unsigned int desired_val = 0; - for (int i = 0; i < loops; i++) { - if (i % 2 == 0) { - desired_val += 2; - } else { - desired_val *= 2; +// Function to verify recorded readings +static void verify_recorded_values(unsigned int* recorded_values, uint32_t loops, int num_devices) { + for (uint32_t i = 0; i < loops; i++) { + for (int grid_id = 0; grid_id < num_devices; grid_id++) { + // Determine the expected value from the next grid's counter + int next_grid_id = (grid_id + 1) % num_devices; + // The expected value should be the sum of the increments from previous loops + unsigned int expected_value = i * (next_grid_id + 1); + // Check the recorded value + unsigned int recorded_value = recorded_values[grid_id * loops + i]; + REQUIRE(recorded_value == expected_value); + INFO("Mismatch at loop " << i << " for grid " << grid_id << ": expected " + << expected_value << ", got " << recorded_value); } } - - REQUIRE(array_val == desired_val); } template @@ -554,9 +468,8 @@ TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") { int max_blocks_per_sm = INT_MAX; for (int i = 0; i < num_devices; i++) { HIP_CHECK(hipSetDevice(i)); - auto test_kernel_used = IsGfx11() ? 
test_kernel_gfx11 : test_kernel;
     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &max_blocks_per_sm_arr[i], test_kernel_used, num_threads_in_block, 0));
+        &max_blocks_per_sm_arr[i], test_kernel, num_threads_in_block, 0));
     if (max_blocks_per_sm_arr[i] < max_blocks_per_sm) {
       max_blocks_per_sm = max_blocks_per_sm_arr[i];
     }
   }
@@ -573,6 +486,16 @@ TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") {
   std::vector<unsigned int*> kernel_buffer(num_devices);
   std::vector<unsigned int*> kernel_atomic(num_devices);
   std::vector<hipStream_t> streams(num_devices);
+  std::vector<unsigned int*> per_loop_atomic(num_devices);
+  // Allocate and initialize grid-specific counters and values using hipHostMalloc
+  unsigned int* grid_counters, *recorded_values, *grid_values;
+  HIP_CHECK(hipHostMalloc(&grid_counters, sizeof(unsigned int) * num_devices));
+  HIP_CHECK(hipHostMalloc(&recorded_values, sizeof(unsigned int) * num_devices * loops));
+  HIP_CHECK(hipHostMalloc(&grid_values, sizeof(unsigned int) * num_devices));
+  HIP_CHECK(hipMemset(grid_counters, 0, num_devices * sizeof(unsigned int)));
+  HIP_CHECK(hipMemset(recorded_values, 0, num_devices * loops * sizeof(unsigned int)));
+  HIP_CHECK(hipMemset(grid_values, 0, num_devices * sizeof(unsigned int)));
+
   for (int i = 0; i < num_devices; i++) {
     host_buffer[i] =
         reinterpret_cast<unsigned int*>(calloc(total_buffer_len, sizeof(unsigned int)));
@@ -582,28 +505,27 @@ TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") {
                          hipMemcpyHostToDevice));
     HIP_CHECK(hipMalloc(&kernel_atomic[i], sizeof(unsigned int)));
     HIP_CHECK(hipMemset(kernel_atomic[i], 0, sizeof(unsigned int)));
+    HIP_CHECK(hipMalloc(&per_loop_atomic[i], loops * sizeof(unsigned int)));
+    HIP_CHECK(hipMemset(per_loop_atomic[i], 0, loops * sizeof(unsigned int)));
     HIP_CHECK(hipStreamCreate(&streams[i]));
   }
 
-  // Single kernel atomic shared between both devices; put it on the host
-  unsigned int* global_array;
-  HIP_CHECK(hipHostMalloc(&global_array, sizeof(unsigned int) * num_devices));
-  HIP_CHECK(hipMemset(global_array, 0, num_devices * sizeof(unsigned int)));
-
   // Launch the kernels
   INFO("Launching a cooperative kernel with " << warps << " warps in " << requested_blocks
        << " thread blocks");
-  std::vector<std::vector<void*>> dev_params(num_devices, std::vector<void*>(4, nullptr));
+  std::vector<std::vector<void*>> dev_params(num_devices, std::vector<void*>(7, nullptr));
   std::vector<hipLaunchParams> md_params(num_devices);
 
   for (int i = 0; i < num_devices; i++) {
     HIP_CHECK(hipSetDevice(i));
-    auto test_kernel_used = IsGfx11() ? test_kernel_gfx11 : test_kernel;
     dev_params[i][0] = reinterpret_cast<void*>(&kernel_atomic[i]);
-    dev_params[i][1] = reinterpret_cast<void*>(&global_array);
-    dev_params[i][2] = reinterpret_cast<void*>(&kernel_buffer[i]);
-    dev_params[i][3] = reinterpret_cast<void*>(&loops);
-    md_params[i].func = reinterpret_cast<void*>(test_kernel_used);
+    dev_params[i][1] = reinterpret_cast<void*>(&kernel_buffer[i]);
+    dev_params[i][2] = reinterpret_cast<void*>(&loops);
+    dev_params[i][3] = reinterpret_cast<void*>(&per_loop_atomic[i]);
+    dev_params[i][4] = reinterpret_cast<void*>(&grid_counters);
+    dev_params[i][5] = reinterpret_cast<void*>(&recorded_values);
+    dev_params[i][6] = reinterpret_cast<void*>(&grid_values);
+    md_params[i].func = reinterpret_cast<void*>(test_kernel);
     md_params[i].gridDim = requested_blocks;
     md_params[i].blockDim = num_threads_in_block;
     md_params[i].sharedMem = 0;
@@ -624,14 +546,16 @@ TEST_CASE("Unit_hipCGMultiGridGroupType_Barrier") {
     verify_barrier_buffer(loops, requested_blocks, host_buffer[dev], num_devices);
   }
 
-  for (int dev = 0; dev < num_devices; dev++) {
-    verify_multi_gpu_buffer(loops, global_array[dev]);
-  }
+  // Verify the recorded values
+  verify_recorded_values(recorded_values, loops, num_devices);
 
-  HIP_CHECK(hipHostFree(global_array));
+  HIP_CHECK(hipHostFree(grid_counters));
+  HIP_CHECK(hipHostFree(recorded_values));
+  HIP_CHECK(hipHostFree(grid_values));
   for (int k = 0; k < num_devices; ++k) {
     HIP_CHECK(hipFree(kernel_buffer[k]));
     HIP_CHECK(hipFree(kernel_atomic[k]));
+    HIP_CHECK(hipFree(per_loop_atomic[k]));
     HIP_CHECK(hipStreamDestroy(streams[k]));
     free(host_buffer[k]);
   }
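
For reference, the arithmetic that verify_recorded_values() relies on can be checked on the host alone. The sketch below is not part of the patch; it is a minimal host-side model, assuming hypothetical sizes (num_grids = 2, loops = 4), of the counter protocol the new kernel uses: on loop i the last thread of grid g publishes the old value of grid_counters[g] and then adds (g + 1) to it, and after mgrid.sync() each grid records its neighbour's published value, so the recorded value must equal i * (next_grid_id + 1).

// Host-side model (illustrative only, not part of the patch) of the counter
// protocol used by the new kernel, with assumed sizes.
#include <cassert>
#include <vector>

int main() {
  const int num_grids = 2;   // assumed number of devices/grids
  const unsigned loops = 4;  // assumed iteration count
  std::vector<unsigned> grid_counters(num_grids, 0);
  std::vector<unsigned> grid_values(num_grids, 0);
  std::vector<unsigned> recorded(num_grids * loops, 0);

  for (unsigned i = 0; i < loops; ++i) {
    // Last thread of each grid: publish the old counter value, then bump it
    // by (grid_id + 1), mirroring the atomicAdd return value in the kernel.
    for (int g = 0; g < num_grids; ++g) {
      grid_values[g] = grid_counters[g];
      grid_counters[g] += g + 1;
    }
    // mgrid.sync(): every grid now observes the neighbours' published values.
    for (int g = 0; g < num_grids; ++g) {
      recorded[g * loops + i] = grid_values[(g + 1) % num_grids];
    }
  }

  // The invariant verify_recorded_values() checks after the launch:
  // by loop i, grid g's counter has received i increments of (g + 1).
  for (unsigned i = 0; i < loops; ++i) {
    for (int g = 0; g < num_grids; ++g) {
      const int next = (g + 1) % num_grids;
      assert(recorded[g * loops + i] == i * (next + 1u));
    }
  }
  return 0;
}

If the multi-grid sync were broken, a grid could read its neighbour's counter before or after the intended iteration, the i * (next_grid_id + 1) relation would no longer hold, and the new verification would flag it.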