From 30d5d95c2c00688c5c749aea68d75c3c7f7c84dd Mon Sep 17 00:00:00 2001 From: vinay birur Date: Thu, 9 Mar 2023 11:49:57 +0530 Subject: [PATCH] SWDEV-384938 - [catch2][dtest] Warp shuffle function support for half datatypes Change-Id: I8fb3f26f51894eaea7824111e3f69deb5da3dde3 --- catch/include/hip_test_defgroups.hh | 7 + catch/unit/kernel/CMakeLists.txt | 9 + catch/unit/kernel/hipShflTests.cc | 209 ++++++++++++++++++ catch/unit/kernel/hipShflUpDownTest.cc | 294 +++++++++++++++++++++++++ 4 files changed, 519 insertions(+) create mode 100644 catch/unit/kernel/hipShflTests.cc create mode 100644 catch/unit/kernel/hipShflUpDownTest.cc diff --git a/catch/include/hip_test_defgroups.hh b/catch/include/hip_test_defgroups.hh index 790d360f2..77e389b5d 100644 --- a/catch/include/hip_test_defgroups.hh +++ b/catch/include/hip_test_defgroups.hh @@ -35,3 +35,10 @@ THE SOFTWARE. * This section describes the graph management types & functions of HIP runtime API. * @} */ + +/** + * @defgroup ShflTest warp shuffle function Management + * @{ + * This section describes the warp shuffle types & functions of HIP runtime API. + * @} + */ diff --git a/catch/unit/kernel/CMakeLists.txt b/catch/unit/kernel/CMakeLists.txt index 523fa7b86..22fef5b29 100644 --- a/catch/unit/kernel/CMakeLists.txt +++ b/catch/unit/kernel/CMakeLists.txt @@ -22,8 +22,17 @@ set(TEST_SRC hipMemFaultStackAllocation.cc hipLaunchBounds.cc + hipShflTests.cc ) +# only for AMD +if(HIP_PLATFORM MATCHES "amd") + set(AMD_SRC + hipShflUpDownTest.cc + ) + set(TEST_SRC ${TEST_SRC} ${AMD_SRC}) +endif() + hip_add_exe_to_target(NAME KernelTest TEST_SRC ${TEST_SRC} TEST_TARGET_NAME build_tests) diff --git a/catch/unit/kernel/hipShflTests.cc b/catch/unit/kernel/hipShflTests.cc new file mode 100644 index 000000000..711a8d174 --- /dev/null +++ b/catch/unit/kernel/hipShflTests.cc @@ -0,0 +1,209 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include + +#define WIDTH 4 + +#define NUM (WIDTH * WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +template +__global__ void matrixTranspose(T* out, T* in, const int width) { + int x = blockDim.x * blockIdx.x + threadIdx.x; + T val = in[x]; + for (int i = 0; i < width; i++) { + for (int j = 0; j < width; j++) + out[i * width + j] = __shfl(val, j * width + i); + } +} + +// CPU implementation of matrix transpose +template +void matrixTransposeCPUReference(T* output, + T* input, const unsigned int width) { + for (unsigned int j = 0; j < width; j++) { + for (unsigned int i = 0; i < width; i++) { + output[i * width + j] = input[j * width + i]; + } + } +} + +static void getFactor(int* fact) { *fact = 101; } +static void getFactor(unsigned int* fact) { + *fact = static_cast(INT32_MAX)+1; +} +static void getFactor(float* fact) { *fact = 2.5; } +static void getFactor(__half* fact) { *fact = 2.5; } +static void getFactor(double* fact) { *fact = 2.5; } +static void getFactor(int64_t* fact) { *fact = 303; } +static void getFactor(uint64_t* fact) { + *fact = static_cast(__LONG_LONG_MAX__)+1; +} + +template +int compare(T* TransposeMatrix, T* cpuTransposeMatrix) { + int errors = 0; + for (int i = 0; i < NUM; i++) { + if (TransposeMatrix[i] != cpuTransposeMatrix[i]) { + errors++; + } + } + return errors; +} + +template <> +int compare<__half>(__half* TransposeMatrix, __half* cpuTransposeMatrix) { + int errors = 0; + for (int i = 0; i < NUM; i++) { + if (__half2float(TransposeMatrix[i]) != __half2float(cpuTransposeMatrix[i])) { // NOLINT + errors++; + } + } + return errors; +} + +template +void init(T* Matrix) { + // initialize the input data + T factor; + getFactor(&factor); + for (int i = 0; i < NUM; i++) { + Matrix[i] = (T)i + factor; + } +} + +template <> +void init(__half* Matrix) { + // initialize the input data + __half factor; + getFactor(&factor); + for (int i = 0; i < NUM; i++) { + Matrix[i] = i + __half2float(factor); + } +} + +template +static void runTest() { + T* Matrix; + T* TransposeMatrix; + T* cpuTransposeMatrix; + + T* gpuMatrix; + T* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + HIP_CHECK(hipGetDeviceProperties(&devProp, 0)); + + int errors = 0; + + Matrix = reinterpret_cast(malloc(NUM * sizeof(T))); + TransposeMatrix = reinterpret_cast(malloc(NUM * sizeof(T))); + cpuTransposeMatrix = reinterpret_cast(malloc(NUM * sizeof(T))); + + init(Matrix); + + // allocate the memory on the device side + HIP_CHECK(hipMalloc(reinterpret_cast(&gpuMatrix), NUM * sizeof(T))); + HIP_CHECK(hipMalloc(reinterpret_cast(&gpuTransposeMatrix), + NUM * sizeof(T))); + + // Memory transfer from host to device + HIP_CHECK(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T), + hipMemcpyHostToDevice)); + + // Lauching kernel from host + hipLaunchKernelGGL(matrixTranspose, dim3(1), + dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0, + gpuTransposeMatrix, gpuMatrix, WIDTH); + + // Memory transfer from device to host + HIP_CHECK(hipMemcpy(TransposeMatrix, gpuTransposeMatrix, + NUM * sizeof(T), hipMemcpyDeviceToHost)); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + REQUIRE(errors == compare(TransposeMatrix, cpuTransposeMatrix)); + double eps = 1.0E-6; + // free the resources on device side + HIP_CHECK(hipFree(gpuMatrix)); + HIP_CHECK(hipFree(gpuTransposeMatrix)); + + // free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); +} + +/** + * @addtogroup __shfl __shfl + * @{ + * @ingroup ShflTest + * `T __shfl(T var, int srcLane, int width=warpSize)` - + * Contains wrap __shfl functions. + * @} + */ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflTests.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + */ + +TEST_CASE("Unit_hipShflTests") { + SECTION("run test for int") { + runTest(); + } + SECTION("run test for float") { + runTest(); + } + SECTION("run test for double") { + runTest(); + } + // Test added to support half datatype. + SECTION("run test for __half") { + runTest<__half>(); + } + SECTION("run test for int64_t") { + runTest(); + } + SECTION("run test for unsigned int") { + runTest(); + } + SECTION("run test for uint64_t") { + runTest(); + } +} diff --git a/catch/unit/kernel/hipShflUpDownTest.cc b/catch/unit/kernel/hipShflUpDownTest.cc new file mode 100644 index 000000000..ab80dd51b --- /dev/null +++ b/catch/unit/kernel/hipShflUpDownTest.cc @@ -0,0 +1,294 @@ +/* +Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#include +#include + +const int size = 32; + +template +__global__ void shflDownSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_down(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflUpSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size / 2; i > 0; i /= 2) { + val += __shfl_up(val, i, size); + } + a[threadIdx.x] = val; +} + +template +__global__ void shflXorSum(T* a, int size) { + T val = a[threadIdx.x]; + for (int i = size/2; i > 0; i /= 2) { + val += __shfl_xor(val, i, size); + } + a[threadIdx.x] = val; +} + +static void getFactor(int* fact) { *fact = 101; } +static void getFactor(unsigned int* fact) { + *fact = static_cast(INT32_MAX)+1; +} +static void getFactor(float* fact) { *fact = 2.5; } +static void getFactor(double* fact) { *fact = 2.5; } +static void getFactor(__half* fact) { *fact = 2.5; } +static void getFactor(int64_t* fact) { *fact = 303; } +static void getFactor(uint64_t* fact) { + *fact = static_cast(__LONG_LONG_MAX__)+1; +} + +template T sum(T* a) { + T cpuSum = 0; + T factor; + getFactor(&factor); + for (int i = 0; i < size; i++) { + a[i] = i + factor; + cpuSum += a[i]; + } + return cpuSum; +} + +template <> __half sum(__half* a) { + __half cpuSum = 0; + __half factor; + getFactor(&factor); + for (int i = 0; i < size; i++) { + a[i] = i + __half2float(factor); + cpuSum = __half2float(cpuSum) + __half2float(a[i]); + } + return cpuSum; +} + +template bool compare(T gpuSum, T cpuSum) { + if (gpuSum != cpuSum) { + return true; + } + return false; +} + +template <> bool compare(__half gpuSum, __half cpuSum) { + if (__half2float(gpuSum) != __half2float(cpuSum)) { + return true; + } + return false; +} + +template +static void runTestShflUp() { + const int size = 32; + T a[size]; + T cpuSum = sum(a); + T* d_a; + HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size)); + HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflUpSum, 1, size, 0, 0, d_a, size); + HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault)); + REQUIRE((compare(a[size - 1], cpuSum)) == 0); + HIP_CHECK(hipFree(d_a)); +} + +template +static void runTestShflDown() { + T a[size]; + T cpuSum = sum(a); + T* d_a; + HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size)); + HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflDownSum, 1, size, 0, 0, d_a, size); + HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault)); + REQUIRE((compare(a[0], cpuSum)) == 0); + HIP_CHECK(hipFree(d_a)); +} + +template +static void runTestShflXor() { + T a[size]; + T cpuSum = sum(a); + T* d_a; + HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size)); + HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault)); + hipLaunchKernelGGL(shflXorSum, 1, size, 0, 0, d_a, size); + HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault)); + REQUIRE((compare(a[0], cpuSum)) == 0); + HIP_CHECK(hipFree(d_a)); +} + +/** +* @addtogroup __shfl __shfl +* @{ +* @ingroup ShflTest +* `T __shfl_up(T var, unsigned int lane_delta, int width = warpSize)` - +* Contains warp __shfl_up function +*/ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_up warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflUpDownTest.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + * - Gaurding this test against cuda with refernce to mentioned + * ticket SWDEV-379177 + */ + +TEST_CASE("Unit_runTestShfl_up") { + SECTION("runTestShflUp for int") { + runTestShflUp(); + } + SECTION("runTestShflUp for float") { + runTestShflUp(); + } + SECTION("runTestShflUp for double") { + runTestShflUp(); + } + SECTION("runTestShflUp for __half") { + runTestShflUp<__half>(); + } + SECTION("runTestShflUp for int64_t") { + runTestShflUp(); + } + SECTION("runTestShflUp for unsigned int") { + runTestShflUp(); + } + SECTION("runTestShflUp for uint64_t") { + runTestShflUp(); + } +} +/** + * End doxygen group __shfl. + * @} + */ + +/** +* @addtogroup __shfl __shfl +* @{ +* @ingroup ShflTest +* `T __shfl_down(T var, unsigned int lane_delta, int width = warpSize)` - +* Contains warp __shfl_down function +*/ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_down warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflUpDownTest.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + * - Gaurding this test against cuda with refernce to mentioned + * ticket SWDEV-379177 + */ + +TEST_CASE("Unit_runTestShfl_Down") { + SECTION("runTestShflDown for int") { + runTestShflDown(); + } + SECTION("runTestShflDown for float") { + runTestShflDown(); + } + SECTION("runTestShflDown for double") { + runTestShflDown(); + } + SECTION("runTestShflDown for __half") { + runTestShflDown<__half>(); + } + SECTION("runTestShflDown for int64_t") { + runTestShflDown(); + } + SECTION("runTestShflDown for unsigned int") { + runTestShflDown(); + } + SECTION("runTestShflDown for uint64_t") { + runTestShflDown(); + } +} +/** + * End doxygen group __shfl. + * @} + */ + +/** +* @addtogroup __shfl __shfl +* @{ +* @ingroup ShflTest +* `T __shfl_xor(T var, int laneMask, int width=warpSize)` - +* Contains warp __shfl_xor function +*/ + +/** + * Test Description + * ------------------------ + * - Test case to verify __shfl_xor warp functions for different datatypes. + + * Test source + * ------------------------ + * - catch/unit/kernel/hipShflUpDownTest.cc + * Test requirements + * ------------------------ + * - HIP_VERSION >= 5.6 + * - Gaurding this test against cuda with refernce to mentioned + * ticket SWDEV-379177 + */ + +TEST_CASE("Unit_runTestShfl_Xor") { + SECTION("runTestShflXor for int") { + runTestShflXor(); + } + SECTION("runTestShflXor for float") { + runTestShflXor(); + } + SECTION("runTestShflXor for double") { + runTestShflXor(); + } + SECTION("runTestShflXor for __half") { + runTestShflXor<__half>(); + } + SECTION("runTestShflXor for int64_t") { + runTestShflXor(); + } + SECTION("runTestShflXor for unsigned int") { + runTestShflXor(); + } + SECTION("runTestShflXor for uint64_t") { + runTestShflXor(); + } +} +/** + * End doxygen group __shfl. + * @} + */