From 30d5d95c2c00688c5c749aea68d75c3c7f7c84dd Mon Sep 17 00:00:00 2001
From: vinay birur <vinaybirur.venkateshamurthy@amd.com>
Date: Thu, 9 Mar 2023 11:49:57 +0530
Subject: [PATCH] SWDEV-384938 - [catch2][dtest] Warp shuffle function support
 for half datatypes

Change-Id: I8fb3f26f51894eaea7824111e3f69deb5da3dde3
---
 catch/include/hip_test_defgroups.hh    |   7 +
 catch/unit/kernel/CMakeLists.txt       |   9 +
 catch/unit/kernel/hipShflTests.cc      | 209 ++++++++++++++++++
 catch/unit/kernel/hipShflUpDownTest.cc | 294 +++++++++++++++++++++++++
 4 files changed, 519 insertions(+)
 create mode 100644 catch/unit/kernel/hipShflTests.cc
 create mode 100644 catch/unit/kernel/hipShflUpDownTest.cc
diff --git a/catch/include/hip_test_defgroups.hh b/catch/include/hip_test_defgroups.hh
index 790d360f2..77e389b5d 100644
--- a/catch/include/hip_test_defgroups.hh
+++ b/catch/include/hip_test_defgroups.hh
@@ -35,3 +35,10 @@ THE SOFTWARE.
  * This section describes the graph management types & functions of HIP runtime API.
  * @}
  */
+
+/**
+ * @defgroup ShflTest warp shuffle function Management
+ * @{
+ * This section describes the warp shuffle types & functions of HIP runtime API.
+ * @}
+ */
diff --git a/catch/unit/kernel/CMakeLists.txt b/catch/unit/kernel/CMakeLists.txt
index 523fa7b86..22fef5b29 100644
--- a/catch/unit/kernel/CMakeLists.txt
+++ b/catch/unit/kernel/CMakeLists.txt
@@ -22,8 +22,17 @@
 set(TEST_SRC
     hipMemFaultStackAllocation.cc
     hipLaunchBounds.cc
+    hipShflTests.cc
 )
 
+# only for AMD
+if(HIP_PLATFORM MATCHES "amd")
+  set(AMD_SRC
+      hipShflUpDownTest.cc
+  )
+  set(TEST_SRC ${TEST_SRC} ${AMD_SRC})
+endif()
+
 hip_add_exe_to_target(NAME KernelTest
                       TEST_SRC ${TEST_SRC}
                       TEST_TARGET_NAME build_tests)
diff --git a/catch/unit/kernel/hipShflTests.cc b/catch/unit/kernel/hipShflTests.cc
new file mode 100644
index 000000000..711a8d174
--- /dev/null
+++ b/catch/unit/kernel/hipShflTests.cc
@@ -0,0 +1,209 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/hip_fp16.h>
+#include <hip_test_defgroups.hh>
+
+#define WIDTH 4
+
+#define NUM (WIDTH * WIDTH)
+
+#define THREADS_PER_BLOCK_X 4
+#define THREADS_PER_BLOCK_Y 4
+#define THREADS_PER_BLOCK_Z 1
+
+// Device (Kernel) function, it must be void
+template <typename T>
+__global__ void matrixTranspose(T* out, T* in, const int width) {
+  int x = blockDim.x * blockIdx.x + threadIdx.x;
+  T val = in[x];
+  for (int i = 0; i < width; i++) {
+    for (int j = 0; j < width; j++)
+      out[i * width + j] = __shfl(val, j * width + i);
+  }
+}
+
+// CPU implementation of matrix transpose
+template <typename T>
+void matrixTransposeCPUReference(T* output,
+                                 T* input, const unsigned int width) {
+  for (unsigned int j = 0; j < width; j++) {
+    for (unsigned int i = 0; i < width; i++) {
+      output[i * width + j] = input[j * width + i];
+    }
+  }
+}
+
+static void getFactor(int* fact) { *fact = 101; }
+static void getFactor(unsigned int* fact) {
+  *fact = static_cast<unsigned int>(INT32_MAX)+1;
+}
+static void getFactor(float* fact) { *fact = 2.5; }
+static void getFactor(__half* fact) { *fact = 2.5; }
+static void getFactor(double* fact) { *fact = 2.5; }
+static void getFactor(int64_t* fact) { *fact = 303; }
+static void getFactor(uint64_t* fact) {
+  *fact = static_cast<uint64_t>(__LONG_LONG_MAX__)+1;
+}
+
+template <typename T>
+int compare(T* TransposeMatrix, T* cpuTransposeMatrix) {
+  int errors = 0;
+  for (int i = 0; i < NUM; i++) {
+      if (TransposeMatrix[i] != cpuTransposeMatrix[i]) {
+          errors++;
+      }
+  }
+  return errors;
+}
+
+template <>
+int compare<__half>(__half* TransposeMatrix, __half* cpuTransposeMatrix) {
+  int errors = 0;
+  for (int i = 0; i < NUM; i++) {
+    if (__half2float(TransposeMatrix[i]) != __half2float(cpuTransposeMatrix[i])) { // NOLINT
+      errors++;
+    }
+  }
+  return errors;
+}
+
+template <typename T>
+void init(T* Matrix) {
+  // initialize the input data
+  T factor;
+  getFactor(&factor);
+  for (int i = 0; i < NUM; i++) {
+      Matrix[i] = (T)i + factor;
+  }
+}
+
+template <>
+void init(__half* Matrix) {
+  // initialize the input data
+  __half factor;
+  getFactor(&factor);
+  for (int i = 0; i < NUM; i++) {
+      Matrix[i] = i + __half2float(factor);
+  }
+}
+
+template<typename T>
+static void runTest() {
+  T* Matrix;
+  T* TransposeMatrix;
+  T* cpuTransposeMatrix;
+
+  T* gpuMatrix;
+  T* gpuTransposeMatrix;
+
+  hipDeviceProp_t devProp;
+  HIP_CHECK(hipGetDeviceProperties(&devProp, 0));
+
+  int errors = 0;
+
+  Matrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
+  TransposeMatrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
+  cpuTransposeMatrix = reinterpret_cast<T*>(malloc(NUM * sizeof(T)));
+
+  init(Matrix);
+
+  // allocate the memory on the device side
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&gpuMatrix), NUM * sizeof(T)));
+  HIP_CHECK(hipMalloc(reinterpret_cast<void**>(&gpuTransposeMatrix),
+                       NUM * sizeof(T)));
+
+  // Memory transfer from host to device
+  HIP_CHECK(hipMemcpy(gpuMatrix, Matrix, NUM * sizeof(T),
+                      hipMemcpyHostToDevice));
+
+  // Lauching kernel from host
+  hipLaunchKernelGGL(matrixTranspose<T>, dim3(1),
+                    dim3(THREADS_PER_BLOCK_X * THREADS_PER_BLOCK_Y), 0, 0,
+                    gpuTransposeMatrix, gpuMatrix, WIDTH);
+
+  // Memory transfer from device to host
+  HIP_CHECK(hipMemcpy(TransposeMatrix, gpuTransposeMatrix,
+                       NUM * sizeof(T), hipMemcpyDeviceToHost));
+
+  // CPU MatrixTranspose computation
+  matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH);
+
+  // verify the results
+  REQUIRE(errors == compare(TransposeMatrix, cpuTransposeMatrix));
+  double eps = 1.0E-6;
+  // free the resources on device side
+  HIP_CHECK(hipFree(gpuMatrix));
+  HIP_CHECK(hipFree(gpuTransposeMatrix));
+
+  // free the resources on host side
+  free(Matrix);
+  free(TransposeMatrix);
+  free(cpuTransposeMatrix);
+}
+
+/**
+ * @addtogroup __shfl __shfl
+ * @{
+ * @ingroup ShflTest
+ * `T  __shfl(T var, int srcLane, int width=warpSize)` -
+ * Contains wrap __shfl functions.
+ * @}
+ */
+
+/**
+ * Test Description
+ * ------------------------
+ * - Test case to verify __shfl warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflTests.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ */
+
+TEST_CASE("Unit_hipShflTests") {
+  SECTION("run test for int") {
+    runTest<int>();
+  }
+  SECTION("run test for float") {
+    runTest<float>();
+  }
+  SECTION("run test for double") {
+    runTest<double>();
+  }
+  // Test added to support half datatype.
+  SECTION("run test for __half") {
+    runTest<__half>();
+  }
+  SECTION("run test for int64_t") {
+    runTest<int64_t>();
+  }
+  SECTION("run test for unsigned int") {
+    runTest<unsigned int>();
+  }
+  SECTION("run test for uint64_t") {
+    runTest<uint64_t>();
+  }
+}
diff --git a/catch/unit/kernel/hipShflUpDownTest.cc b/catch/unit/kernel/hipShflUpDownTest.cc
new file mode 100644
index 000000000..ab80dd51b
--- /dev/null
+++ b/catch/unit/kernel/hipShflUpDownTest.cc
@@ -0,0 +1,294 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip_test_kernels.hh>
+#include <hip_test_checkers.hh>
+#include <hip_test_common.hh>
+#include <hip/hip_fp16.h>
+#include <hip_test_defgroups.hh>
+
+const int size = 32;
+
+template <typename T>
+__global__ void shflDownSum(T* a, int size) {
+  T val = a[threadIdx.x];
+  for (int i = size / 2; i > 0; i /= 2) {
+    val += __shfl_down(val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+template <typename T>
+__global__ void shflUpSum(T* a, int size) {
+  T val = a[threadIdx.x];
+  for (int i = size / 2; i > 0; i /= 2) {
+    val += __shfl_up(val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+template <typename T>
+__global__ void shflXorSum(T* a, int size) {
+  T val = a[threadIdx.x];
+  for (int i = size/2; i > 0; i /= 2) {
+    val += __shfl_xor(val, i, size);
+  }
+  a[threadIdx.x] = val;
+}
+
+static void getFactor(int* fact) { *fact = 101; }
+static void getFactor(unsigned int* fact) {
+  *fact = static_cast<unsigned int>(INT32_MAX)+1;
+}
+static void getFactor(float* fact) { *fact = 2.5; }
+static void getFactor(double* fact) { *fact = 2.5; }
+static void getFactor(__half* fact) { *fact = 2.5; }
+static void getFactor(int64_t* fact) { *fact = 303; }
+static void getFactor(uint64_t* fact) {
+  *fact = static_cast<uint64_t>(__LONG_LONG_MAX__)+1;
+}
+
+template <typename T> T sum(T* a) {
+  T cpuSum = 0;
+  T factor;
+  getFactor(&factor);
+  for (int i = 0; i < size; i++) {
+      a[i] = i + factor;
+      cpuSum += a[i];
+  }
+  return cpuSum;
+}
+
+template <> __half sum(__half* a) {
+  __half cpuSum = 0;
+  __half factor;
+  getFactor(&factor);
+  for (int i = 0; i < size; i++) {
+      a[i] = i + __half2float(factor);
+      cpuSum = __half2float(cpuSum) + __half2float(a[i]);
+  }
+  return cpuSum;
+}
+
+template <typename T> bool compare(T gpuSum, T cpuSum) {
+  if (gpuSum != cpuSum) {
+    return true;
+  }
+  return false;
+}
+
+template <> bool compare(__half gpuSum, __half cpuSum) {
+  if (__half2float(gpuSum) != __half2float(cpuSum)) {
+    return true;
+  }
+  return false;
+}
+
+template <typename T>
+static void runTestShflUp() {
+  const int size = 32;
+  T a[size];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
+  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflUpSum<T>, 1, size, 0, 0, d_a, size);
+  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+  REQUIRE((compare(a[size - 1], cpuSum)) == 0);
+  HIP_CHECK(hipFree(d_a));
+}
+
+template <typename T>
+static void runTestShflDown() {
+  T a[size];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
+  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflDownSum<T>, 1, size, 0, 0, d_a, size);
+  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+  REQUIRE((compare(a[0], cpuSum)) == 0);
+  HIP_CHECK(hipFree(d_a));
+}
+
+template <typename T>
+static void runTestShflXor() {
+  T a[size];
+  T cpuSum = sum(a);
+  T* d_a;
+  HIP_CHECK(hipMalloc(&d_a, sizeof(T) * size));
+  HIP_CHECK(hipMemcpy(d_a, &a, sizeof(T) * size, hipMemcpyDefault));
+  hipLaunchKernelGGL(shflXorSum<T>, 1, size, 0, 0, d_a, size);
+  HIP_CHECK(hipMemcpy(&a, d_a, sizeof(T) * size, hipMemcpyDefault));
+  REQUIRE((compare(a[0], cpuSum)) == 0);
+  HIP_CHECK(hipFree(d_a));
+}
+
+/**
+* @addtogroup __shfl __shfl
+* @{
+* @ingroup ShflTest
+* `T __shfl_up(T var, unsigned int lane_delta, int width = warpSize)` -
+* Contains warp __shfl_up function
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify __shfl_up warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflUpDownTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ *    - Gaurding this test against cuda with refernce to mentioned
+ * ticket SWDEV-379177
+ */
+
+TEST_CASE("Unit_runTestShfl_up") {
+  SECTION("runTestShflUp for int") {
+    runTestShflUp<int>();
+  }
+  SECTION("runTestShflUp for float") {
+    runTestShflUp<float>();
+  }
+  SECTION("runTestShflUp for double") {
+    runTestShflUp<double>();
+  }
+  SECTION("runTestShflUp for __half") {
+    runTestShflUp<__half>();
+  }
+  SECTION("runTestShflUp for int64_t") {
+    runTestShflUp<int64_t>();
+  }
+  SECTION("runTestShflUp for unsigned int") {
+    runTestShflUp<unsigned int>();
+  }
+  SECTION("runTestShflUp for uint64_t") {
+    runTestShflUp<uint64_t>();
+  }
+}
+/**
+ * End doxygen group __shfl.
+ * @}
+ */
+
+/**
+* @addtogroup __shfl __shfl
+* @{
+* @ingroup ShflTest
+* `T __shfl_down(T var, unsigned int lane_delta, int width = warpSize)` -
+* Contains warp __shfl_down function
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify __shfl_down warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflUpDownTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ *    - Gaurding this test against cuda with refernce to mentioned
+ * ticket SWDEV-379177
+ */
+
+TEST_CASE("Unit_runTestShfl_Down") {
+  SECTION("runTestShflDown for int") {
+    runTestShflDown<int>();
+  }
+  SECTION("runTestShflDown for float") {
+    runTestShflDown<float>();
+  }
+  SECTION("runTestShflDown for double") {
+    runTestShflDown<double>();
+  }
+  SECTION("runTestShflDown for __half") {
+    runTestShflDown<__half>();
+  }
+  SECTION("runTestShflDown for int64_t") {
+    runTestShflDown<int64_t>();
+  }
+  SECTION("runTestShflDown for unsigned int") {
+    runTestShflDown<unsigned int>();
+  }
+  SECTION("runTestShflDown for uint64_t") {
+    runTestShflDown<uint64_t>();
+  }
+}
+/**
+ * End doxygen group __shfl.
+ * @}
+ */
+
+/**
+* @addtogroup __shfl __shfl
+* @{
+* @ingroup ShflTest
+* `T __shfl_xor(T var, int laneMask, int width=warpSize)` -
+* Contains warp __shfl_xor function
+*/
+
+/**
+ * Test Description
+ * ------------------------
+ *    - Test case to verify __shfl_xor warp functions for different datatypes.
+
+ * Test source
+ * ------------------------
+ *    - catch/unit/kernel/hipShflUpDownTest.cc
+ * Test requirements
+ * ------------------------
+ *    - HIP_VERSION >= 5.6
+ *    - Gaurding this test against cuda with refernce to mentioned
+ * ticket SWDEV-379177
+ */
+
+TEST_CASE("Unit_runTestShfl_Xor") {
+  SECTION("runTestShflXor for int") {
+    runTestShflXor<int>();
+  }
+  SECTION("runTestShflXor for float") {
+    runTestShflXor<float>();
+  }
+  SECTION("runTestShflXor for double") {
+    runTestShflXor<double>();
+  }
+  SECTION("runTestShflXor for __half") {
+    runTestShflXor<__half>();
+  }
+  SECTION("runTestShflXor for int64_t") {
+    runTestShflXor<int64_t>();
+  }
+  SECTION("runTestShflXor for unsigned int") {
+    runTestShflXor<unsigned int>();
+  }
+  SECTION("runTestShflXor for uint64_t") {
+    runTestShflXor<uint64_t>();
+  }
+}
+/**
+ * End doxygen group __shfl.
+ * @}
+ */