From ab6a53f208d029b732cc9b37041d4189af995cca Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 7 Aug 2024 10:06:47 +0200
Subject: [PATCH] cuda: add warp matrix multiply accumulate example

---
 gpu/cuda/.gitignore  |  1 +
 gpu/cuda/Makefile    |  6 +++-
 gpu/cuda/src/wmma.cu | 74 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 gpu/cuda/src/wmma.cu

diff --git a/gpu/cuda/.gitignore b/gpu/cuda/.gitignore
index c4f2f12a..fd8098c7 100644
--- a/gpu/cuda/.gitignore
+++ b/gpu/cuda/.gitignore
@@ -4,3 +4,4 @@ threads
 inc
 array-add
 minimal
+wmma
diff --git a/gpu/cuda/Makefile b/gpu/cuda/Makefile
index 6ec8c85e..66915993 100644
--- a/gpu/cuda/Makefile
+++ b/gpu/cuda/Makefile
@@ -1,6 +1,10 @@
 minimal: src/minimal.cu
 	nvcc -o $@ $<
 
+wmma: src/wmma.cu
+	# GeForce RTX 4080 has compute capability 8.9 (https://developer.nvidia.com/cuda-gpus)
+	nvcc -arch=sm_89 -o $@ $<
+
 hello-world: src/hello-world.cu
 	nvcc -lnppc -o $@ $<
 
@@ -19,4 +23,4 @@ array-add: src/array-add.cu
 
 .PHONY: clean
 clean:
-	${RM} hello-world threads inc hello-world.ptx minimal
+	${RM} hello-world threads inc hello-world.ptx minimal wmma
diff --git a/gpu/cuda/src/wmma.cu b/gpu/cuda/src/wmma.cu
new file mode 100644
index 00000000..fc3de4a9
--- /dev/null
+++ b/gpu/cuda/src/wmma.cu
@@ -0,0 +1,74 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <mma.h>
+
+#define MATRIX_SIZE 16
+#define WMMA_TILE_SIZE 16
+
+using namespace nvcuda::wmma;
+
+// WMMA example kernel: computes c = a * b for one 16x16 half-precision
+// tile using tensor cores. Launch with a single warp (<<<1, 32>>>);
+// all 32 threads of the warp cooperate on the same fragments.
+// Requires SM70+ (tensor cores).
+__global__ void wmma_example_kernel(const half* a, const half* b, half* c) {
+    // Define the fragment types for the input and output matrices.
+    // Fragment shape is M x N x K = 16 x 16 x 16 (WMMA_TILE_SIZE).
+    fragment<matrix_a, WMMA_TILE_SIZE, WMMA_TILE_SIZE, WMMA_TILE_SIZE, half, row_major> a_frag;
+    fragment<matrix_b, WMMA_TILE_SIZE, WMMA_TILE_SIZE, WMMA_TILE_SIZE, half, row_major> b_frag;
+    fragment<accumulator, WMMA_TILE_SIZE, WMMA_TILE_SIZE, WMMA_TILE_SIZE, half> c_frag;
+    // Initialize the output to zero
+    fill_fragment(c_frag, __float2half(0.0f));
+
+    // Load the input matrices into the fragments (leading dimension = MATRIX_SIZE)
+    load_matrix_sync(a_frag, a, MATRIX_SIZE);
+    load_matrix_sync(b_frag, b, MATRIX_SIZE);
+
+    // Perform the matrix multiplication: c_frag = a_frag * b_frag + c_frag
+    mma_sync(c_frag, a_frag, b_frag, c_frag);
+
+    // Store the result back to memory
+    store_matrix_sync(c, c_frag, MATRIX_SIZE, mem_row_major);
+}
+
+int main() {
+    printf("Warp-level Matrix Multiply Accumulate example.\n\n");
+    // Initialize host matrices
+    half h_a[MATRIX_SIZE * MATRIX_SIZE];
+    half h_b[MATRIX_SIZE * MATRIX_SIZE];
+    half h_c[MATRIX_SIZE * MATRIX_SIZE];
+
+    // Fill matrices with example data (all ones -> every output element is 16)
+    for (int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; ++i) {
+        h_a[i] = __float2half(1.0f);
+        h_b[i] = __float2half(1.0f);
+    }
+
+    // Allocate device memory
+    half *d_a, *d_b, *d_c;
+    cudaMalloc((void**)&d_a, MATRIX_SIZE * MATRIX_SIZE * sizeof(half));
+    cudaMalloc((void**)&d_b, MATRIX_SIZE * MATRIX_SIZE * sizeof(half));
+    cudaMalloc((void**)&d_c, MATRIX_SIZE * MATRIX_SIZE * sizeof(half));
+
+    // Copy host matrices to device
+    cudaMemcpy(d_a, h_a, MATRIX_SIZE * MATRIX_SIZE * sizeof(half), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_b, h_b, MATRIX_SIZE * MATRIX_SIZE * sizeof(half), cudaMemcpyHostToDevice);
+
+    // Launch the WMMA kernel with one warp (32 threads)
+    wmma_example_kernel<<<1, 32>>>(d_a, d_b, d_c);
+
+    // Copy the result back to host (blocking copy also synchronizes the kernel)
+    cudaMemcpy(h_c, d_c, MATRIX_SIZE * MATRIX_SIZE * sizeof(half), cudaMemcpyDeviceToHost);
+
+    // Print the result matrix
+    for (int i = 0; i < MATRIX_SIZE; ++i) {
+        for (int j = 0; j < MATRIX_SIZE; ++j) {
+            printf("%.0f ", __half2float(h_c[i * MATRIX_SIZE + j]));
+        }
+        printf("\n");
+    }
+
+    // Free device memory
+    cudaFree(d_a);
+    cudaFree(d_b);
+    cudaFree(d_c);
+
+    return 0;
+}