From ab6a53f208d029b732cc9b37041d4189af995cca Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Wed, 7 Aug 2024 10:06:47 +0200
Subject: [PATCH] cuda: add warp matrix multiply accumulate example

---
 gpu/cuda/.gitignore  |  1 +
 gpu/cuda/Makefile    |  6 +++-
 gpu/cuda/src/wmma.cu | 74 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 80 insertions(+), 1 deletion(-)
 create mode 100644 gpu/cuda/src/wmma.cu

diff --git a/gpu/cuda/.gitignore b/gpu/cuda/.gitignore
index c4f2f12a..fd8098c7 100644
--- a/gpu/cuda/.gitignore
+++ b/gpu/cuda/.gitignore
@@ -4,3 +4,4 @@ threads
 inc
 array-add
 minimal
+wmma
diff --git a/gpu/cuda/Makefile b/gpu/cuda/Makefile
index 6ec8c85e..66915993 100644
--- a/gpu/cuda/Makefile
+++ b/gpu/cuda/Makefile
@@ -1,6 +1,10 @@
 minimal: src/minimal.cu
 	nvcc -o $@ $<
 
+wmma: src/wmma.cu
+	# GeForce RTX 4080 has compute capability 8.9 (https://developer.nvidia.com/cuda-gpus)
+	nvcc -arch=sm_89 -o $@ $<
+
 hello-world: src/hello-world.cu
 	nvcc -lnppc -o $@ $<
 
@@ -19,4 +23,4 @@ array-add: src/array-add.cu
 
 .PHONY: clean
 clean:
-	${RM} hello-world threads inc hello-world.ptx minimal
+	${RM} hello-world threads inc hello-world.ptx minimal wmma
diff --git a/gpu/cuda/src/wmma.cu b/gpu/cuda/src/wmma.cu
new file mode 100644
index 00000000..fc3de4a9
--- /dev/null
+++ b/gpu/cuda/src/wmma.cu
@@ -0,0 +1,74 @@
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <mma.h>
+
+#define MATRIX_SIZE 16
+#define WMMA_TILE_SIZE 16
+
+using namespace nvcuda::wmma;
+
+// WMMA example kernel: computes c = a * b for one 16x16 half-precision
+// tile using tensor cores. Launch with a single warp (<<<1, 32>>>);
+// all 32 threads of the warp cooperate on the same fragments.
+// Requires SM70+ (tensor cores).
+__global__ void wmma_example_kernel(const half* a, const half* b, half* c) {
+    // Define the fragment types for the input and output matrices.
+    // Fragment shape is M x N x K = 16 x 16 x 16 (WMMA_TILE_SIZE).
+    fragment<matrix_a, WMMA_TILE_SIZE, WMMA_TILE_SIZE, WMMA_TILE_SIZE, half, row_major> a_frag;
+    fragment<matrix_b, WMMA_TILE_SIZE, WMMA_TILE_SIZE, WMMA_TILE_SIZE, half, row_major> b_frag;
+    fragment<accumulator, WMMA_TILE_SIZE, WMMA_TILE_SIZE, WMMA_TILE_SIZE, half> c_frag;
+    // Initialize the output to zero
+    fill_fragment(c_frag, __float2half(0.0f));
+
+    // Load the input matrices into the fragments (leading dimension = MATRIX_SIZE)
+    load_matrix_sync(a_frag, a, MATRIX_SIZE);
+    load_matrix_sync(b_frag, b, MATRIX_SIZE);
+
+    // Perform the matrix multiplication: c_frag = a_frag * b_frag + c_frag
+    mma_sync(c_frag, a_frag, b_frag, c_frag);
+
+    // Store the result back to memory
+    store_matrix_sync(c, c_frag, MATRIX_SIZE, mem_row_major);
+}
+
+int main() {
+    printf("Warp-level Matrix Multiply Accumulate example.\n\n");
+    // Initialize host matrices
+    half h_a[MATRIX_SIZE * MATRIX_SIZE];
+    half h_b[MATRIX_SIZE * MATRIX_SIZE];
+    half h_c[MATRIX_SIZE * MATRIX_SIZE];
+
+    // Fill matrices with example data (all ones -> every output element is 16)
+    for (int i = 0; i < MATRIX_SIZE * MATRIX_SIZE; ++i) {
+        h_a[i] = __float2half(1.0f);
+        h_b[i] = __float2half(1.0f);
+    }
+
+    // Allocate device memory
+    half *d_a, *d_b, *d_c;
+    cudaMalloc((void**)&d_a, MATRIX_SIZE * MATRIX_SIZE * sizeof(half));
+    cudaMalloc((void**)&d_b, MATRIX_SIZE * MATRIX_SIZE * sizeof(half));
+    cudaMalloc((void**)&d_c, MATRIX_SIZE * MATRIX_SIZE * sizeof(half));
+
+    // Copy host matrices to device
+    cudaMemcpy(d_a, h_a, MATRIX_SIZE * MATRIX_SIZE * sizeof(half), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_b, h_b, MATRIX_SIZE * MATRIX_SIZE * sizeof(half), cudaMemcpyHostToDevice);
+
+    // Launch the WMMA kernel with one warp (32 threads)
+    wmma_example_kernel<<<1, 32>>>(d_a, d_b, d_c);
+
+    // Copy the result back to host (blocking copy also synchronizes the kernel)
+    cudaMemcpy(h_c, d_c, MATRIX_SIZE * MATRIX_SIZE * sizeof(half), cudaMemcpyDeviceToHost);
+
+    // Print the result matrix
+    for (int i = 0; i < MATRIX_SIZE; ++i) {
+        for (int j = 0; j < MATRIX_SIZE; ++j) {
+            printf("%.0f ", __half2float(h_c[i * MATRIX_SIZE + j]));
+        }
+        printf("\n");
+    }
+
+    // Free device memory
+    cudaFree(d_a);
+    cudaFree(d_b);
+    cudaFree(d_c);
+
+    return 0;
+}