Skip to content

Commit

Permalink
rocm: add matrix multiplication example
Browse files Browse the repository at this point in the history
  • Loading branch information
danbev committed Aug 21, 2024
1 parent ab56485 commit b1d59ea
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 0 deletions.
3 changes: 3 additions & 0 deletions gpu/rocm/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ helloworld: src/helloworld.cpp
# use --keep to inspect the generated assembly code
env HIP_PLATFORM=nvidia hipcc -o $@ $<

# Build the matmul binary from src/matmul.cpp with hipcc. HIP_PLATFORM=nvidia
# is set only in the environment of this one command. $@ expands to the
# target name and $< to the first prerequisite.
matmul: src/matmul.cpp
env HIP_PLATFORM=nvidia hipcc -o $@ $<

# Remove the binaries produced by the build targets in this Makefile.
# matmul is listed alongside helloworld so the matmul target's output is not
# left behind after `make clean`.
.PHONY: clean
clean:
	${RM} helloworld matmul
85 changes: 85 additions & 0 deletions gpu/rocm/src/matmul.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#include <hip/hip_runtime.h>

#include <cstdlib>
#include <iostream>
#include <random>
#include <vector>

#define WIDTH 1024

// Naive dense product C = A * B of square matrices stored row-major
// (element (r, c) lives at index r * width + c).
//
// Launch layout: 2D grid of 2D blocks. Each thread computes at most one
// element of C; its row is derived from the y block/thread indices and its
// column from the x block/thread indices. Threads that land outside the
// width x width matrix exit without touching memory, so the grid is allowed
// to overhang the matrix.
//
// A, B:  input matrices, read only.
// C:     output matrix, one element written per in-range thread.
// width: number of rows and of columns of each matrix.
__global__ void matrixMultiplication(float* A, float* B, float* C, int width) {
    const int r = hipBlockIdx_y * hipBlockDim_y + hipThreadIdx_y;
    const int c = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x;

    // Surplus threads past the matrix edge have nothing to compute.
    if (r >= width || c >= width) {
        return;
    }

    // Offset of the first element of row r; shared by the A reads and the C write.
    const int rowStart = r * width;

    // Dot product of row r of A with column c of B.
    float acc = 0.0f;
    int k = 0;
    while (k < width) {
        acc += A[rowStart + k] * B[k * width + c];
        ++k;
    }

    C[rowStart + c] = acc;
}

// Fills the first `size` entries of `matrix` with pseudo-random values drawn
// from a uniform real distribution constructed with bounds 0.0 and 1.0.
//
// The generator is a Mersenne Twister seeded from std::random_device on every
// call, so the contents differ from run to run. Nothing is written when
// `size` is zero or negative.
void initMatrix(float* matrix, int size) {
    std::random_device seedSource;
    std::mt19937 engine(seedSource());
    std::uniform_real_distribution<> unit(0.0, 1.0);

    int idx = 0;
    while (idx < size) {
        // The distribution yields double; narrow it to the float storage type.
        matrix[idx] = static_cast<float>(unit(engine));
        ++idx;
    }
}

int main() {
const int size = WIDTH * WIDTH;
size_t bytes = size * sizeof(float);

float *h_A, *h_B, *h_C;
float *d_A, *d_B, *d_C;

// Allocate host memory
h_A = new float[size];
h_B = new float[size];
h_C = new float[size];

// Initialize host matrices
initMatrix(h_A, size);
initMatrix(h_B, size);

// Allocate device memory
hipMalloc(&d_A, bytes);
hipMalloc(&d_B, bytes);
hipMalloc(&d_C, bytes);

// Copy host memory to device
hipMemcpy(d_A, h_A, bytes, hipMemcpyHostToDevice);
hipMemcpy(d_B, h_B, bytes, hipMemcpyHostToDevice);

// Setup execution configuration
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(WIDTH + threadsPerBlock.y - 1) / threadsPerBlock.y);

// Launch kernel
hipLaunchKernelGGL(matrixMultiplication, numBlocks, threadsPerBlock, 0, 0, d_A, d_B, d_C, WIDTH);

// Wait for GPU to finish
hipDeviceSynchronize();

// Copy result back to host
hipMemcpy(h_C, d_C, bytes, hipMemcpyDeviceToHost);

// Verify result (check a few elements)
std::cout << "Result verification (first few elements):" << std::endl;
for (int i = 0; i < 5; i++) {
std::cout << h_C[i] << " ";
}
std::cout << "..." << std::endl;

// Free memory
delete[] h_A;
delete[] h_B;
delete[] h_C;
hipFree(d_A);
hipFree(d_B);
hipFree(d_C);

return 0;
}

0 comments on commit b1d59ea

Please sign in to comment.