diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.html b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.html new file mode 100644 index 00000000..d914831e --- /dev/null +++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.html @@ -0,0 +1,61 @@ +

+ Implement a GPU program that multiplies a sparse matrix A of dimensions M × N + by a dense matrix B of dimensions N × K, producing a dense output matrix + C of dimensions M × K. + All matrices are stored in row-major order using 32-bit floats. + The matrix A is approximately 60–70% sparse (i.e., 60–70% of elements are zero), + and nnz gives the number of non-zero elements in A. +

+ +

+ Mathematically, the operation is defined as: + \[ + C_{ij} = \sum_{k=0}^{N-1} A_{ik} \cdot B_{kj} \quad \text{for} \quad i = 0, \ldots, M-1,\; j = 0, \ldots, K-1 + \] +

+ +

Implementation Requirements

+ + +

Example

+

+Input:
+Matrix \(A\) (\(3 \times 4\)): +\[ +\begin{bmatrix} +2.0 & 0.0 & 0.0 & 1.0 \\ +0.0 & 3.0 & 0.0 & 0.0 \\ +0.0 & 0.0 & 4.0 & 0.0 +\end{bmatrix} +\] +Matrix \(B\) (\(4 \times 2\)): +\[ +\begin{bmatrix} +1.0 & 2.0 \\ +3.0 & 4.0 \\ +5.0 & 6.0 \\ +7.0 & 8.0 +\end{bmatrix} +\] +Output:
+Matrix \(C\) (\(3 \times 2\)): +\[ +\begin{bmatrix} +9.0 & 12.0 \\ +9.0 & 12.0 \\ +20.0 & 24.0 +\end{bmatrix} +\] +

+ +

Constraints

+
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.py
new file mode 100644
index 00000000..e3660751
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.py
@@ -0,0 +1,264 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    def __init__(self):
+        super().__init__(
+            name="Sparse Matrix-Dense Matrix Multiplication (SpMM)",
+            atol=1e-03,
+            rtol=1e-03,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        nnz: int,
+    ):
+        if A.shape == (M * N,):
+            A_matrix = A.view(M, N)
+        elif A.shape == (M, N):
+            A_matrix = A
+        else:
+            raise AssertionError(
+                f"A.shape {A.shape} does not match expected {(M * N,)} or {(M, N)}"
+            )
+        if B.shape == (N * K,):
+            B_matrix = B.view(N, K)
+        elif B.shape == (N, K):
+            B_matrix = B
+        else:
+            raise AssertionError(
+                f"B.shape {B.shape} does not match expected {(N * K,)} or {(N, K)}"
+            )
+        assert C.shape == (M, K) or C.shape == (
+            M * K,
+        ), f"C.shape {C.shape} does not match expected {(M, K)} or {(M * K,)}"
+        assert A_matrix.dtype == torch.float32
+        assert B_matrix.dtype == torch.float32
+        assert A_matrix.device.type == "cuda"
+        assert B_matrix.device.type == "cuda"
+        assert C.device.type == "cuda"
+        result = torch.matmul(A_matrix, B_matrix)
+        C.copy_(result.view(C.shape))
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        return {
+            "A": (ctypes.POINTER(ctypes.c_float), "in"),
+            "B": (ctypes.POINTER(ctypes.c_float), "in"),
+            "C": (ctypes.POINTER(ctypes.c_float), "out"),
+            "M": (ctypes.c_int, "in"),
+            "N": (ctypes.c_int, "in"),
+            "K": (ctypes.c_int, "in"),
+            "nnz": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        dtype = torch.float32
+        A = torch.tensor(
+            [
+                [2.0, 0.0, 0.0, 1.0],
+                [0.0, 3.0, 0.0, 0.0],
+                [0.0, 0.0, 4.0, 0.0],
+            ],
+            device="cuda",
+            dtype=dtype,
+        )
+        B = torch.tensor(
+            [
+                [1.0, 2.0],
+                [3.0, 4.0],
+                [5.0, 6.0],
+                [7.0, 8.0],
+            ],
+            device="cuda",
+            dtype=dtype,
+        )
+        C = torch.empty((3, 2), device="cuda", dtype=dtype)
+        return {
+            "A": A,
+            "B": B,
+            "C": C,
+            "M": 3,
+            "N": 4,
+            "K": 2,
+            "nnz": 4,
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        dtype = torch.float32
+        tests = []
+
+        # edge_1x1x1
+        tests.append(
+            {
+                "A": torch.tensor([[3.0]], device="cuda", dtype=dtype),
+                "B": torch.tensor([[2.0]], device="cuda", dtype=dtype),
+                "C": torch.empty((1, 1), device="cuda", dtype=dtype),
+                "M": 1,
+                "N": 1,
+                "K": 1,
+                "nnz": 1,
+            }
+        )
+
+        # edge_2x2_k1_spmv_like
+        tests.append(
+            {
+                "A": torch.tensor([[1.0, 0.0], [0.0, 2.0]], device="cuda", dtype=dtype),
+                "B": torch.tensor([[3.0], [4.0]], device="cuda", dtype=dtype),
+                "C": torch.empty((2, 1), device="cuda", dtype=dtype),
+                "M": 2,
+                "N": 2,
+                "K": 1,
+                "nnz": 2,
+            }
+        )
+
+        # edge_zero_matrix
+        tests.append(
+            {
+                "A": torch.zeros((3, 3), device="cuda", dtype=dtype),
+                "B": torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], device="cuda", dtype=dtype),
+                "C": torch.empty((3, 2), device="cuda", dtype=dtype),
+                "M": 3,
+                "N": 3,
+                "K": 2,
+                "nnz": 0,
+            }
+        )
+
+        # edge_identity_a
+        tests.append(
+            {
+                "A": torch.eye(4, device="cuda", dtype=dtype),
+                "B": torch.tensor(
+                    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]],
+                    device="cuda",
+                    dtype=dtype,
+                ),
+                "C": torch.empty((4, 3), device="cuda", dtype=dtype),
+                "M": 4,
+                "N": 4,
+                "K": 3,
+                "nnz": 4,
+            }
+        )
+
+        # power_of_2_16x16x8
+        M, N, K = 16, 16, 8
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-2.0, 2.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # power_of_2_64x32x16
+        M, N, K = 64, 32, 16
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-3.0, 3.0)
+        mask = torch.rand((M, N), device="cuda") > 0.70
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # non_power_of_2_negative_values
+        M, N, K = 30, 50, 20
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-5.0, 5.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-3.0, 3.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # non_power_of_2_255x100x33
+        M, N, K = 255, 100, 33
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-2.0, 2.0)
+        mask = torch.rand((M, N), device="cuda") > 0.70
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # realistic_1000x500x64
+        M, N, K = 1000, 500, 64
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        dtype = torch.float32
+        M = 4096
+        N = 2048
+        K = 512
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65
+        A_sparse = A_dense * mask
+        nnz = int(mask.sum().item())
+        B = torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
+        C = torch.empty((M, K), device="cuda", dtype=dtype)
+        return {
+            "A": A_sparse,
+            "B": B,
+            "C": C,
+            "M": M,
+            "N": N,
+            "K": K,
+            "nnz": nnz,
+        }
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cu b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cu
new file mode 100644
index 00000000..62cc8a05
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cu
@@ -0,0 +1,4 @@
+#include <cuda_runtime.h>
+
+// A, B, C are device pointers
+extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K, int nnz) {}
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cute.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cute.py
new file mode 100644
index 00000000..6ef93209
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cute.py
@@ -0,0 +1,16 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# A, B, C are tensors on the GPU
+@cute.jit
+def solve(
+    A: cute.Tensor,
+    B: cute.Tensor,
+    C: cute.Tensor,
+    M: cute.Int32,
+    N: cute.Int32,
+    K: cute.Int32,
+    nnz: cute.Int32,
+):
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.jax.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.jax.py
new file mode 100644
index 00000000..a541e101
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.jax.py
@@ -0,0 +1,9 @@
+import jax
+import jax.numpy as jnp
+
+
+# A, B are tensors on GPU
+@jax.jit
+def solve(A: jax.Array, B: jax.Array, M: int, N: int, K: int, nnz: int) -> jax.Array:
+    # return output tensor directly
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.mojo b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.mojo
new file mode 100644
index 00000000..78690e4b
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.mojo
@@ -0,0 +1,9 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# A, B, C are device pointers
+@export
+def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], M: Int32, N: Int32, K: Int32, nnz: Int32):
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.pytorch.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.pytorch.py
new file mode 100644
index 00000000..5df37bbf
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.pytorch.py
@@ -0,0 +1,6 @@
+import torch
+
+
+# A, B, C are tensors on the GPU
+def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int, K: int, nnz: int):
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.triton.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.triton.py
new file mode 100644
index 00000000..fc6adb0a
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.triton.py
@@ -0,0 +1,8 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# A, B, C are tensors on the GPU
+def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int, K: int, nnz: int):
+    pass