diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.html b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.html
new file mode 100644
index 00000000..d914831e
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.html
@@ -0,0 +1,61 @@
+
+ Implement a GPU program that multiplies a sparse matrix A of dimensions M × N
+ by a dense matrix B of dimensions N × K, producing a dense output matrix
+ C of dimensions M × K.
+ All matrices are stored in row-major order using 32-bit floats.
+ The matrix A is approximately 60–70% sparse (i.e., 60–70% of elements are zero),
+ and nnz gives the number of non-zero elements in A.
+
+
+
+ Mathematically, the operation is defined as:
+ \[
+ C_{ij} = \sum_{k=0}^{N-1} A_{ik} \cdot B_{kj} \quad \text{for} \quad i = 0, \ldots, M-1,\; j = 0, \ldots, K-1
+ \]
+
+
+Implementation Requirements
+
+ - Use only GPU native features (external libraries are not permitted)
+ - The solve function signature must remain unchanged
+ - The final result must be stored in matrix C
+
+
+Example
+
+Input:
+Matrix \(A\) (\(3 \times 4\)):
+\[
+\begin{bmatrix}
+2.0 & 0.0 & 0.0 & 1.0 \\
+0.0 & 3.0 & 0.0 & 0.0 \\
+0.0 & 0.0 & 4.0 & 0.0
+\end{bmatrix}
+\]
+Matrix \(B\) (\(4 \times 2\)):
+\[
+\begin{bmatrix}
+1.0 & 2.0 \\
+3.0 & 4.0 \\
+5.0 & 6.0 \\
+7.0 & 8.0
+\end{bmatrix}
+\]
+Output:
+Matrix \(C\) (\(3 \times 2\)):
+\[
+\begin{bmatrix}
+9.0 & 12.0 \\
+9.0 & 12.0 \\
+20.0 & 24.0
+\end{bmatrix}
+\]
+
+
+Constraints
+
+ - 1 ≤
M, N, K ≤ 8,192
+ - All values in
A and B are 32-bit floats in the range [−10, 10]
+ - The matrix
A is approximately 60–70% sparse
+ - Performance is measured with
M = 4,096, N = 2,048, K = 512
+
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.py
new file mode 100644
index 00000000..e3660751
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/challenge.py
@@ -0,0 +1,276 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    """SpMM challenge: C = A @ B with A (M x N, ~60-70% sparse) and dense B (N x K)."""
+
+    def __init__(self):
+        super().__init__(
+            name="Sparse Matrix-Dense Matrix Multiplication (SpMM)",
+            atol=1e-03,
+            rtol=1e-03,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(
+        self,
+        A: torch.Tensor,
+        B: torch.Tensor,
+        C: torch.Tensor,
+        M: int,
+        N: int,
+        K: int,
+        nnz: int,
+    ):
+        """Compute the ground-truth C = A @ B with a dense matmul.
+
+        A and B may arrive flattened (1-D) or already 2-D; the result is
+        written into C in place. nnz is accepted for signature parity with
+        solve() but is not used by the dense reference.
+        """
+        if A.shape == (M * N,):
+            A_matrix = A.view(M, N)
+        elif A.shape == (M, N):
+            A_matrix = A
+        else:
+            raise AssertionError(
+                f"A.shape {A.shape} does not match expected {(M * N,)} or {(M, N)}"
+            )
+        if B.shape == (N * K,):
+            B_matrix = B.view(N, K)
+        elif B.shape == (N, K):
+            B_matrix = B
+        else:
+            raise AssertionError(
+                f"B.shape {B.shape} does not match expected {(N * K,)} or {(N, K)}"
+            )
+        assert C.shape == (M, K) or C.shape == (
+            M * K,
+        ), f"C.shape {C.shape} does not match expected {(M, K)} or {(M * K,)}"
+        assert A_matrix.dtype == torch.float32
+        assert B_matrix.dtype == torch.float32
+        assert A_matrix.device.type == "cuda"
+        assert B_matrix.device.type == "cuda"
+        assert C.device.type == "cuda"
+        result = torch.matmul(A_matrix, B_matrix)
+        # view() lets the same result fill either a 2-D (M, K) or flat (M*K,) C.
+        C.copy_(result.view(C.shape))
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        """Map each solve() argument name to its (ctypes type, direction) pair."""
+        return {
+            "A": (ctypes.POINTER(ctypes.c_float), "in"),
+            "B": (ctypes.POINTER(ctypes.c_float), "in"),
+            "C": (ctypes.POINTER(ctypes.c_float), "out"),
+            "M": (ctypes.c_int, "in"),
+            "N": (ctypes.c_int, "in"),
+            "K": (ctypes.c_int, "in"),
+            "nnz": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        """Hand-checkable 3x4 @ 4x2 example mirroring challenge.html."""
+        dtype = torch.float32
+        A = torch.tensor(
+            [
+                [2.0, 0.0, 0.0, 1.0],
+                [0.0, 3.0, 0.0, 0.0],
+                [0.0, 0.0, 4.0, 0.0],
+            ],
+            device="cuda",
+            dtype=dtype,
+        )
+        B = torch.tensor(
+            [
+                [1.0, 2.0],
+                [3.0, 4.0],
+                [5.0, 6.0],
+                [7.0, 8.0],
+            ],
+            device="cuda",
+            dtype=dtype,
+        )
+        C = torch.empty((3, 2), device="cuda", dtype=dtype)
+        return {
+            "A": A,
+            "B": B,
+            "C": C,
+            "M": 3,
+            "N": 4,
+            "K": 2,
+            "nnz": 4,
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        """Edge cases plus randomized ~60-70%-sparse tests across shapes."""
+        dtype = torch.float32
+        tests = []
+
+        # edge_1x1x1
+        tests.append(
+            {
+                "A": torch.tensor([[3.0]], device="cuda", dtype=dtype),
+                "B": torch.tensor([[2.0]], device="cuda", dtype=dtype),
+                "C": torch.empty((1, 1), device="cuda", dtype=dtype),
+                "M": 1,
+                "N": 1,
+                "K": 1,
+                "nnz": 1,
+            }
+        )
+
+        # edge_2x2_k1_spmv_like
+        tests.append(
+            {
+                "A": torch.tensor([[1.0, 0.0], [0.0, 2.0]], device="cuda", dtype=dtype),
+                "B": torch.tensor([[3.0], [4.0]], device="cuda", dtype=dtype),
+                "C": torch.empty((2, 1), device="cuda", dtype=dtype),
+                "M": 2,
+                "N": 2,
+                "K": 1,
+                "nnz": 2,
+            }
+        )
+
+        # edge_zero_matrix (nnz == 0 must not break sparse-format solutions)
+        tests.append(
+            {
+                "A": torch.zeros((3, 3), device="cuda", dtype=dtype),
+                "B": torch.tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], device="cuda", dtype=dtype),
+                "C": torch.empty((3, 2), device="cuda", dtype=dtype),
+                "M": 3,
+                "N": 3,
+                "K": 2,
+                "nnz": 0,
+            }
+        )
+
+        # edge_identity_a
+        tests.append(
+            {
+                "A": torch.eye(4, device="cuda", dtype=dtype),
+                "B": torch.tensor(
+                    [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0], [10.0, 11.0, 12.0]],
+                    device="cuda",
+                    dtype=dtype,
+                ),
+                "C": torch.empty((4, 3), device="cuda", dtype=dtype),
+                "M": 4,
+                "N": 4,
+                "K": 3,
+                "nnz": 4,
+            }
+        )
+
+        # power_of_2_16x16x8
+        M, N, K = 16, 16, 8
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-2.0, 2.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65  # keep ~35% of entries (~65% sparse)
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # power_of_2_64x32x16
+        M, N, K = 64, 32, 16
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-3.0, 3.0)
+        mask = torch.rand((M, N), device="cuda") > 0.70  # keep ~30% of entries (~70% sparse)
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # non_power_of_2_negative_values
+        M, N, K = 30, 50, 20
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-5.0, 5.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-3.0, 3.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # non_power_of_2_255x100x33
+        M, N, K = 255, 100, 33
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-2.0, 2.0)
+        mask = torch.rand((M, N), device="cuda") > 0.70
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        # realistic_1000x500x64
+        M, N, K = 1000, 500, 64
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65
+        A_sparse = A_dense * mask
+        tests.append(
+            {
+                "A": A_sparse,
+                "B": torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "C": torch.empty((M, K), device="cuda", dtype=dtype),
+                "M": M,
+                "N": N,
+                "K": K,
+                "nnz": int(mask.sum().item()),
+            }
+        )
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        """Timing configuration from challenge.html: M=4096, N=2048, K=512, ~65% sparse A."""
+        dtype = torch.float32
+        M = 4096
+        N = 2048
+        K = 512
+        A_dense = torch.empty((M, N), device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
+        mask = torch.rand((M, N), device="cuda") > 0.65  # keep ~35% of entries (~65% sparse)
+        A_sparse = A_dense * mask
+        nnz = int(mask.sum().item())
+        B = torch.empty((N, K), device="cuda", dtype=dtype).uniform_(-1.0, 1.0)
+        C = torch.empty((M, K), device="cuda", dtype=dtype)
+        return {
+            "A": A_sparse,
+            "B": B,
+            "C": C,
+            "M": M,
+            "N": N,
+            "K": K,
+            "nnz": nnz,
+        }
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cu b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cu
new file mode 100644
index 00000000..62cc8a05
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cu
@@ -0,0 +1,4 @@
+// Bare "#include" was invalid (header name stripped during extraction);
+// restore the CUDA runtime header the starter needs to compile.
+#include <cuda_runtime.h>
+
+// A, B, C are device pointers
+extern "C" void solve(const float* A, const float* B, float* C, int M, int N, int K, int nnz) {}
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cute.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cute.py
new file mode 100644
index 00000000..6ef93209
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.cute.py
@@ -0,0 +1,17 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# A, B, C are tensors on the GPU
+@cute.jit
+def solve(
+    A: cute.Tensor,
+    B: cute.Tensor,
+    C: cute.Tensor,
+    M: cute.Int32,
+    N: cute.Int32,
+    K: cute.Int32,
+    nnz: cute.Int32,
+):
+    """Starter stub: compute C = A @ B (A is M x N, ~60-70% sparse; B is N x K) into C."""
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.jax.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.jax.py
new file mode 100644
index 00000000..a541e101
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.jax.py
@@ -0,0 +1,10 @@
+import jax
+import jax.numpy as jnp
+
+
+# A, B are tensors on GPU
+@jax.jit
+def solve(A: jax.Array, B: jax.Array, M: int, N: int, K: int, nnz: int) -> jax.Array:
+    """Starter stub: return C = A @ B (A is M x N, ~60-70% sparse; B is N x K)."""
+    # return output tensor directly
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.mojo b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.mojo
new file mode 100644
index 00000000..78690e4b
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.mojo
@@ -0,0 +1,10 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# A, B, C are device pointers
+@export
+def solve(A: UnsafePointer[Float32], B: UnsafePointer[Float32], C: UnsafePointer[Float32], M: Int32, N: Int32, K: Int32, nnz: Int32):
+    """Starter stub: compute C = A @ B (row-major float32 device buffers) into C."""
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.pytorch.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.pytorch.py
new file mode 100644
index 00000000..5df37bbf
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.pytorch.py
@@ -0,0 +1,7 @@
+import torch
+
+
+# A, B, C are tensors on the GPU
+def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int, K: int, nnz: int):
+    """Starter stub: compute C = A @ B and write the result into C in place."""
+    pass
diff --git a/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.triton.py b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.triton.py
new file mode 100644
index 00000000..fc6adb0a
--- /dev/null
+++ b/challenges/medium/75_sparse_matrix_dense_matrix_multiplication/starter/starter.triton.py
@@ -0,0 +1,9 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# A, B, C are tensors on the GPU
+def solve(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, M: int, N: int, K: int, nnz: int):
+    """Starter stub: compute C = A @ B and write the result into C in place."""
+    pass