diff --git a/challenges/medium/77_integral_image/challenge.html b/challenges/medium/77_integral_image/challenge.html
new file mode 100644
index 00000000..f423bbb5
--- /dev/null
+++ b/challenges/medium/77_integral_image/challenge.html
@@ -0,0 +1,43 @@
+
+ Given a 2D image of shape H × W containing 32-bit floating point values,
+ compute its integral image (also known as a summed area table). The value at each output position
+ output[i][j] is the sum of all input values in the rectangle from the top-left corner
+ (0, 0) to position (i, j), inclusive. All values are stored in row-major order.
+
+
+Implementation Requirements
+
+ - Use only native features (external libraries are not permitted)
+ - The solve function signature must remain unchanged
+ - The final result must be stored in output
+ - Read exclusively from input and write exclusively to output (do not update input)
+
+
+Example
+
+Input (\(3 \times 3\)):
+\[
+\begin{bmatrix}
+1.0 & 2.0 & 3.0 \\
+4.0 & 5.0 & 6.0 \\
+7.0 & 8.0 & 9.0
+\end{bmatrix}
+\]
+Output (\(3 \times 3\)):
+\[
+\begin{bmatrix}
+1.0 & 3.0 & 6.0 \\
+5.0 & 12.0 & 21.0 \\
+12.0 & 27.0 & 45.0
+\end{bmatrix}
+\]
+For example, \(\text{output}[1][1] = \text{input}[0][0] + \text{input}[0][1] + \text{input}[1][0] + \text{input}[1][1] = 1 + 2 + 4 + 5 = 12\).
+
+
+Constraints
+
+ - 1 ≤ H, W ≤ 16,384
+ - Input values are in the range [-100, 100]
+ - All values are 32-bit floats
+ - Performance is measured with H = 8,192, W = 8,192
+
diff --git a/challenges/medium/77_integral_image/challenge.py b/challenges/medium/77_integral_image/challenge.py
new file mode 100644
index 00000000..9395067d
--- /dev/null
+++ b/challenges/medium/77_integral_image/challenge.py
@@ -0,0 +1,160 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+    def __init__(self):
+        super().__init__(
+            name="Integral Image",
+            atol=1.0,  # loose absolute tolerance: float32 prefix sums over large inputs accumulate rounding error
+            rtol=1e-05,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, H: int, W: int) -> None:
+        assert input.shape == (H, W)
+        assert output.shape == (H, W)
+        assert input.dtype == torch.float32
+        assert input.device.type == "cuda"  # NOTE(review): output device/dtype are assumed to match input — confirm
+
+        result = torch.cumsum(torch.cumsum(input, dim=0), dim=1)  # 2-D prefix sum: cumulative sum along dim 0, then dim 1
+        output.copy_(result)
+
+    def get_solve_signature(self) -> Dict[str, tuple]:
+        return {
+            "input": (ctypes.POINTER(ctypes.c_float), "in"),
+            "output": (ctypes.POINTER(ctypes.c_float), "out"),
+            "H": (ctypes.c_int, "in"),
+            "W": (ctypes.c_int, "in"),
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        dtype = torch.float32
+        input = torch.tensor(
+            [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]],
+            device="cuda",
+            dtype=dtype,
+        )
+        output = torch.empty((3, 3), device="cuda", dtype=dtype)
+        return {"input": input, "output": output, "H": 3, "W": 3}
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        dtype = torch.float32
+        tests = []
+
+        # single_element: 1x1 input; the integral image equals the input itself
+        tests.append(
+            {
+                "input": torch.tensor([[7.0]], device="cuda", dtype=dtype),
+                "output": torch.empty((1, 1), device="cuda", dtype=dtype),
+                "H": 1,
+                "W": 1,
+            }
+        )
+
+        # single_row: H == 1 degenerates to a 1-D prefix sum along W (mixed signs)
+        tests.append(
+            {
+                "input": torch.tensor([[1.0, -2.0, 3.0, -4.0]], device="cuda", dtype=dtype),
+                "output": torch.empty((1, 4), device="cuda", dtype=dtype),
+                "H": 1,
+                "W": 4,
+            }
+        )
+
+        # single_col: W == 1 degenerates to a 1-D prefix sum along H
+        tests.append(
+            {
+                "input": torch.tensor([[2.0], [5.0], [-1.0], [3.0]], device="cuda", dtype=dtype),
+                "output": torch.empty((4, 1), device="cuda", dtype=dtype),
+                "H": 4,
+                "W": 1,
+            }
+        )
+
+        # all_zeros_16x16: zero input must yield an all-zero table
+        tests.append(
+            {
+                "input": torch.zeros((16, 16), device="cuda", dtype=dtype),
+                "output": torch.empty((16, 16), device="cuda", dtype=dtype),
+                "H": 16,
+                "W": 16,
+            }
+        )
+
+        # power_of_2_square_32x32: small random square with power-of-two dimensions
+        tests.append(
+            {
+                "input": torch.empty((32, 32), device="cuda", dtype=dtype).uniform_(-5.0, 5.0),
+                "output": torch.empty((32, 32), device="cuda", dtype=dtype),
+                "H": 32,
+                "W": 32,
+            }
+        )
+
+        # power_of_2_square_128x128: larger power-of-two random square
+        tests.append(
+            {
+                "input": torch.empty((128, 128), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "output": torch.empty((128, 128), device="cuda", dtype=dtype),
+                "H": 128,
+                "W": 128,
+            }
+        )
+
+        # non_power_of_2_30x30: size that does not align to typical tile/block widths
+        tests.append(
+            {
+                "input": torch.empty((30, 30), device="cuda", dtype=dtype).uniform_(-3.0, 3.0),
+                "output": torch.empty((30, 30), device="cuda", dtype=dtype),
+                "H": 30,
+                "W": 30,
+            }
+        )
+
+        # non_power_of_2_100x100_negative: strictly non-positive values
+        tests.append(
+            {
+                "input": torch.empty((100, 100), device="cuda", dtype=dtype).uniform_(-10.0, 0.0),
+                "output": torch.empty((100, 100), device="cuda", dtype=dtype),
+                "H": 100,
+                "W": 100,
+            }
+        )
+
+        # non_square_255x33: rectangular case with odd dimensions (H != W)
+        tests.append(
+            {
+                "input": torch.empty((255, 33), device="cuda", dtype=dtype).uniform_(-2.0, 2.0),
+                "output": torch.empty((255, 33), device="cuda", dtype=dtype),
+                "H": 255,
+                "W": 33,
+            }
+        )
+
+        # realistic_1024x1024: larger random case closer to the performance shape
+        tests.append(
+            {
+                "input": torch.empty((1024, 1024), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+                "output": torch.empty((1024, 1024), device="cuda", dtype=dtype),
+                "H": 1024,
+                "W": 1024,
+            }
+        )
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        dtype = torch.float32
+        H = 8192  # matches the performance size stated in the challenge description
+        W = 8192
+        return {
+            "input": torch.empty((H, W), device="cuda", dtype=dtype).uniform_(-1.0, 1.0),
+            "output": torch.empty((H, W), device="cuda", dtype=dtype),
+            "H": H,
+            "W": W,
+        }
diff --git a/challenges/medium/77_integral_image/starter/starter.cu b/challenges/medium/77_integral_image/starter/starter.cu
new file mode 100644
index 00000000..f8642d6b
--- /dev/null
+++ b/challenges/medium/77_integral_image/starter/starter.cu
@@ -0,0 +1,4 @@
+#include <cuda_runtime.h>
+
+// input, output are device pointers to H*W row-major float32 values
+extern "C" void solve(const float* input, float* output, int H, int W) {}
diff --git a/challenges/medium/77_integral_image/starter/starter.cute.py b/challenges/medium/77_integral_image/starter/starter.cute.py
new file mode 100644
index 00000000..73f4f244
--- /dev/null
+++ b/challenges/medium/77_integral_image/starter/starter.cute.py
@@ -0,0 +1,8 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# input, output are GPU-resident tensors; write the (H, W) integral image into output
+@cute.jit
+def solve(input: cute.Tensor, output: cute.Tensor, H: cute.Int32, W: cute.Int32):
+    pass
diff --git a/challenges/medium/77_integral_image/starter/starter.jax.py b/challenges/medium/77_integral_image/starter/starter.jax.py
new file mode 100644
index 00000000..29f4bd2c
--- /dev/null
+++ b/challenges/medium/77_integral_image/starter/starter.jax.py
@@ -0,0 +1,9 @@
+import jax
+import jax.numpy as jnp
+
+
+# input is a tensor on the GPU; this variant takes no output parameter
+@jax.jit
+def solve(input: jax.Array, H: int, W: int) -> jax.Array:
+    # build and return the (H, W) integral image directly
+    pass
diff --git a/challenges/medium/77_integral_image/starter/starter.mojo b/challenges/medium/77_integral_image/starter/starter.mojo
new file mode 100644
index 00000000..d9b2aa8a
--- /dev/null
+++ b/challenges/medium/77_integral_image/starter/starter.mojo
@@ -0,0 +1,9 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# input, output are device pointers to H*W float32 values stored in row-major order
+@export
+def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], H: Int32, W: Int32):
+    pass
diff --git a/challenges/medium/77_integral_image/starter/starter.pytorch.py b/challenges/medium/77_integral_image/starter/starter.pytorch.py
new file mode 100644
index 00000000..2309480d
--- /dev/null
+++ b/challenges/medium/77_integral_image/starter/starter.pytorch.py
@@ -0,0 +1,6 @@
+import torch
+
+
+# input, output are GPU tensors; write the (H, W) integral image into output in place
+def solve(input: torch.Tensor, output: torch.Tensor, H: int, W: int):
+    pass
diff --git a/challenges/medium/77_integral_image/starter/starter.triton.py b/challenges/medium/77_integral_image/starter/starter.triton.py
new file mode 100644
index 00000000..90bc77fb
--- /dev/null
+++ b/challenges/medium/77_integral_image/starter/starter.triton.py
@@ -0,0 +1,8 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# input, output are GPU tensors; launch Triton kernel(s) that fill output with the integral image
+def solve(input: torch.Tensor, output: torch.Tensor, H: int, W: int):
+    pass