Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions challenges/medium/77_integral_image/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<p>
Given a 2D image of shape <code>H</code> &times; <code>W</code> containing 32-bit floating point values,
compute its integral image (also known as a summed area table). The value at each output position
<code>output[i][j]</code> is the sum of all input values in the rectangle from the top-left corner
<code>(0, 0)</code> to position <code>(i, j)</code>, inclusive. All values are stored in row-major order.
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in <code>output</code></li>
<li>Read exclusively from <code>input</code> and write exclusively to <code>output</code> (do not update <code>input</code>)</li>
</ul>

<h2>Example</h2>
<p>
Input (\(3 \times 3\)):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 \\
4.0 & 5.0 & 6.0 \\
7.0 & 8.0 & 9.0
\end{bmatrix}
\]
Output (\(3 \times 3\)):
\[
\begin{bmatrix}
1.0 & 3.0 & 6.0 \\
5.0 & 12.0 & 21.0 \\
12.0 & 27.0 & 45.0
\end{bmatrix}
\]
For example, \(\text{output}[1][1] = \text{input}[0][0] + \text{input}[0][1] + \text{input}[1][0] + \text{input}[1][1] = 1 + 2 + 4 + 5 = 12\).
</p>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>H</code>, <code>W</code> &le; 16,384</li>
<li>Input values are in the range [-100, 100]</li>
<li>All values are 32-bit floats</li>
<li>Performance is measured with <code>H</code> = 8,192, <code>W</code> = 8,192</li>
</ul>
160 changes: 160 additions & 0 deletions challenges/medium/77_integral_image/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
import ctypes
from typing import Any, Dict, List

import torch
from core.challenge_base import ChallengeBase


class Challenge(ChallengeBase):
    """Integral image (summed area table) challenge on an H x W float32 image."""

    def __init__(self):
        """Pass this challenge's name, tolerances, GPU count and access tier to the base class."""
        settings = dict(
            name="Integral Image",
            atol=1.0,
            rtol=1e-05,
            num_gpus=1,
            access_tier="free",
        )
        super().__init__(**settings)

    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, H: int, W: int):
        """Write the 2D inclusive prefix sum of ``input`` into ``output``.

        Asserts that both tensors have shape ``(H, W)`` and that ``input`` is a
        float32 tensor on a CUDA device. ``output`` is overwritten in place;
        nothing is returned.
        """
        expected_shape = (H, W)
        assert input.shape == expected_shape
        assert output.shape == expected_shape
        assert input.dtype == torch.float32
        assert input.device.type == "cuda"

        # Running sum along dim 0 first, then along dim 1 of that result.
        summed_along_dim0 = torch.cumsum(input, dim=0)
        output.copy_(torch.cumsum(summed_along_dim0, dim=1))

    def get_solve_signature(self) -> Dict[str, tuple]:
        """Return the ctypes type and direction tag for each ``solve`` argument."""
        float_ptr = ctypes.POINTER(ctypes.c_float)
        return {
            "input": (float_ptr, "in"),
            "output": (float_ptr, "out"),
            "H": (ctypes.c_int, "in"),
            "W": (ctypes.c_int, "in"),
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Build the fixed 3 x 3 case holding the values 1.0 through 9.0."""
        rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]
        image = torch.tensor(rows, device="cuda", dtype=torch.float32)
        result = torch.empty((3, 3), device="cuda", dtype=torch.float32)
        return {"input": image, "output": result, "H": 3, "W": 3}

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Build the functional cases: four fixed inputs, then six random ones."""
        dtype = torch.float32

        def make_case(image: torch.Tensor) -> Dict[str, Any]:
            # Pair an input image with an uninitialised output of the same shape.
            height, width = image.shape
            return {
                "input": image,
                "output": torch.empty((height, width), device="cuda", dtype=dtype),
                "H": height,
                "W": width,
            }

        fixed_images = [
            # single_element
            torch.tensor([[7.0]], device="cuda", dtype=dtype),
            # single_row
            torch.tensor([[1.0, -2.0, 3.0, -4.0]], device="cuda", dtype=dtype),
            # single_col
            torch.tensor([[2.0], [5.0], [-1.0], [3.0]], device="cuda", dtype=dtype),
            # all_zeros_16x16
            torch.zeros((16, 16), device="cuda", dtype=dtype),
        ]
        tests = [make_case(image) for image in fixed_images]

        # (H, W, low, high) for each uniformly random case, in generation order.
        random_specs = [
            (32, 32, -5.0, 5.0),  # power_of_2_square_32x32
            (128, 128, -1.0, 1.0),  # power_of_2_square_128x128
            (30, 30, -3.0, 3.0),  # non_power_of_2_30x30
            (100, 100, -10.0, 0.0),  # non_power_of_2_100x100_negative
            (255, 33, -2.0, 2.0),  # non_square_255x33
            (1024, 1024, -1.0, 1.0),  # realistic_1024x1024
        ]
        for height, width, low, high in random_specs:
            image = torch.empty((height, width), device="cuda", dtype=dtype).uniform_(low, high)
            tests.append(make_case(image))

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        """Build the 8192 x 8192 case with inputs drawn uniformly from [-1.0, 1.0)."""
        side = 8192
        shape = (side, side)
        image = torch.empty(shape, device="cuda", dtype=torch.float32).uniform_(-1.0, 1.0)
        result = torch.empty(shape, device="cuda", dtype=torch.float32)
        return {"input": image, "output": result, "H": side, "W": side}
4 changes: 4 additions & 0 deletions challenges/medium/77_integral_image/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#include <cuda_runtime.h>

// Starter stub for the integral image challenge (intentionally empty).
// input, output are device pointers
// Per the challenge statement: input holds H x W 32-bit floats in row-major
// order, and output[i][j] must receive the sum of all input values in the
// rectangle from (0, 0) to (i, j), inclusive. input must not be modified.
extern "C" void solve(const float* input, float* output, int H, int W) {}
8 changes: 8 additions & 0 deletions challenges/medium/77_integral_image/starter/starter.cute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import cutlass
import cutlass.cute as cute


# Starter stub for the integral image challenge (intentionally empty).
# input, output are tensors on the GPU
# Per the challenge statement: input is an H x W image of 32-bit floats and
# output[i][j] must receive the sum of all input values in the rectangle from
# (0, 0) to (i, j), inclusive. input must not be modified.
@cute.jit
def solve(input: cute.Tensor, output: cute.Tensor, H: cute.Int32, W: cute.Int32):
pass
9 changes: 9 additions & 0 deletions challenges/medium/77_integral_image/starter/starter.jax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import jax
import jax.numpy as jnp


# Starter stub for the integral image challenge (intentionally empty).
# input is a tensor on the GPU
# Per the challenge statement: input is an H x W image of 32-bit floats; the
# result at [i][j] is the sum of all input values in the rectangle from
# (0, 0) to (i, j), inclusive.
@jax.jit
def solve(input: jax.Array, H: int, W: int) -> jax.Array:
# return output tensor directly
pass
9 changes: 9 additions & 0 deletions challenges/medium/77_integral_image/starter/starter.mojo
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# Starter stub for the integral image challenge (intentionally empty).
# input, output are device pointers
# Per the challenge statement: input holds H x W 32-bit floats in row-major
# order, and output[i][j] must receive the sum of all input values in the
# rectangle from (0, 0) to (i, j), inclusive. input must not be modified.
@export
def solve(input: UnsafePointer[Float32], output: UnsafePointer[Float32], H: Int32, W: Int32):
pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import torch


# Starter stub for the integral image challenge (intentionally empty).
# input, output are tensors on the GPU
# Per the challenge statement: input is an H x W image of 32-bit floats and
# output[i][j] must receive the sum of all input values in the rectangle from
# (0, 0) to (i, j), inclusive. input must not be modified.
def solve(input: torch.Tensor, output: torch.Tensor, H: int, W: int):
pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import torch
import triton
import triton.language as tl


# Starter stub for the integral image challenge (intentionally empty).
# input, output are tensors on the GPU
# Per the challenge statement: input is an H x W image of 32-bit floats and
# output[i][j] must receive the sum of all input values in the rectangle from
# (0, 0) to (i, j), inclusive. input must not be modified.
def solve(input: torch.Tensor, output: torch.Tensor, H: int, W: int):
pass
Loading