diff --git a/challenges/medium/73_bilateral_filter/challenge.html b/challenges/medium/73_bilateral_filter/challenge.html
new file mode 100644
index 00000000..861fd4b8
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/challenge.html
@@ -0,0 +1,64 @@
+
+ Implement a bilateral filter on a 2D image of 32-bit floating point values.
+ The bilateral filter is an edge-preserving smoothing filter: for each pixel it computes a
+ weighted average of its neighbors, where the weight combines a spatial Gaussian (favoring
+ nearby pixels) and a range Gaussian (favoring pixels with similar intensity), so that sharp
+ edges are preserved while flat regions are smoothed.
+ The image is stored in row-major order and boundary pixels are handled by clamping to the
+ nearest valid index (border replication).
+
+
+Implementation Requirements
+
+ - Use only native features (external libraries are not permitted)
+ - The
solve function signature must remain unchanged
+ - The final result must be stored in
output
+ -
+ For each output pixel at position \((i, j)\), compute:
+ \[
+ \text{output}[i,j] = \frac{\displaystyle\sum_{dy=-r}^{r}\sum_{dx=-r}^{r} w_s(dy,dx)\, w_r\bigl(\text{image}[i',j'] - \text{image}[i,j]\bigr)\, \text{image}[i',j']}
+ {\displaystyle\sum_{dy=-r}^{r}\sum_{dx=-r}^{r} w_s(dy,dx)\, w_r\bigl(\text{image}[i',j'] - \text{image}[i,j]\bigr)}
+ \]
+ where \(i' = \text{clamp}(i+dy, 0, H-1)\), \(j' = \text{clamp}(j+dx, 0, W-1)\),
+ \(w_s(dy,dx) = \exp\!\left(-\tfrac{dy^2+dx^2}{2\sigma_s^2}\right)\), and
+ \(w_r(\delta) = \exp\!\left(-\tfrac{\delta^2}{2\sigma_r^2}\right)\)
+
+
+
+Example
+
+ Input image (\(H=3, W=3\)), spatial_sigma = 1.0,
+ range_sigma = 0.5, radius = 1:
+ \[
+ \begin{bmatrix}
+ 1.0 & 1.0 & 1.0 \\
+ 1.0 & 0.0 & 1.0 \\
+ 1.0 & 1.0 & 1.0
+ \end{bmatrix}
+ \]
+ Output:
+ \[
+ \begin{bmatrix}
+ 0.9891 & 0.9812 & 0.9891 \\
+ 0.9812 & 0.3453 & 0.9812 \\
+ 0.9891 & 0.9812 & 0.9891
+ \end{bmatrix}
+ \]
+ The center pixel (value 0.0) is surrounded by neighbors all equal to 1.0. Because the
+ intensity difference of 1.0 is large relative to range_sigma = 0.5, the range
+ weights strongly suppress those neighbors, so the output at the center (0.3453) is far lower
+ than a plain Gaussian blur would produce (≈ 0.80). The outer pixels remain close to
+ 1.0 because their neighbors are mostly equal to themselves.
+
+
+Constraints
+
+ - 1 ≤
H, W ≤ 8192
+ - 1 ≤
radius ≤ 16
+ - Image values are 32-bit floats (arbitrary range)
+ - 0.1 ≤
spatial_sigma, range_sigma ≤ 10.0
+ -
+ Performance is measured with
H = 2,048, W = 2,048,
+ radius = 5, spatial_sigma = 3.0, range_sigma = 0.1
+
+
diff --git a/challenges/medium/73_bilateral_filter/challenge.py b/challenges/medium/73_bilateral_filter/challenge.py
new file mode 100644
index 00000000..ac1cf05d
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/challenge.py
@@ -0,0 +1,273 @@
+import ctypes
+from typing import Any, Dict, List
+
+import torch
+import torch.nn.functional as F
+from core.challenge_base import ChallengeBase
+
+
+class Challenge(ChallengeBase):
+ def __init__(self):
+ super().__init__(
+ name="Bilateral Filter",
+ atol=1e-04,
+ rtol=1e-04,
+ num_gpus=1,
+ access_tier="free",
+ )
+
+ def reference_impl(
+ self,
+ image: torch.Tensor,
+ output: torch.Tensor,
+ H: int,
+ W: int,
+ spatial_sigma: float,
+ range_sigma: float,
+ radius: int,
+ ):
+ assert image.shape == (H * W,)
+ assert output.shape == (H * W,)
+ assert image.dtype == torch.float32
+ assert output.dtype == torch.float32
+ assert image.device.type == "cuda"
+ assert output.device.type == "cuda"
+
+ r = int(radius)
+ img = image.view(H, W)
+
+ yy = torch.arange(-r, r + 1, device=image.device, dtype=torch.float32)
+ xx = torch.arange(-r, r + 1, device=image.device, dtype=torch.float32)
+ grid_y, grid_x = torch.meshgrid(yy, xx, indexing="ij")
+ spatial_weights = torch.exp(-(grid_y**2 + grid_x**2) / (2.0 * float(spatial_sigma) ** 2))
+
+ padded = (
+ F.pad(img.unsqueeze(0).unsqueeze(0), (r, r, r, r), mode="replicate")
+ .squeeze(0)
+ .squeeze(0)
+ )
+
+ out = torch.zeros(H, W, device=image.device, dtype=torch.float32)
+ norm = torch.zeros(H, W, device=image.device, dtype=torch.float32)
+ inv_2rs2 = 1.0 / (2.0 * float(range_sigma) ** 2)
+
+ for dy in range(2 * r + 1):
+ for dx in range(2 * r + 1):
+ neighbor = padded[dy : dy + H, dx : dx + W]
+ range_weight = torch.exp(-((neighbor - img) ** 2) * inv_2rs2)
+ weight = spatial_weights[dy, dx] * range_weight
+ out += weight * neighbor
+ norm += weight
+
+ output.copy_(out.view(-1) / norm.view(-1))
+
+ def get_solve_signature(self) -> Dict[str, tuple]:
+ return {
+ "image": (ctypes.POINTER(ctypes.c_float), "in"),
+ "output": (ctypes.POINTER(ctypes.c_float), "out"),
+ "H": (ctypes.c_int, "in"),
+ "W": (ctypes.c_int, "in"),
+ "spatial_sigma": (ctypes.c_float, "in"),
+ "range_sigma": (ctypes.c_float, "in"),
+ "radius": (ctypes.c_int, "in"),
+ }
+
+ def generate_example_test(self) -> Dict[str, Any]:
+ dtype = torch.float32
+ H, W = 3, 3
+ image = torch.tensor(
+ [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0], device="cuda", dtype=dtype
+ )
+ output = torch.zeros(H * W, device="cuda", dtype=dtype)
+ return {
+ "image": image,
+ "output": output,
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.0,
+ "range_sigma": 0.5,
+ "radius": 1,
+ }
+
+ def generate_functional_test(self) -> List[Dict[str, Any]]:
+ dtype = torch.float32
+ device = "cuda"
+ tests = []
+
+ # single_pixel
+ H, W = 1, 1
+ tests.append(
+ {
+ "image": torch.tensor([0.5], device=device, dtype=dtype),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.0,
+ "range_sigma": 0.5,
+ "radius": 1,
+ }
+ )
+
+ # two_by_two_zeros
+ H, W = 2, 2
+ tests.append(
+ {
+ "image": torch.zeros(H * W, device=device, dtype=dtype),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.0,
+ "range_sigma": 0.5,
+ "radius": 1,
+ }
+ )
+
+ # three_by_three_ring (matches example)
+ H, W = 3, 3
+ tests.append(
+ {
+ "image": torch.tensor(
+ [1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0], device=device, dtype=dtype
+ ),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.0,
+ "range_sigma": 0.5,
+ "radius": 1,
+ }
+ )
+
+ # four_by_four_negatives
+ H, W = 4, 4
+ tests.append(
+ {
+ "image": torch.tensor(
+ [
+ -1.0,
+ -1.0,
+ 1.0,
+ 1.0,
+ -1.0,
+ -1.0,
+ 1.0,
+ 1.0,
+ -1.0,
+ -1.0,
+ 1.0,
+ 1.0,
+ -1.0,
+ -1.0,
+ 1.0,
+ 1.0,
+ ],
+ device=device,
+ dtype=dtype,
+ ),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.5,
+ "range_sigma": 0.8,
+ "radius": 1,
+ }
+ )
+
+ # power_of_two_16x16
+ H, W = 16, 16
+ tests.append(
+ {
+ "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.5,
+ "range_sigma": 0.3,
+ "radius": 2,
+ }
+ )
+
+ # power_of_two_64x64_mixed
+ H, W = 64, 64
+ tests.append(
+ {
+ "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(-1.0, 1.0),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 2.0,
+ "range_sigma": 0.5,
+ "radius": 2,
+ }
+ )
+
+ # non_power_of_two_100x100
+ H, W = 100, 100
+ tests.append(
+ {
+ "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 2.0,
+ "range_sigma": 0.3,
+ "radius": 3,
+ }
+ )
+
+ # non_power_of_two_255x255_mixed
+ H, W = 255, 255
+ tests.append(
+ {
+ "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(-1.0, 1.0),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 1.5,
+ "range_sigma": 0.4,
+ "radius": 2,
+ }
+ )
+
+ # realistic_512x512
+ H, W = 512, 512
+ tests.append(
+ {
+ "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 2.0,
+ "range_sigma": 0.2,
+ "radius": 3,
+ }
+ )
+
+ # realistic_1000x1000
+ H, W = 1000, 1000
+ tests.append(
+ {
+ "image": torch.empty(H * W, device=device, dtype=dtype).uniform_(0.0, 1.0),
+ "output": torch.zeros(H * W, device=device, dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 3.0,
+ "range_sigma": 0.1,
+ "radius": 5,
+ }
+ )
+
+ return tests
+
+ def generate_performance_test(self) -> Dict[str, Any]:
+ dtype = torch.float32
+ H, W = 2048, 2048
+ return {
+ "image": torch.empty(H * W, device="cuda", dtype=dtype).uniform_(0.0, 1.0),
+ "output": torch.zeros(H * W, device="cuda", dtype=dtype),
+ "H": H,
+ "W": W,
+ "spatial_sigma": 3.0,
+ "range_sigma": 0.1,
+ "radius": 5,
+ }
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.cu b/challenges/medium/73_bilateral_filter/starter/starter.cu
new file mode 100644
index 00000000..76760898
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.cu
@@ -0,0 +1,5 @@
+#include <cuda_runtime.h>
+
+// image, output are device pointers
+extern "C" void solve(const float* image, float* output, int H, int W, float spatial_sigma,
+ float range_sigma, int radius) {}
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.cute.py b/challenges/medium/73_bilateral_filter/starter/starter.cute.py
new file mode 100644
index 00000000..ef93ace3
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.cute.py
@@ -0,0 +1,16 @@
+import cutlass
+import cutlass.cute as cute
+
+
+# image, output are tensors on the GPU
+@cute.jit
+def solve(
+ image: cute.Tensor,
+ output: cute.Tensor,
+ H: cute.Int32,
+ W: cute.Int32,
+ spatial_sigma: cute.Float32,
+ range_sigma: cute.Float32,
+ radius: cute.Int32,
+):
+ pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.jax.py b/challenges/medium/73_bilateral_filter/starter/starter.jax.py
new file mode 100644
index 00000000..3869e954
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.jax.py
@@ -0,0 +1,11 @@
+import jax
+import jax.numpy as jnp
+
+
+# image is a tensor on GPU
+@jax.jit
+def solve(
+ image: jax.Array, H: int, W: int, spatial_sigma: float, range_sigma: float, radius: int
+) -> jax.Array:
+ # return output tensor directly
+ pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.mojo b/challenges/medium/73_bilateral_filter/starter/starter.mojo
new file mode 100644
index 00000000..ad6b1a11
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.mojo
@@ -0,0 +1,18 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+
+# image, output are device pointers
+@export
+def solve(
+ image: UnsafePointer[Float32],
+ output: UnsafePointer[Float32],
+ H: Int32,
+ W: Int32,
+ spatial_sigma: Float32,
+ range_sigma: Float32,
+ radius: Int32,
+):
+ pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.pytorch.py b/challenges/medium/73_bilateral_filter/starter/starter.pytorch.py
new file mode 100644
index 00000000..363a237b
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.pytorch.py
@@ -0,0 +1,14 @@
+import torch
+
+
+# image, output are tensors on the GPU
+def solve(
+ image: torch.Tensor,
+ output: torch.Tensor,
+ H: int,
+ W: int,
+ spatial_sigma: float,
+ range_sigma: float,
+ radius: int,
+):
+ pass
diff --git a/challenges/medium/73_bilateral_filter/starter/starter.triton.py b/challenges/medium/73_bilateral_filter/starter/starter.triton.py
new file mode 100644
index 00000000..84bc0f46
--- /dev/null
+++ b/challenges/medium/73_bilateral_filter/starter/starter.triton.py
@@ -0,0 +1,16 @@
+import torch
+import triton
+import triton.language as tl
+
+
+# image, output are tensors on the GPU
+def solve(
+ image: torch.Tensor,
+ output: torch.Tensor,
+ H: int,
+ W: int,
+ spatial_sigma: float,
+ range_sigma: float,
+ radius: int,
+):
+ pass