diff --git a/Cargo.toml b/Cargo.toml
index 03c104ca..681d4619 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,6 +22,7 @@ gltf = { version = "1.1", default-features = false }
 log = "0.4"
 mint = "0.5"
 naga = { version = "22", features = ["wgsl-in"] }
+nanorand = { version = "0.7", default-features = false }
 profiling = "1"
 slab = "0.4"
 strum = { version = "0.25", features = ["derive"] }
@@ -79,7 +80,7 @@ glam = { workspace = true }
 log = { workspace = true }
 mint = { workspace = true, features = ["serde"] }
 naga = { workspace = true }
-nanorand = { version = "0.7", default-features = false, features = ["wyrand"] }
+nanorand = { workspace = true, features = ["wyrand"] }
 profiling = { workspace = true }
 ron = "0.8"
 serde = { version = "1", features = ["serde_derive"] }
@@ -95,7 +96,6 @@ egui-winit = "0.28"
 console_error_panic_hook = "0.1.7"
 console_log = "1"
 web-sys = { workspace = true, features = ["Window"] }
-getrandom = { version = "0.2", features = ["js"] }
 
 [target.'cfg(any(target_os = "windows", target_os = "linux"))'.dev-dependencies]
 renderdoc = "0.12"
diff --git a/blade-graphics/src/util.rs b/blade-graphics/src/util.rs
index 23e876e0..f0d2013e 100644
--- a/blade-graphics/src/util.rs
+++ b/blade-graphics/src/util.rs
@@ -94,14 +94,19 @@ impl super::TextureFormat {
     }
 }
 
+impl super::Extent {
+    pub fn group_by(&self, size: [u32; 3]) -> [u32; 3] {
+        [
+            (self.width + size[0] - 1) / size[0],
+            (self.height + size[1] - 1) / size[1],
+            (self.depth + size[2] - 1) / size[2],
+        ]
+    }
+}
+
 impl super::ComputePipeline {
     /// Return the dispatch group counts sufficient to cover the given extent.
     pub fn get_dispatch_for(&self, extent: super::Extent) -> [u32; 3] {
-        let wg_size = self.get_workgroup_size();
-        [
-            (extent.width + wg_size[0] - 1) / wg_size[0],
-            (extent.height + wg_size[1] - 1) / wg_size[1],
-            (extent.depth + wg_size[2] - 1) / wg_size[2],
-        ]
+        extent.group_by(self.get_workgroup_size())
     }
 }
diff --git a/blade-helpers/src/hud.rs b/blade-helpers/src/hud.rs
index a135a337..ff4cf5a5 100644
--- a/blade-helpers/src/hud.rs
+++ b/blade-helpers/src/hud.rs
@@ -22,9 +22,10 @@ impl ExposeHud for blade_render::RayConfig {
             egui::widgets::Slider::new(&mut self.spatial_tap_history, 0..=50)
                 .text("Spatial tap history"),
         );
+        ui.add(egui::widgets::Slider::new(&mut self.group_mixer, 1..=10).text("Group mixer"));
         ui.add(
-            egui::widgets::Slider::new(&mut self.spatial_radius, 1..=50)
-                .text("Spatial radius (px)"),
+            egui::widgets::Slider::new(&mut self.spatial_min_distance, 1..=10)
+                .text("Spatial minimum distance (px)"),
         );
         ui.add(
             egui::widgets::Slider::new(&mut self.t_start, 0.001..=0.5)
diff --git a/blade-render/code/blur.wgsl b/blade-render/code/blur.wgsl
index 3207ef60..3aec665b 100644
--- a/blade-render/code/blur.wgsl
+++ b/blade-render/code/blur.wgsl
@@ -1,5 +1,5 @@
 #include "camera.inc.wgsl"
-#include "gbuf.inc.wgsl"
+#include "motion.inc.wgsl"
 #include "quaternion.inc.wgsl"
 #include "surface.inc.wgsl"
 
diff --git a/blade-render/code/color.inc.wgsl b/blade-render/code/color.inc.wgsl
new file mode 100644
index 00000000..8e84cfda
--- /dev/null
+++ b/blade-render/code/color.inc.wgsl
@@ -0,0 +1,19 @@
+fn hsv_to_rgb(h: f32, s: f32, v: f32) -> vec3<f32> {
+    let c = v * s;
+    let x = c * (1.0 - abs((h / 60.0) % 2.0 - 1.0));
+    var q = vec3<f32>(v - c);
+    if (h < 60.0) {
+        q.r += c; q.g += x;
+    } else if (h < 120.0) {
+        q.g += c; q.r += x;
+    } else if (h < 180.0) {
+        q.g += c; q.b += x;
+    } else if (h < 240.0) {
+        q.b += c; q.g += x;
+    } else if (h < 300.0) {
+        q.b += c; q.r += x;
+    } else {
+        q.r += c; q.b += x;
+    }
+    return q;
+}
diff --git a/blade-render/code/fill-gbuf.wgsl b/blade-render/code/fill-gbuf.wgsl
deleted file mode 100644
index 346edf51..00000000
--- a/blade-render/code/fill-gbuf.wgsl
+++ /dev/null
@@ -1,204 +0,0 @@
-#include "quaternion.inc.wgsl"
-#include "camera.inc.wgsl"
-#include "debug.inc.wgsl"
-#include "debug-param.inc.wgsl"
-#include "gbuf.inc.wgsl"
-
-//TODO: use proper WGSL
-const RAY_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
-
-// Has to match the host!
-struct Vertex {
-    pos: vec3<f32>,
-    bitangent_sign: f32,
-    tex_coords: vec2<f32>,
-    normal: u32,
-    tangent: u32,
-}
-struct VertexBuffer {
-    data: array<Vertex>,
-}
-struct IndexBuffer {
-    data: array<u32>,
-}
-var<storage, read> vertex_buffers: binding_array<VertexBuffer>;
-var<storage, read> index_buffers: binding_array<IndexBuffer>;
-var textures: binding_array<texture_2d<f32>>;
-var sampler_linear: sampler;
-var sampler_nearest: sampler;
-
-struct HitEntry {
-    index_buf: u32,
-    vertex_buf: u32,
-    winding: f32,
-    // packed quaternion
-    geometry_to_world_rotation: u32,
-    geometry_to_object: mat4x3<f32>,
-    prev_object_to_world: mat4x3<f32>,
-    base_color_texture: u32,
-    // packed color factor
-    base_color_factor: u32,
-    normal_texture: u32,
-}
-var<storage, read> hit_entries: array<HitEntry>;
-
-var<uniform> camera: CameraParams;
-var<uniform> prev_camera: CameraParams;
-var<uniform> debug: DebugParams;
-var acc_struct: acceleration_structure;
-
-var out_depth: texture_storage_2d<r32float, write>;
-var out_flat_normal: texture_storage_2d<rgba8snorm, write>;
-var out_basis: texture_storage_2d<rgba8snorm, write>;
-var out_albedo: texture_storage_2d<rgba8unorm, write>;
-var out_motion: texture_storage_2d<rg8snorm, write>;
-var out_debug: texture_storage_2d<rgba8unorm, write>;
-
-fn decode_normal(raw: u32) -> vec3<f32> {
-    return unpack4x8snorm(raw).xyz;
-}
-
-fn debug_raw_normal(pos: vec3<f32>, normal_raw: u32, rotation: vec4<f32>, debug_len: f32, color: u32) {
-    let nw = normalize(qrot(rotation, decode_normal(normal_raw)));
-    debug_line(pos, pos + debug_len * nw, color);
-}
-
-@compute @workgroup_size(8, 4)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    if (any(global_id.xy >= camera.target_size)) {
-        return;
-    }
-
-    var rq: ray_query;
-    let ray_dir = get_ray_direction(camera, vec2<i32>(global_id.xy));
-    rayQueryInitialize(&rq, acc_struct, RayDesc(RAY_FLAG_CULL_NO_OPAQUE, 0xFFu, 0.0, camera.depth, camera.position, ray_dir));
-    rayQueryProceed(&rq);
-    let intersection = rayQueryGetCommittedIntersection(&rq);
-
-    var depth = 0.0;
-    var basis = vec4<f32>(0.0);
-    var flat_normal = vec3<f32>(0.0);
-    var albedo = vec3<f32>(1.0);
-    var motion = vec2<f32>(0.0);
-    let enable_debug = all(global_id.xy == debug.mouse_pos);
-
-    if (intersection.kind != RAY_QUERY_INTERSECTION_NONE) {
-        let entry = hit_entries[intersection.instance_custom_index + intersection.geometry_index];
-        depth = intersection.t;
-
-        var indices = intersection.primitive_index * 3u + vec3<u32>(0u, 1u, 2u);
-        if (entry.index_buf != ~0u) {
-            let iptr = &index_buffers[entry.index_buf].data;
-            indices = vec3<u32>((*iptr)[indices.x], (*iptr)[indices.y], (*iptr)[indices.z]);
-        }
-
-        let vptr = &vertex_buffers[entry.vertex_buf].data;
-        let vertices = array<Vertex, 3>(
-            (*vptr)[indices.x],
-            (*vptr)[indices.y],
-            (*vptr)[indices.z],
-        );
-
-        let positions_object = entry.geometry_to_object * mat3x4(
-            vec4<f32>(vertices[0].pos, 1.0), vec4<f32>(vertices[1].pos, 1.0), vec4<f32>(vertices[2].pos, 1.0)
-        );
-        let positions = intersection.object_to_world * mat3x4(
-            vec4<f32>(positions_object[0], 1.0), vec4<f32>(positions_object[1], 1.0), vec4<f32>(positions_object[2], 1.0)
-        );
-        flat_normal = entry.winding * normalize(cross(positions[1].xyz - positions[0].xyz, positions[2].xyz - positions[0].xyz));
-
-        let barycentrics = vec3<f32>(1.0 - intersection.barycentrics.x - intersection.barycentrics.y, intersection.barycentrics);
-        let position_object = vec4<f32>(positions_object * barycentrics, 1.0);
-        let tex_coords = mat3x2(vertices[0].tex_coords, vertices[1].tex_coords, vertices[2].tex_coords) * barycentrics;
-        let normal_geo = normalize(mat3x3(decode_normal(vertices[0].normal), decode_normal(vertices[1].normal), decode_normal(vertices[2].normal)) * barycentrics);
-        let tangent_geo = normalize(mat3x3(decode_normal(vertices[0].tangent), decode_normal(vertices[1].tangent), decode_normal(vertices[2].tangent)) * barycentrics);
-        let bitangent_geo = normalize(cross(normal_geo, tangent_geo)) * vertices[0].bitangent_sign;
-
-        let lod = 0.0; //TODO: this is actually complicated
-
-        let geo_to_world_rot = normalize(unpack4x8snorm(entry.geometry_to_world_rotation));
-        let tangent_space_geo = mat3x3(tangent_geo, bitangent_geo, normal_geo);
-        var normal_local: vec3<f32>;
-        if ((debug.texture_flags & DebugTextureFlags_NORMAL) != 0u) {
-            normal_local = vec3<f32>(0.0, 0.0, 1.0); // ignore normal map
-        } else {
-            let n_xy = textureSampleLevel(textures[entry.normal_texture], sampler_linear, tex_coords, lod).xy;
-            normal_local = vec3<f32>(n_xy, sqrt(max(0.0, 1.0 - dot(n_xy.xy, n_xy.xy))));
-        }
-        var normal = qrot(geo_to_world_rot, tangent_space_geo * normal_local);
-        basis = shortest_arc_quat(vec3<f32>(0.0, 0.0, 1.0), normalize(normal));
-
-        let hit_position = camera.position + intersection.t * ray_dir;
-        if (enable_debug) {
-            debug_buf.entry.custom_index = intersection.instance_custom_index;
-            debug_buf.entry.depth = intersection.t;
-            debug_buf.entry.tex_coords = tex_coords;
-            debug_buf.entry.base_color_texture = entry.base_color_texture;
-            debug_buf.entry.normal_texture = entry.normal_texture;
-            debug_buf.entry.position = hit_position;
-            debug_buf.entry.flat_normal = flat_normal;
-        }
-        if (enable_debug && (debug.draw_flags & DebugDrawFlags_SPACE) != 0u) {
-            let normal_len = 0.15 * intersection.t;
-            let side = 0.05 * intersection.t;
-            debug_line(hit_position, hit_position + normal_len * qrot(geo_to_world_rot, normal_geo), 0xFFFFFFu);
-            debug_line(hit_position - side * tangent_geo, hit_position + side * tangent_geo, 0x808080u);
-            debug_line(hit_position - side * bitangent_geo, hit_position + side * bitangent_geo, 0x808080u);
-        }
-        if (enable_debug && (debug.draw_flags & DebugDrawFlags_GEOMETRY) != 0u) {
-            let debug_len = intersection.t * 0.2;
-            debug_line(positions[0].xyz, positions[1].xyz, 0x00FFFFu);
-            debug_line(positions[1].xyz, positions[2].xyz, 0x00FFFFu);
-            debug_line(positions[2].xyz, positions[0].xyz, 0x00FFFFu);
-            let poly_center = (positions[0].xyz + positions[1].xyz + positions[2].xyz) / 3.0;
-            debug_line(poly_center, poly_center + 0.2 * debug_len * flat_normal, 0xFF00FFu);
-            // note: dynamic indexing into positions isn't allowed by WGSL yet
-            debug_raw_normal(positions[0].xyz, vertices[0].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
-            debug_raw_normal(positions[1].xyz, vertices[1].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
-            debug_raw_normal(positions[2].xyz, vertices[2].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
-            // draw tangent space
-            debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(1.0, 0.0, 0.0)), 0x0000FFu);
-            debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 1.0, 0.0)), 0x00FF00u);
-            debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 0.0, 1.0)), 0xFF0000u);
-        }
-
-        let base_color_factor = unpack4x8unorm(entry.base_color_factor);
-        if ((debug.texture_flags & DebugTextureFlags_ALBEDO) != 0u) {
-            albedo = base_color_factor.xyz;
-        } else {
-            let base_color_sample = textureSampleLevel(textures[entry.base_color_texture], sampler_linear, tex_coords, lod);
-            albedo = (base_color_factor * base_color_sample).xyz;
-        }
-
-        if (debug.view_mode == DebugMode_HitConsistency) {
-            let reprojected = get_projected_pixel(camera, hit_position);
-            let barycentrics_pos_diff = (intersection.object_to_world * position_object).xyz - hit_position;
-            let camera_projection_diff = vec2<f32>(global_id.xy) - vec2<f32>(reprojected);
-            let consistency = vec4<f32>(length(barycentrics_pos_diff), length(camera_projection_diff), 0.0, 0.0);
-            textureStore(out_debug, global_id.xy, consistency);
-        }
-
-        let prev_position = (entry.prev_object_to_world * position_object).xyz;
-        let prev_screen = get_projected_pixel_float(prev_camera, prev_position);
-        //TODO: consider just storing integers here?
-        //TODO: technically this "0.5" is just a waste compute on both packing and unpacking
-        motion = prev_screen - vec2<f32>(global_id.xy) - 0.5;
-        if (debug.view_mode == DebugMode_Motion) {
-            textureStore(out_debug, global_id.xy, vec4<f32>(motion * MOTION_SCALE + vec2<f32>(0.5), 0.0, 1.0));
-        }
-    } else {
-        if (enable_debug) {
-            debug_buf.entry = DebugEntry();
-        }
-        if (debug.view_mode != DebugMode_Final) {
-            textureStore(out_debug, global_id.xy, vec4<f32>(0.0));
-        }
-    }
-
-    // TODO: option to avoid writing data for the sky
-    textureStore(out_depth, global_id.xy, vec4<f32>(depth, 0.0, 0.0, 0.0));
-    textureStore(out_basis, global_id.xy, basis);
-    textureStore(out_flat_normal, global_id.xy, vec4<f32>(flat_normal, 0.0));
-    textureStore(out_albedo, global_id.xy, vec4<f32>(albedo, 0.0));
-    textureStore(out_motion, global_id.xy, vec4<f32>(motion * MOTION_SCALE, 0.0, 0.0));
-}
diff --git a/blade-render/code/gbuf.inc.wgsl b/blade-render/code/gbuf.inc.wgsl
deleted file mode 100644
index ecb4642d..00000000
--- a/blade-render/code/gbuf.inc.wgsl
+++ /dev/null
@@ -1,2 +0,0 @@
-const MOTION_SCALE: f32 = 0.02;
-const USE_MOTION_VECTORS: bool = true;
\ No newline at end of file
diff --git a/blade-render/code/geometry.inc.wgsl b/blade-render/code/geometry.inc.wgsl
new file mode 100644
index 00000000..99b771b1
--- /dev/null
+++ b/blade-render/code/geometry.inc.wgsl
@@ -0,0 +1,182 @@
+//TODO: https://github.com/gfx-rs/wgpu/pull/5429
+const RAY_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
+
+// Has to match the host!
+struct Vertex {
+    pos: vec3<f32>,
+    bitangent_sign: f32,
+    tex_coords: vec2<f32>,
+    normal: u32,
+    tangent: u32,
+}
+struct VertexBuffer {
+    data: array<Vertex>,
+}
+struct IndexBuffer {
+    data: array<u32>,
+}
+var<storage, read> vertex_buffers: binding_array<VertexBuffer>;
+var<storage, read> index_buffers: binding_array<IndexBuffer>;
+var textures: binding_array<texture_2d<f32>>;
+
+struct HitEntry {
+    index_buf: u32,
+    vertex_buf: u32,
+    winding: f32,
+    // packed quaternion
+    geometry_to_world_rotation: u32,
+    geometry_to_object: mat4x3<f32>,
+    prev_object_to_world: mat4x3<f32>,
+    base_color_texture: u32,
+    // packed color factor
+    base_color_factor: u32,
+    normal_texture: u32,
+}
+var<storage, read> hit_entries: array<HitEntry>;
+
+fn decode_normal(raw: u32) -> vec3<f32> {
+    return unpack4x8snorm(raw).xyz;
+}
+
+fn debug_raw_normal(pos: vec3<f32>, normal_raw: u32, rotation: vec4<f32>, debug_len: f32, color: u32) {
+    let nw = normalize(qrot(rotation, decode_normal(normal_raw)));
+    debug_line(pos, pos + debug_len * nw, color);
+}
+
+struct RichSurface {
+    inner: Surface,
+    position: vec3<f32>,
+    albedo: vec3<f32>,
+    motion: vec2<f32>,
+}
+
+fn fetch_geometry(pixel_coord: vec2<i32>, enable_debug: bool, is_primary: bool) -> RichSurface {
+    var rq: ray_query;
+    let ray_dir = get_ray_direction(camera, pixel_coord);
+    rayQueryInitialize(&rq, acc_struct, RayDesc(RAY_FLAG_CULL_NO_OPAQUE, 0xFFu, 0.0, camera.depth, camera.position, ray_dir));
+    rayQueryProceed(&rq);
+    let intersection = rayQueryGetCommittedIntersection(&rq);
+
+    var rs = RichSurface();
+    rs.albedo = vec3<f32>(1.0);
+
+    if (intersection.kind == RAY_QUERY_INTERSECTION_NONE) {
+        if (enable_debug) {
+            debug_buf.entry = DebugEntry();
+        }
+        return rs;
+    }
+
+    let entry = hit_entries[intersection.instance_custom_index + intersection.geometry_index];
+
+    var indices = intersection.primitive_index * 3u + vec3<u32>(0u, 1u, 2u);
+    if (entry.index_buf != ~0u) {
+        let iptr = &index_buffers[entry.index_buf].data;
+        indices = vec3<u32>((*iptr)[indices.x], (*iptr)[indices.y], (*iptr)[indices.z]);
+    }
+
+    let vptr = &vertex_buffers[entry.vertex_buf].data;
+    let vertices = array<Vertex, 3>(
+        (*vptr)[indices.x],
+        (*vptr)[indices.y],
+        (*vptr)[indices.z],
+    );
+
+    let positions_object = entry.geometry_to_object * mat3x4(
+        vec4<f32>(vertices[0].pos, 1.0), vec4<f32>(vertices[1].pos, 1.0), vec4<f32>(vertices[2].pos, 1.0)
+    );
+    let positions = intersection.object_to_world * mat3x4(
+        vec4<f32>(positions_object[0], 1.0), vec4<f32>(positions_object[1], 1.0), vec4<f32>(positions_object[2], 1.0)
+    );
+    let flat_normal = entry.winding * normalize(cross(positions[1].xyz - positions[0].xyz, positions[2].xyz - positions[0].xyz));
+
+    let barycentrics = vec3<f32>(1.0 - intersection.barycentrics.x - intersection.barycentrics.y, intersection.barycentrics);
+    let position_object = vec4<f32>(positions_object * barycentrics, 1.0);
+    let tex_coords = mat3x2(vertices[0].tex_coords, vertices[1].tex_coords, vertices[2].tex_coords) * barycentrics;
+    let normal_geo = normalize(mat3x3(decode_normal(vertices[0].normal), decode_normal(vertices[1].normal), decode_normal(vertices[2].normal)) * barycentrics);
+    let tangent_geo = normalize(mat3x3(decode_normal(vertices[0].tangent), decode_normal(vertices[1].tangent), decode_normal(vertices[2].tangent)) * barycentrics);
+    let bitangent_geo = normalize(cross(normal_geo, tangent_geo)) * vertices[0].bitangent_sign;
+
+    let lod = 0.0; //TODO: this is actually complicated
+
+    let geo_to_world_rot = normalize(unpack4x8snorm(entry.geometry_to_world_rotation));
+    let tangent_space_geo = mat3x3(tangent_geo, bitangent_geo, normal_geo);
+    var normal_local: vec3<f32>;
+    if ((debug.texture_flags & DebugTextureFlags_NORMAL) != 0u) {
+        normal_local = vec3<f32>(0.0, 0.0, 1.0); // ignore normal map
+    } else {
+        let n_xy = textureSampleLevel(textures[entry.normal_texture], sampler_linear, tex_coords, lod).xy;
+        normal_local = vec3<f32>(n_xy, sqrt(max(0.0, 1.0 - dot(n_xy.xy, n_xy.xy))));
+    }
+    let normal = qrot(geo_to_world_rot, tangent_space_geo * normal_local);
+    let basis = shortest_arc_quat(vec3<f32>(0.0, 0.0, 1.0), normalize(normal));
+
+    let hit_position = camera.position + intersection.t * ray_dir;
+    if (enable_debug && is_primary) {
+        debug_buf.entry.custom_index = intersection.instance_custom_index;
+        debug_buf.entry.depth = intersection.t;
+        debug_buf.entry.tex_coords = tex_coords;
+        debug_buf.entry.base_color_texture = entry.base_color_texture;
+        debug_buf.entry.normal_texture = entry.normal_texture;
+        debug_buf.entry.position = hit_position;
+        debug_buf.entry.flat_normal = flat_normal;
+    }
+    if (enable_debug && (debug.draw_flags & DebugDrawFlags_SPACE) != 0u) {
+        let normal_len = 0.15 * intersection.t;
+        let side = 0.05 * intersection.t;
+        debug_line(hit_position, hit_position + normal_len * qrot(geo_to_world_rot, normal_geo), 0xFFFFFFu);
+        debug_line(hit_position - side * tangent_geo, hit_position + side * tangent_geo, 0x808080u);
+        debug_line(hit_position - side * bitangent_geo, hit_position + side * bitangent_geo, 0x808080u);
+    }
+    if (enable_debug && (debug.draw_flags & DebugDrawFlags_GEOMETRY) != 0u) {
+        let debug_len = intersection.t * 0.2;
+        debug_line(positions[0].xyz, positions[1].xyz, 0x00FFFFu);
+        debug_line(positions[1].xyz, positions[2].xyz, 0x00FFFFu);
+        debug_line(positions[2].xyz, positions[0].xyz, 0x00FFFFu);
+        let poly_center = (positions[0].xyz + positions[1].xyz + positions[2].xyz) / 3.0;
+        debug_line(poly_center, poly_center + 0.2 * debug_len * flat_normal, 0xFF00FFu);
+        // note: dynamic indexing into positions isn't allowed by WGSL yet
+        debug_raw_normal(positions[0].xyz, vertices[0].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
+        debug_raw_normal(positions[1].xyz, vertices[1].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
+        debug_raw_normal(positions[2].xyz, vertices[2].normal, geo_to_world_rot, 0.5*debug_len, 0xFFFF00u);
+        // draw tangent space
+        debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(1.0, 0.0, 0.0)), 0x0000FFu);
+        debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 1.0, 0.0)), 0x00FF00u);
+        debug_line(hit_position, hit_position + debug_len * qrot(basis, vec3<f32>(0.0, 0.0, 1.0)), 0xFF0000u);
+    }
+
+    rs.albedo = unpack4x8unorm(entry.base_color_factor).xyz;
+    if ((debug.texture_flags & DebugTextureFlags_ALBEDO) == 0u) {
+    let base_color_sample = textureSampleLevel(textures[entry.base_color_texture], sampler_linear, tex_coords, lod);
+        rs.albedo *= base_color_sample.xyz;
+    }
+
+    if (is_primary) {
+        if (debug.view_mode == DebugMode_Depth) {
+            textureStore(out_debug, pixel_coord, vec4<f32>(intersection.t / camera.depth));
+        }
+        if (debug.view_mode == DebugMode_Normal) {
+            textureStore(out_debug, pixel_coord, vec4<f32>(normal, 0.0));
+        }
+        if (debug.view_mode == DebugMode_HitConsistency) {
+            let reprojected = get_projected_pixel(camera, hit_position);
+            let barycentrics_pos_diff = (intersection.object_to_world * position_object).xyz - hit_position;
+            let camera_projection_diff = vec2<f32>(pixel_coord - reprojected);
+            let consistency = vec4<f32>(length(barycentrics_pos_diff), length(camera_projection_diff), 0.0, 0.0);
+            textureStore(out_debug, pixel_coord, consistency);
+        }
+    }
+
+    let prev_position = (entry.prev_object_to_world * position_object).xyz;
+    let prev_screen = get_projected_pixel_float(prev_camera, prev_position);
+    //TODO: consider just storing integers here?
+    //TODO: technically this "0.5" is just a waste compute on both packing and unpacking
+    rs.motion = prev_screen - vec2<f32>(pixel_coord) - 0.5;
+    rs.position = hit_position;
+
+    // Write down the Surface
+    rs.inner.basis = basis;
+    rs.inner.flat_normal = flat_normal;
+    rs.inner.depth = intersection.t;
+    return rs;
+}
diff --git a/blade-render/code/motion.inc.wgsl b/blade-render/code/motion.inc.wgsl
new file mode 100644
index 00000000..a9a9f48a
--- /dev/null
+++ b/blade-render/code/motion.inc.wgsl
@@ -0,0 +1,2 @@
+const MOTION_SCALE: f32 = 0.02;
+const USE_MOTION_VECTORS: bool = true;
diff --git a/blade-render/code/random.inc.wgsl b/blade-render/code/random.inc.wgsl
index 3f68478c..da144dbf 100644
--- a/blade-render/code/random.inc.wgsl
+++ b/blade-render/code/random.inc.wgsl
@@ -56,6 +56,10 @@ fn murmur3(rng: ptr<function, RandomState>) -> u32 {
     return hash;
 }
 
+fn random_u32(rng: ptr<function, RandomState>) -> u32 {
+    return murmur3(rng);
+}
+
 fn random_gen(rng: ptr<function, RandomState>) -> f32 {
     let v = murmur3(rng);
     let one = bitcast<u32>(1.0);
diff --git a/blade-render/code/ray-trace.wgsl b/blade-render/code/ray-trace.wgsl
index b27b1065..db350f3f 100644
--- a/blade-render/code/ray-trace.wgsl
+++ b/blade-render/code/ray-trace.wgsl
@@ -1,3 +1,4 @@
+#include "color.inc.wgsl"
 #include "quaternion.inc.wgsl"
 #include "random.inc.wgsl"
 #include "env-importance.inc.wgsl"
@@ -5,24 +6,22 @@
 #include "debug-param.inc.wgsl"
 #include "camera.inc.wgsl"
 #include "surface.inc.wgsl"
-#include "gbuf.inc.wgsl"
-
-//TODO: use proper WGSL
-const RAY_FLAG_CULL_NO_OPAQUE: u32 = 0x80u;
+#include "geometry.inc.wgsl"
+#include "motion.inc.wgsl"
 
 const PI: f32 = 3.1415926;
-const MAX_RESERVOIRS: u32 = 2u;
+const MAX_RESAMPLE: u32 = 4u;
 // See "9.1 pairwise mis for robust reservoir reuse"
 // "Correlations and Reuse for Fast and Accurate Physically Based Light Transport"
 const PAIRWISE_MIS: bool = true;
-// Base MIS for canonical samples. The constant isolates a critical difference between
-// Bitterli's pseudocode (where it's 1) and NVidia's RTXDI implementation (where it's 0).
-// With Bitterli's 1 we have MIS not respecting the prior history enough.
-const BASE_CANONICAL_MIS: f32 = 0.05;
 // See "DECOUPLING SHADING AND REUSE" in
 // "Rearchitecting Spatiotemporal Resampling for Production"
 const DECOUPLED_SHADING: bool = false;
 
+//TODO: crashes on AMD 6850U if `GROUP_SIZE_TOTAL` > 32
+const GROUP_SIZE: vec2<u32> = vec2<u32>(8, 4);
+const GROUP_SIZE_TOTAL: u32 = GROUP_SIZE.x * GROUP_SIZE.y;
+
 struct MainParams {
     frame_index: u32,
     num_environment_samples: u32,
@@ -31,10 +30,11 @@ struct MainParams {
     temporal_history: u32,
     spatial_taps: u32,
     spatial_tap_history: u32,
-    spatial_radius: i32,
+    spatial_min_distance: i32,
     t_start: f32,
     use_motion_vectors: u32,
-};
+    grid_scale: vec2<u32>,
+}
 
 var<uniform> camera: CameraParams;
 var<uniform> prev_camera: CameraParams;
@@ -54,7 +54,14 @@ struct StoredReservoir {
     confidence: f32,
 }
 var<storage, read_write> reservoirs: array<StoredReservoir>;
-var<storage, read> prev_reservoirs: array<StoredReservoir>;
+
+struct PixelCache {
+    surface: Surface,
+    reservoir: StoredReservoir,
+    //Note: we could store direction XY in local camera space instead
+    world_pos: vec3<f32>,
+}
+var<workgroup> pixel_cache: array<PixelCache, GROUP_SIZE_TOTAL>;
 
 struct LightSample {
     radiance: vec3<f32>,
@@ -83,12 +90,6 @@ fn get_reservoir_index(pixel: vec2<i32>, camera: CameraParams) -> i32 {
     }
 }
 
-fn get_pixel_from_reservoir_index(index: i32, camera: CameraParams) -> vec2<i32> {
-    let y = index / i32(camera.target_size.x);
-    let x = index - y * i32(camera.target_size.x);
-    return vec2<i32>(x, y);
-}
-
 fn bump_reservoir(r: ptr<function, LiveReservoir>, history: f32) {
     (*r).history += history;
 }
@@ -102,6 +103,7 @@ fn make_reservoir(ls: LightSample, light_index: u32, brdf: vec3<f32>) -> LiveRes
     r.history = 1.0;
     return r;
 }
+
 fn merge_reservoir(r: ptr<function, LiveReservoir>, other: LiveReservoir, random: f32) -> bool {
     (*r).weight_sum += other.weight_sum;
     (*r).history += other.history;
@@ -115,12 +117,12 @@ fn merge_reservoir(r: ptr<function, LiveReservoir>, other: LiveReservoir, random
         return false;
     }
 }
-fn unpack_reservoir(f: StoredReservoir, max_history: u32) -> LiveReservoir {
+fn unpack_reservoir(f: StoredReservoir, max_history: u32, radiance: vec3<f32>) -> LiveReservoir {
     var r: LiveReservoir;
     r.selected_light_index = f.light_index;
     r.selected_uv = f.light_uv;
     r.selected_target_score = f.target_score;
-    r.radiance = vec3<f32>(0.0); // to be continued...
+    r.radiance = radiance;
     let history = min(f.confidence, f32(max_history));
     r.weight_sum = f.contribution_weight * f.target_score * history;
     r.history = history;
@@ -140,13 +142,9 @@ fn pack_reservoir(r: LiveReservoir) -> StoredReservoir {
     return pack_reservoir_detail(r, r.history);
 }
 
-var t_depth: texture_2d<f32>;
 var t_prev_depth: texture_2d<f32>;
-var t_basis: texture_2d<f32>;
 var t_prev_basis: texture_2d<f32>;
-var t_flat_normal: texture_2d<f32>;
 var t_prev_flat_normal: texture_2d<f32>;
-var t_motion: texture_2d<f32>;
 var out_diffuse: texture_storage_2d<rgba16float, write>;
 var out_debug: texture_storage_2d<rgba8unorm, write>;
 
@@ -201,14 +199,6 @@ fn sample_light_from_environment(rng: ptr<function, RandomState>) -> LightSample
     return ls;
 }
 
-fn read_surface(pixel: vec2<i32>) -> Surface {
-    var surface: Surface;
-    surface.basis = normalize(textureLoad(t_basis, pixel, 0));
-    surface.flat_normal = normalize(textureLoad(t_flat_normal, pixel, 0).xyz);
-    surface.depth = textureLoad(t_depth, pixel, 0).x;
-    return surface;
-}
-
 fn read_prev_surface(pixel: vec2<i32>) -> Surface {
     var surface: Surface;
     surface.basis = normalize(textureLoad(t_prev_basis, pixel, 0));
@@ -217,6 +207,14 @@ fn read_prev_surface(pixel: vec2<i32>) -> Surface {
     return surface;
 }
 
+fn thread_index_to_coord(thread_index: u32, group_id: vec3<u32>) -> vec2<i32> {
+    let cluster_id = group_id.xy / parameters.grid_scale;
+    let cluster_offset = group_id.xy - cluster_id * parameters.grid_scale;
+    let local_id = vec2<u32>(thread_index % GROUP_SIZE.x, thread_index / GROUP_SIZE.x);
+    let global_id = (cluster_id * GROUP_SIZE + local_id) * parameters.grid_scale + cluster_offset;
+    return vec2<i32>(global_id);
+}
+
 fn evaluate_brdf(surface: Surface, dir: vec3<f32>) -> f32 {
     let lambert_brdf = 1.0 / PI;
     let lambert_term = qrot(qinv(surface.basis), dir).z;
@@ -255,9 +253,8 @@ fn evaluate_reflected_light(surface: Surface, light_index: u32, light_uv: vec2<f
     return radiance * brdf;
 }
 
-fn get_prev_pixel(pixel: vec2<i32>, pos_world: vec3<f32>) -> vec2<f32> {
+fn get_prev_pixel(pixel: vec2<i32>, pos_world: vec3<f32>, motion: vec2<f32>) -> vec2<f32> {
     if (USE_MOTION_VECTORS && parameters.use_motion_vectors != 0u) {
-        let motion = textureLoad(t_motion, pixel, 0).xy / MOTION_SCALE;
         return vec2<f32>(pixel) + 0.5 + motion;
     } else {
         return get_projected_pixel_float(prev_camera, pos_world);
@@ -290,11 +287,11 @@ fn estimate_target_score_with_occlusion(
 
     if (check_ray_occluded(acs, position, direction, debug_len)) {
         return TargetScore();
-    } else {
-        //Note: same as `evaluate_reflected_light`
-        let radiance = textureSampleLevel(env_map, sampler_nearest, light_uv, 0.0).xyz;
-        return make_target_score(brdf * radiance);
     }
+
+    //Note: same as `evaluate_reflected_light`
+    let radiance = textureSampleLevel(env_map, sampler_nearest, light_uv, 0.0).xyz;
+    return make_target_score(brdf * radiance);
 }
 
 fn evaluate_sample(ls: LightSample, surface: Surface, start_pos: vec3<f32>, debug_len: f32) -> f32 {
@@ -320,43 +317,11 @@ fn evaluate_sample(ls: LightSample, surface: Surface, start_pos: vec3<f32>, debu
     return brdf;
 }
 
-struct HeuristicFactors {
-    weight: f32,
-    //history: f32,
-}
-
-fn balance_heuristic(w0: f32, w1: f32, h0: f32, h1: f32) -> HeuristicFactors {
-    var hf: HeuristicFactors;
-    let balance_denom = h0 * w0 + h1 * w1;
-    hf.weight = select(h0 * w0 / balance_denom, 0.0, balance_denom <= 0.0);
-    //hf.history = select(pow(clamp(w1 / w0, 0.0, 1.0), 8.0), 1.0, w0 <= 0.0);
-    return hf;
-}
-
-struct RestirOutput {
-    radiance: vec3<f32>,
-}
-
-fn compute_restir(surface: Surface, pixel: vec2<i32>, rng: ptr<function, RandomState>, enable_debug: bool) -> RestirOutput {
-    if (debug.view_mode == DebugMode_Depth) {
-        textureStore(out_debug, pixel, vec4<f32>(surface.depth / camera.depth));
-    }
-    let ray_dir = get_ray_direction(camera, pixel);
-    let pixel_index = get_reservoir_index(pixel, camera);
-    if (surface.depth == 0.0) {
-        reservoirs[pixel_index] = StoredReservoir();
-        let env = evaluate_environment(ray_dir);
-        return RestirOutput(env);
-    }
-
-    let debug_len = select(0.0, surface.depth * 0.2, enable_debug);
-    let position = camera.position + surface.depth * ray_dir;
-    let normal = qrot(surface.basis, vec3<f32>(0.0, 0.0, 1.0));
-    if (debug.view_mode == DebugMode_Normal) {
-        textureStore(out_debug, pixel, vec4<f32>(normal, 0.0));
-    }
-
-    var canonical = LiveReservoir();
+fn produce_canonical(
+    surface: Surface, position: vec3<f32>,
+    rng: ptr<function, RandomState>, debug_len: f32,
+) -> LiveReservoir {
+    var reservoir = LiveReservoir();
     for (var i = 0u; i < parameters.num_environment_samples; i += 1u) {
         var ls: LightSample;
         if (parameters.environment_importance_sampling != 0u) {
@@ -368,156 +333,307 @@ fn compute_restir(surface: Surface, pixel: vec2<i32>, rng: ptr<function, RandomS
         let brdf = evaluate_sample(ls, surface, position, debug_len);
         if (brdf > 0.0) {
             let other = make_reservoir(ls, 0u, vec3<f32>(brdf));
-            merge_reservoir(&canonical, other, random_gen(rng));
+            merge_reservoir(&reservoir, other, random_gen(rng));
         } else {
-            bump_reservoir(&canonical, 1.0);
+            bump_reservoir(&reservoir, 1.0);
         }
     }
+    return reservoir;
+}
+
+struct ResampleBase {
+    surface: Surface,
+    canonical: LiveReservoir,
+    world_pos: vec3<f32>,
+    accepted_count: f32,
+}
+struct ResampleResult {
+    selected: bool,
+    mis_canonical: f32,
+    mis_sample: f32,
+}
+
+// Resample following Algorithm 8 in section 9.1 of Bitterli thesis
+fn resample(
+    dst: ptr<function, LiveReservoir>, color_and_weight: ptr<function, vec4<f32>>,
+    base: ResampleBase, other: PixelCache, other_acs: acceleration_structure, max_history: u32,
+    rng: ptr<function, RandomState>, debug_len: f32,
+) -> ResampleResult {
+    var src: LiveReservoir;
+    let neighbor = other.reservoir;
+    var rr = ResampleResult();
+    if (PAIRWISE_MIS) {
+        let canonical = base.canonical;
+        let neighbor_history = min(neighbor.confidence, f32(max_history));
+        {   // scoping this to hint the register allocation
+            let t_canonical_at_neighbor = estimate_target_score_with_occlusion(
+                other.surface, other.world_pos, canonical.selected_light_index, canonical.selected_uv, other_acs, debug_len);
+            let nom = canonical.selected_target_score * canonical.history / base.accepted_count;
+            let denom = t_canonical_at_neighbor.score * neighbor_history + nom;
+            rr.mis_canonical = select(0.0, nom / denom, denom > 0.0);
+        }
 
-    //TODO: find best match in a 2x2 grid
-    let prev_pixel = vec2<i32>(get_prev_pixel(pixel, position));
+        // Notes about t_neighbor_at_neighbor:
+        // 1. we assume lights aren't moving. Technically we should check if the
+        //   target light has moved, and re-evaluate the occlusion.
+        // 2. we can use the cached target score, and there is no use of the target color
+        //let t_neighbor_at_neighbor = estimate_target_pdf(neighbor_surface, neighbor_position, neighbor.selected_dir);
+        let t_neighbor_at_canonical = estimate_target_score_with_occlusion(
+            base.surface, base.world_pos, neighbor.light_index, neighbor.light_uv, acc_struct, debug_len);
+        let nom = neighbor.target_score * neighbor_history;
+        let denom = nom + t_neighbor_at_canonical.score * canonical.history / base.accepted_count;
+        let mis_neighbor = select(0.0, nom / denom, denom > 0.0);
+        rr.mis_sample  = mis_neighbor;
+
+        src.history = neighbor_history;
+        src.selected_light_index = neighbor.light_index;
+        src.selected_uv = neighbor.light_uv;
+        src.selected_target_score = t_neighbor_at_canonical.score;
+        src.weight_sum = t_neighbor_at_canonical.score * neighbor.contribution_weight * mis_neighbor;
+        src.radiance = t_neighbor_at_canonical.color;
+    } else {
+        rr.mis_canonical = 0.0;
+        rr.mis_sample = 1.0;
+        let radiance = evaluate_reflected_light(base.surface, neighbor.light_index, neighbor.light_uv);
+        src = unpack_reservoir(neighbor, max_history, radiance);
+    }
 
-    // First, gather the list of reservoirs to merge with
-    var accepted_reservoir_indices = array<i32, MAX_RESERVOIRS>();
-    var accepted_count = 0u;
-    var temporal_index = ~0u;
-    for (var tap = 0u; tap <= parameters.spatial_taps; tap += 1u) {
-        var other_pixel = prev_pixel;
-        if (tap != 0u) {
-            let r0 = max(prev_pixel - vec2<i32>(parameters.spatial_radius), vec2<i32>(0));
-            let r1 = min(prev_pixel + vec2<i32>(parameters.spatial_radius + 1), vec2<i32>(prev_camera.target_size));
-            other_pixel = vec2<i32>(mix(vec2<f32>(r0), vec2<f32>(r1), vec2<f32>(random_gen(rng), random_gen(rng))));
-        } else if (parameters.temporal_tap == 0u)
-        {
-            continue;
-        }
+    if (DECOUPLED_SHADING) {
+        //TODO: use `mis_neighbor`O
+        *color_and_weight += src.weight_sum * vec4<f32>(neighbor.contribution_weight * src.radiance, 1.0);
+    }
+    if (src.weight_sum <= 0.0) {
+        bump_reservoir(dst, src.history);
+    } else {
+        merge_reservoir(dst, src, random_gen(rng));
+        rr.selected = true;
+    }
+    return rr;
+}
 
-        let other_index = get_reservoir_index(other_pixel, prev_camera);
-        if (other_index < 0) {
-            continue;
-        }
-        if (prev_reservoirs[other_index].confidence == 0.0) {
-            continue;
-        }
+struct ResampleOutput {
+    reservoir: StoredReservoir,
+    color: vec3<f32>,
+}
 
-        let other_surface = read_prev_surface(other_pixel);
-        let compatibility = compare_surfaces(surface, other_surface);
-        if (compatibility < 0.1) {
-            // if the surfaces are too different, there is no trust in this sample
-            continue;
-        }
+fn revive_canonical(ro: ResampleOutput) -> LiveReservoir {
+    let radiance = select(vec3<f32>(0.0), ro.color / ro.reservoir.contribution_weight, ro.reservoir.contribution_weight > 0.0);
+    return unpack_reservoir(ro.reservoir, ~0u, radiance);
+}
 
-        if (tap == 0u) {
-            temporal_index = accepted_count;
-        }
-        accepted_reservoir_indices[accepted_count] = other_index;
-        if (accepted_count < MAX_RESERVOIRS) {
-            accepted_count += 1u;
+fn finalize_canonical(reservoir: LiveReservoir) -> ResampleOutput {
+    var ro = ResampleOutput();
+    ro.reservoir = pack_reservoir(reservoir);
+    ro.color = ro.reservoir.contribution_weight * reservoir.radiance;
+    return ro;
+}
+
+fn finalize_resampling(
+    reservoir: ptr<function, LiveReservoir>, color_and_weight: ptr<function, vec4<f32>>,
+    base: ResampleBase, mis_canonical: f32, rng: ptr<function, RandomState>,
+) -> ResampleOutput {
+    var ro = ResampleOutput();
+    var canonical = base.canonical;
+    canonical.weight_sum *= mis_canonical / canonical.history;
+    merge_reservoir(reservoir, canonical, random_gen(rng));
+
+    if (base.accepted_count > 0.0) {
+        let effective_history = select((*reservoir).history, 1.0 + base.accepted_count, PAIRWISE_MIS);
+        ro.reservoir = pack_reservoir_detail(*reservoir, effective_history);
+    } else {
+        ro.reservoir = pack_reservoir(canonical);
+    }
+
+    if (DECOUPLED_SHADING) {
+        if (canonical.selected_target_score > 0.0) {
+            let contribution_weight = canonical.weight_sum / canonical.selected_target_score;
+            *color_and_weight += canonical.weight_sum * vec4<f32>(contribution_weight * canonical.radiance, 1.0);
         }
+        ro.color = (*color_and_weight).xyz / max((*color_and_weight).w, 0.001);
+    } else {
+        ro.color = ro.reservoir.contribution_weight * (*reservoir).radiance;
+    }
+    return ro;
+}
+
+fn resample_temporal(
+    surface: Surface, motion: vec2<f32>, cur_pixel: vec2<i32>, position: vec3<f32>,
+    rng: ptr<function, RandomState>, debug_len: f32,
+) -> ResampleOutput {
+    if (debug.view_mode == DebugMode_TemporalMatch || debug.view_mode == DebugMode_TemporalMisCanonical || debug.view_mode == DebugMode_TemporalMisError) {
+        textureStore(out_debug, cur_pixel, vec4<f32>(0.0));
+    }
+    if (surface.depth == 0.0) {
+        return ResampleOutput();
+    }
+    let canonical = produce_canonical(surface, position, rng, debug_len);
+
+    //TODO: find best match in a 2x2 grid
+    let prev_pixel = vec2<i32>(get_prev_pixel(cur_pixel, position, motion));
+
+    let prev_reservoir_index = get_reservoir_index(prev_pixel, prev_camera);
+    if (parameters.temporal_tap == 0u || prev_reservoir_index < 0) {
+        return finalize_canonical(canonical);
+    }
+
+    let prev_reservoir = reservoirs[prev_reservoir_index];
+    let prev_surface = read_prev_surface(prev_pixel);
+    // if the surfaces are too different, there is no trust in this sample
+    if (prev_reservoir.confidence == 0.0 || compare_surfaces(surface, prev_surface) < 0.1) {
+        return finalize_canonical(canonical);
     }
 
-    // Next, evaluate the MIS of each of the samples versus the canonical one.
     var reservoir = LiveReservoir();
-    var shaded_color = vec3<f32>(0.0);
-    var mis_canonical = BASE_CANONICAL_MIS;
     var color_and_weight = vec4<f32>(0.0);
-    for (var rid = 0u; rid < accepted_count; rid += 1u) {
-        let neighbor_index = accepted_reservoir_indices[rid];
-        let neighbor = prev_reservoirs[neighbor_index];
-
-        let max_history = select(parameters.spatial_tap_history, parameters.temporal_history, rid == temporal_index);
-        var other: LiveReservoir;
-        if (PAIRWISE_MIS) {
-            let neighbor_pixel = get_pixel_from_reservoir_index(neighbor_index, prev_camera);
-            let neighbor_history = min(neighbor.confidence, f32(max_history));
-            {   // scoping this to hint the register allocation
-                let neighbor_surface = read_prev_surface(neighbor_pixel);
-                let neighbor_dir = get_ray_direction(prev_camera, neighbor_pixel);
-                let neighbor_position = prev_camera.position + neighbor_surface.depth * neighbor_dir;
-
-                let t_canonical_at_neighbor = estimate_target_score_with_occlusion(
-                    neighbor_surface, neighbor_position, canonical.selected_light_index, canonical.selected_uv, prev_acc_struct, debug_len);
-                let mis_sub_canonical = balance_heuristic(
-                    t_canonical_at_neighbor.score, canonical.selected_target_score,
-                    neighbor_history * f32(accepted_count), canonical.history);
-                mis_canonical += 1.0 - mis_sub_canonical.weight;
-            }
-
-            // Notes about t_neighbor_at_neighbor:
-            // 1. we assume lights aren't moving. Technically we should check if the
-            //   target light has moved, and re-evaluate the occlusion.
-            // 2. we can use the cached target score, and there is no use of the target color
-            //let t_neighbor_at_neighbor = estimate_target_pdf(neighbor_surface, neighbor_position, neighbor.selected_dir);
-            let t_neighbor_at_canonical = estimate_target_score_with_occlusion(
-                surface, position, neighbor.light_index, neighbor.light_uv, acc_struct, debug_len);
-            let mis_neighbor = balance_heuristic(
-                neighbor.target_score, t_neighbor_at_canonical.score,
-                neighbor_history * f32(accepted_count), canonical.history);
-
-            other.history = neighbor_history;
-            other.selected_light_index = neighbor.light_index;
-            other.selected_uv = neighbor.light_uv;
-            other.selected_target_score = t_neighbor_at_canonical.score;
-            other.weight_sum = t_neighbor_at_canonical.score * neighbor.contribution_weight * mis_neighbor.weight;
-            //Note: should be needed according to the paper
-            // other.history *= min(mis_neighbor.history, mis_sub_canonical.history);
-            other.radiance = t_neighbor_at_canonical.color;
-        } else {
-            other = unpack_reservoir(neighbor, max_history);
-            other.radiance = evaluate_reflected_light(surface, other.selected_light_index, other.selected_uv);
+    let base = ResampleBase(surface, canonical, position, 1.0);
+
+    let prev_dir = get_ray_direction(prev_camera, prev_pixel);
+    let prev_world_pos = prev_camera.position + prev_surface.depth * prev_dir;
+    let other = PixelCache(prev_surface, prev_reservoir, prev_world_pos);
+    let rr = resample(&reservoir, &color_and_weight, base, other, prev_acc_struct, parameters.temporal_history, rng, debug_len);
+    let mis_canonical = 1.0 + rr.mis_canonical;
+
+    if (debug.view_mode == DebugMode_TemporalMatch) {
+        textureStore(out_debug, cur_pixel, vec4<f32>(1.0));
+    }
+    if (debug.view_mode == DebugMode_TemporalMisCanonical) {
+        textureStore(out_debug, cur_pixel, vec4<f32>(mis_canonical / (1.0 + base.accepted_count)));
+    }
+    if (debug.view_mode == DebugMode_TemporalMisError) {
+        let total = mis_canonical + rr.mis_sample;
+        textureStore(out_debug, cur_pixel, vec4<f32>(abs(total - 1.0 - base.accepted_count)));
+    }
+    return finalize_resampling(&reservoir, &color_and_weight, base, mis_canonical, rng);
+}
+
+fn resample_spatial(
+    surface: Surface, cur_pixel: vec2<i32>, position: vec3<f32>,
+    group_id: vec3<u32>, canonical: LiveReservoir,
+    rng: ptr<function, RandomState>, debug_len: f32,
+) -> ResampleOutput {
+    if (surface.depth == 0.0) {
+        if (debug.view_mode == DebugMode_SpatialMatch || debug.view_mode == DebugMode_SpatialMisCanonical || debug.view_mode == DebugMode_SpatialMisError) {
+            textureStore(out_debug, cur_pixel, vec4<f32>(0.0));
         }
+        let dir = normalize(position - camera.position);
+        var ro = ResampleOutput();
+        ro.color = evaluate_environment(dir);
+        return ro;
+    }
 
-        if (DECOUPLED_SHADING) {
-            color_and_weight += other.weight_sum * vec4<f32>(neighbor.contribution_weight * other.radiance, 1.0);
+    // gather the list of neighbors (within the workgroup) to resample.
+    var accepted_count = 0u;
+    var accepted_local_indices = array<u32, MAX_RESAMPLE>();
+    let max_accepted = min(MAX_RESAMPLE, parameters.spatial_taps);
+    let num_candidates = parameters.spatial_taps * 3u;
+    for (var i = 0u; i < num_candidates && accepted_count < max_accepted; i += 1u) {
+        let other_cache_index = random_u32(rng) % GROUP_SIZE_TOTAL;
+        let diff = thread_index_to_coord(other_cache_index, group_id) - cur_pixel;
+        if (dot(diff, diff) < parameters.spatial_min_distance * parameters.spatial_min_distance) {
+            continue;
         }
-        if (other.weight_sum <= 0.0) {
-            bump_reservoir(&reservoir, other.history);
-        } else {
-            merge_reservoir(&reservoir, other, random_gen(rng));
+        let other = pixel_cache[other_cache_index];
+        // if the surfaces are too different, there is no trust in this sample
+        if (other.reservoir.confidence > 0.0 && compare_surfaces(surface, other.surface) > 0.1) {
+            accepted_local_indices[accepted_count] = other_cache_index;
+            accepted_count += 1u;
         }
     }
 
-    // Finally, merge in the canonical sample
-    if (PAIRWISE_MIS) {
-        canonical.weight_sum *= mis_canonical / canonical.history;
-    }
-    if (DECOUPLED_SHADING) {
-        //FIXME: issue with near zero denominator. Do we need do use BASE_CANONICAL_MIS?
-        let cw = canonical.weight_sum / max(canonical.selected_target_score * mis_canonical, 0.1);
-        color_and_weight += canonical.weight_sum * vec4<f32>(cw * canonical.radiance, 1.0);
+    var reservoir = LiveReservoir();
+    var color_and_weight = vec4<f32>(0.0);
+    let base = ResampleBase(surface, canonical, position, f32(accepted_count));
+    var mis_canonical = 1.0;
+    var mis_sample_sum = 0.0;
+
+    // evaluate the MIS of each of the samples versus the canonical one.
+    for (var lid = 0u; lid < accepted_count; lid += 1u) {
+        let other = pixel_cache[accepted_local_indices[lid]];
+        let rr = resample(&reservoir, &color_and_weight, base, other, acc_struct, parameters.spatial_tap_history, rng, debug_len);
+        mis_canonical += rr.mis_canonical;
+        mis_sample_sum += rr.mis_sample;
     }
-    merge_reservoir(&reservoir, canonical, random_gen(rng));
 
-    let effective_history = select(reservoir.history, BASE_CANONICAL_MIS + f32(accepted_count), PAIRWISE_MIS);
-    let stored = pack_reservoir_detail(reservoir, effective_history);
-    reservoirs[pixel_index] = stored;
-    var ro = RestirOutput();
-    if (DECOUPLED_SHADING) {
-        ro.radiance = color_and_weight.xyz / max(color_and_weight.w, 0.001);
-    } else {
-        ro.radiance = stored.contribution_weight * reservoir.radiance;
+    if (debug.view_mode == DebugMode_SpatialMatch) {
+        let value = f32(accepted_count) / max(1.0, f32(parameters.spatial_taps));
+        textureStore(out_debug, cur_pixel, vec4<f32>(value));
     }
-    return ro;
+    if (debug.view_mode == DebugMode_SpatialMisCanonical) {
+        textureStore(out_debug, cur_pixel, vec4<f32>(mis_canonical / (1.0 + base.accepted_count)));
+    }
+    if (debug.view_mode == DebugMode_SpatialMisError) {
+        let total = mis_canonical + mis_sample_sum;
+        textureStore(out_debug, cur_pixel, vec4<f32>(abs(total - 1.0 - base.accepted_count)));
+    }
+    return finalize_resampling(&reservoir, &color_and_weight, base, mis_canonical, rng);
 }
 
-@compute @workgroup_size(8, 4)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    if (any(global_id.xy >= camera.target_size)) {
+fn compute_restir(
+    rs: RichSurface,
+    pixel: vec2<i32>, local_index: u32, group_id: vec3<u32>,
+    rng: ptr<function, RandomState>, enable_debug: bool,
+) -> vec3<f32> {
+    let debug_len = select(0.0, rs.inner.depth * 0.2, enable_debug);
+
+    let temporal = resample_temporal(rs.inner, rs.motion, pixel, rs.position, rng, debug_len);
+    pixel_cache[local_index] = PixelCache(rs.inner, temporal.reservoir, rs.position);
+
+    // sync with the workgroup to ensure all reservoirs are available.
+    workgroupBarrier();
+
+    let temporal_live = revive_canonical(temporal);
+    let spatial = resample_spatial(rs.inner, pixel, rs.position, group_id, temporal_live, rng, debug_len);
+
+    let pixel_index = get_reservoir_index(pixel, camera);
+    reservoirs[pixel_index] = spatial.reservoir;
+    return spatial.color;
+}
+
+var out_depth: texture_storage_2d<r32float, write>;
+var out_basis: texture_storage_2d<rgba8snorm, write>;
+var out_flat_normal: texture_storage_2d<rgba8snorm, write>;
+var out_albedo: texture_storage_2d<rgba8unorm, write>;
+var out_motion: texture_storage_2d<rg8snorm, write>;
+
+@compute @workgroup_size(GROUP_SIZE.x, GROUP_SIZE.y)
+fn main(
+    @builtin(workgroup_id) group_id: vec3<u32>,
+    @builtin(local_invocation_index) local_index: u32,
+) {
+    pixel_cache[local_index] = PixelCache();
+    let pixel_coord = thread_index_to_coord(local_index, group_id);
+    if (any(vec2<u32>(pixel_coord) >= camera.target_size)) {
+        return;
+    }
+
+    if (debug.view_mode == DebugMode_Grouping) {
+        var rng = random_init(group_id.y * 1000u + group_id.x, 0u);
+        let h = random_gen(&rng) * 360.0;
+        let color = hsv_to_rgb(h, 0.5, 1.0);
+        textureStore(out_debug, pixel_coord, vec4<f32>(color, 1.0));
         return;
     }
 
-    let global_index = global_id.y * camera.target_size.x + global_id.x;
+    let enable_debug = all(pixel_coord == vec2<i32>(debug.mouse_pos));
+    let rs = fetch_geometry(pixel_coord, true, enable_debug);
+
+    // TODO: option to avoid writing data for the sky
+    textureStore(out_depth, pixel_coord, vec4<f32>(rs.inner.depth, 0.0, 0.0, 0.0));
+    textureStore(out_basis, pixel_coord, rs.inner.basis);
+    textureStore(out_flat_normal, pixel_coord, vec4<f32>(rs.inner.flat_normal, 0.0));
+    textureStore(out_albedo, pixel_coord, vec4<f32>(rs.albedo, 0.0));
+    textureStore(out_motion, pixel_coord, vec4<f32>(rs.motion * MOTION_SCALE, 0.0, 0.0));
+
+    let global_index = u32(pixel_coord.y) * camera.target_size.x + u32(pixel_coord.x);
     var rng = random_init(global_index, parameters.frame_index);
 
-    let surface = read_surface(vec2<i32>(global_id.xy));
-    let enable_debug = all(global_id.xy == debug.mouse_pos);
     let enable_restir_debug = (debug.draw_flags & DebugDrawFlags_RESTIR) != 0u && enable_debug;
-    let ro = compute_restir(surface, vec2<i32>(global_id.xy), &rng, enable_restir_debug);
-    let color = ro.radiance;
+    let color = compute_restir(rs, pixel_coord, local_index, group_id, &rng, enable_restir_debug);
+    textureStore(out_diffuse, pixel_coord, vec4<f32>(color, 1.0));
+
     if (enable_debug) {
         debug_buf.variance.color_sum += color;
         debug_buf.variance.color2_sum += color * color;
         debug_buf.variance.count += 1u;
     }
-    textureStore(out_diffuse, global_id.xy, vec4<f32>(color, 1.0));
 }
diff --git a/blade-render/src/render/mod.rs b/blade-render/src/render/mod.rs
index c856538f..e0e724c5 100644
--- a/blade-render/src/render/mod.rs
+++ b/blade-render/src/render/mod.rs
@@ -51,7 +51,14 @@ pub enum DebugMode {
     Normal = 2,
     Motion = 3,
     HitConsistency = 4,
-    Variance = 5,
+    Grouping = 5,
+    TemporalMatch = 10,
+    TemporalMisCanonical = 11,
+    TemporalMisError = 12,
+    SpatialMatch = 13,
+    SpatialMisCanonical = 14,
+    SpatialMisError = 15,
+    Variance = 100,
 }
 
 impl Default for DebugMode {
@@ -93,7 +100,11 @@ pub struct RayConfig {
     pub temporal_history: u32,
     pub spatial_taps: u32,
     pub spatial_tap_history: u32,
-    pub spatial_radius: u32,
+    /// Minimal distance to a spatially reused pixel (in the current frame).
+    pub spatial_min_distance: u32,
+    /// Scale and mix the groups into clusters, to allow spatial samples to mix
+    /// outside of the original workgroup pixel bounds.
+    pub group_mixer: u32,
     pub t_start: f32,
 }
 
@@ -199,7 +210,7 @@ impl<const N: usize> RenderTarget<N> {
 }
 
 struct RestirTargets {
-    reservoir_buf: [blade_graphics::Buffer; 2],
+    reservoir_buf: blade_graphics::Buffer,
     debug: RenderTarget<1>,
     depth: RenderTarget<2>,
     basis: RenderTarget<2>,
@@ -218,14 +229,11 @@ impl RestirTargets {
         gpu: &blade_graphics::Context,
     ) -> Self {
         let total_reservoirs = size.width as usize * size.height as usize;
-        let mut reservoir_buf = [blade_graphics::Buffer::default(); 2];
-        for (i, rb) in reservoir_buf.iter_mut().enumerate() {
-            *rb = gpu.create_buffer(blade_graphics::BufferDesc {
-                name: &format!("reservoirs{i}"),
-                size: reservoir_size as u64 * total_reservoirs as u64,
-                memory: blade_graphics::Memory::Device,
-            });
-        }
+        let reservoir_buf = gpu.create_buffer(blade_graphics::BufferDesc {
+            name: "reservoirs",
+            size: reservoir_size as u64 * total_reservoirs as u64,
+            memory: blade_graphics::Memory::Device,
+        });
 
         Self {
             reservoir_buf,
@@ -277,9 +285,7 @@ impl RestirTargets {
     }
 
     fn destroy(&self, gpu: &blade_graphics::Context) {
-        for rb in self.reservoir_buf.iter() {
-            gpu.destroy_buffer(*rb);
-        }
+        gpu.destroy_buffer(self.reservoir_buf);
         self.debug.destroy(gpu);
         self.depth.destroy(gpu);
         self.basis.destroy(gpu);
@@ -308,7 +314,6 @@ pub struct Renderer {
     shaders: Shaders,
     targets: RestirTargets,
     post_proc_input_index: usize,
-    fill_pipeline: blade_graphics::ComputePipeline,
     main_pipeline: blade_graphics::ComputePipeline,
     post_proc_pipeline: blade_graphics::RenderPipeline,
     blur: Blur,
@@ -363,53 +368,38 @@ struct MainParams {
     temporal_history: u32,
     spatial_taps: u32,
     spatial_tap_history: u32,
-    spatial_radius: u32,
+    spatial_min_distance: u32,
     t_start: f32,
     use_motion_vectors: u32,
+    grid_scale: [u32; 2],
 }
 
 #[derive(blade_macros::ShaderData)]
-struct FillData<'a> {
+struct MainData<'a> {
     camera: CameraParams,
     prev_camera: CameraParams,
     debug: DebugParams,
+    parameters: MainParams,
     acc_struct: blade_graphics::AccelerationStructure,
+    prev_acc_struct: blade_graphics::AccelerationStructure,
     hit_entries: blade_graphics::BufferPiece,
     index_buffers: &'a blade_graphics::BufferArray<MAX_RESOURCES>,
     vertex_buffers: &'a blade_graphics::BufferArray<MAX_RESOURCES>,
     textures: &'a blade_graphics::TextureArray<MAX_RESOURCES>,
     sampler_linear: blade_graphics::Sampler,
-    debug_buf: blade_graphics::BufferPiece,
-    out_depth: blade_graphics::TextureView,
-    out_basis: blade_graphics::TextureView,
-    out_flat_normal: blade_graphics::TextureView,
-    out_albedo: blade_graphics::TextureView,
-    out_motion: blade_graphics::TextureView,
-    out_debug: blade_graphics::TextureView,
-}
-
-#[derive(blade_macros::ShaderData)]
-struct MainData {
-    camera: CameraParams,
-    prev_camera: CameraParams,
-    debug: DebugParams,
-    parameters: MainParams,
-    acc_struct: blade_graphics::AccelerationStructure,
-    prev_acc_struct: blade_graphics::AccelerationStructure,
-    sampler_linear: blade_graphics::Sampler,
     sampler_nearest: blade_graphics::Sampler,
     env_map: blade_graphics::TextureView,
     env_weights: blade_graphics::TextureView,
-    t_depth: blade_graphics::TextureView,
     t_prev_depth: blade_graphics::TextureView,
-    t_basis: blade_graphics::TextureView,
     t_prev_basis: blade_graphics::TextureView,
-    t_flat_normal: blade_graphics::TextureView,
     t_prev_flat_normal: blade_graphics::TextureView,
-    t_motion: blade_graphics::TextureView,
     debug_buf: blade_graphics::BufferPiece,
     reservoirs: blade_graphics::BufferPiece,
-    prev_reservoirs: blade_graphics::BufferPiece,
+    out_depth: blade_graphics::TextureView,
+    out_basis: blade_graphics::TextureView,
+    out_flat_normal: blade_graphics::TextureView,
+    out_albedo: blade_graphics::TextureView,
+    out_motion: blade_graphics::TextureView,
     out_diffuse: blade_graphics::TextureView,
     out_debug: blade_graphics::TextureView,
 }
@@ -487,7 +477,6 @@ struct HitEntry {
 #[derive(Clone, PartialEq)]
 pub struct Shaders {
     env_prepare: blade_asset::Handle<crate::Shader>,
-    fill_gbuf: blade_asset::Handle<crate::Shader>,
     ray_trace: blade_asset::Handle<crate::Shader>,
     blur: blade_asset::Handle<crate::Shader>,
     post_proc: blade_asset::Handle<crate::Shader>,
@@ -500,7 +489,6 @@ impl Shaders {
         let mut ctx = asset_hub.open_context(path, "shader finish");
         let shaders = Self {
             env_prepare: ctx.load_shader("env-prepare.wgsl"),
-            fill_gbuf: ctx.load_shader("fill-gbuf.wgsl"),
             ray_trace: ctx.load_shader("ray-trace.wgsl"),
             blur: ctx.load_shader("blur.wgsl"),
             post_proc: ctx.load_shader("post-proc.wgsl"),
@@ -512,7 +500,6 @@ impl Shaders {
 }
 
 struct ShaderPipelines {
-    fill: blade_graphics::ComputePipeline,
     main: blade_graphics::ComputePipeline,
     temporal_accum: blade_graphics::ComputePipeline,
     atrous: blade_graphics::ComputePipeline,
@@ -522,19 +509,6 @@ struct ShaderPipelines {
 }
 
 impl ShaderPipelines {
-    fn create_gbuf_fill(
-        shader: &blade_graphics::Shader,
-        gpu: &blade_graphics::Context,
-    ) -> blade_graphics::ComputePipeline {
-        shader.check_struct_size::<crate::Vertex>();
-        shader.check_struct_size::<HitEntry>();
-        let layout = <FillData as blade_graphics::ShaderData>::layout();
-        gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
-            name: "fill-gbuf",
-            data_layouts: &[&layout],
-            compute: shader.at("main"),
-        })
-    }
     fn create_ray_trace(
         shader: &blade_graphics::Shader,
         gpu: &blade_graphics::Context,
@@ -545,11 +519,17 @@ impl ShaderPipelines {
         shader.check_struct_size::<DebugVariance>();
         shader.check_struct_size::<DebugEntry>();
         let layout = <MainData as blade_graphics::ShaderData>::layout();
-        gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
+        let pipeline = gpu.create_compute_pipeline(blade_graphics::ComputePipelineDesc {
             name: "ray-trace",
             data_layouts: &[&layout],
             compute: shader.at("main"),
-        })
+        });
+
+        let pl_struct_size = shader.get_struct_size("PixelCache");
+        let group_size = pipeline.get_workgroup_size();
+        let wg_required = pl_struct_size * group_size[0] * group_size[1];
+        log::info!("Using {} workgroup memory for RT", wg_required);
+        pipeline
     }
 
     fn create_temporal_accum(
@@ -606,7 +586,6 @@ impl ShaderPipelines {
         let sh_main = shader_man[shaders.ray_trace].raw.as_ref().unwrap();
         let sh_blur = shader_man[shaders.blur].raw.as_ref().unwrap();
         Ok(Self {
-            fill: Self::create_gbuf_fill(shader_man[shaders.fill_gbuf].raw.as_ref().unwrap(), gpu),
             main: Self::create_ray_trace(sh_main, gpu),
             temporal_accum: Self::create_temporal_accum(sh_blur, gpu),
             atrous: Self::create_atrous(sh_blur, gpu),
@@ -697,7 +676,6 @@ impl Renderer {
             shaders,
             targets,
             post_proc_input_index: 0,
-            fill_pipeline: sp.fill,
             main_pipeline: sp.main,
             post_proc_pipeline: sp.post_proc,
             blur: Blur {
@@ -744,7 +722,6 @@ impl Renderer {
         // pipelines
         gpu.destroy_compute_pipeline(&mut self.blur.temporal_accum_pipeline);
         gpu.destroy_compute_pipeline(&mut self.blur.atrous_pipeline);
-        gpu.destroy_compute_pipeline(&mut self.fill_pipeline);
         gpu.destroy_compute_pipeline(&mut self.main_pipeline);
         gpu.destroy_render_pipeline(&mut self.post_proc_pipeline);
     }
@@ -759,7 +736,6 @@ impl Renderer {
         let mut tasks = Vec::new();
         let old = self.shaders.clone();
 
-        tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.fill_gbuf));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.ray_trace));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.blur));
         tasks.extend(asset_hub.shaders.hot_reload(&mut self.shaders.post_proc));
@@ -776,11 +752,6 @@ impl Renderer {
             let _ = task.join();
         }
 
-        if self.shaders.fill_gbuf != old.fill_gbuf {
-            if let Ok(ref shader) = asset_hub.shaders[self.shaders.fill_gbuf].raw {
-                self.fill_pipeline = ShaderPipelines::create_gbuf_fill(shader, gpu);
-            }
-        }
         if self.shaders.ray_trace != old.ray_trace {
             if let Ok(ref shader) = asset_hub.shaders[self.shaders.ray_trace].raw {
                 assert_eq!(
@@ -1089,13 +1060,11 @@ impl Renderer {
                 self.debug.reset_lines(&mut transfer);
             }
             let total_reservoirs = self.surface_size.width as u64 * self.surface_size.height as u64;
-            for reservoir_buf in self.targets.reservoir_buf.iter() {
-                transfer.fill_buffer(
-                    reservoir_buf.at(0),
-                    total_reservoirs * self.reservoir_size as u64,
-                    0,
-                );
-            }
+            transfer.fill_buffer(
+                self.targets.reservoir_buf.at(0),
+                total_reservoirs * self.reservoir_size as u64,
+                0,
+            );
         }
 
         if !config.frozen {
@@ -1119,35 +1088,26 @@ impl Renderer {
         let (cur, prev) = self.work_indices();
 
         if let mut pass = command_encoder.compute() {
-            let mut pc = pass.with(&self.fill_pipeline);
-            let groups = self.fill_pipeline.get_dispatch_for(self.surface_size);
-            pc.bind(
-                0,
-                &FillData {
-                    camera: self.targets.camera_params[cur],
-                    prev_camera: self.targets.camera_params[prev],
-                    debug,
-                    acc_struct: self.acceleration_structure,
-                    hit_entries: self.hit_buffer.into(),
-                    index_buffers: &self.index_buffers,
-                    vertex_buffers: &self.vertex_buffers,
-                    textures: &self.textures,
-                    sampler_linear: self.samplers.linear,
-                    debug_buf: self.debug.buffer_resource(),
-                    out_depth: self.targets.depth.views[cur],
-                    out_basis: self.targets.basis.views[cur],
-                    out_flat_normal: self.targets.flat_normal.views[cur],
-                    out_albedo: self.targets.albedo.views[0],
-                    out_motion: self.targets.motion.views[0],
-                    out_debug: self.targets.debug.views[0],
-                },
-            );
-            pc.dispatch(groups);
-        }
-
-        if let mut pass = command_encoder.compute() {
+            let grid_scale = {
+                let limit = ray_config.group_mixer;
+                let r = self.frame_index as u32 ^ 0x5A;
+                [r % limit + 1, (r / limit) % limit + 1]
+            };
+            let groups = {
+                let wg_size = self.main_pipeline.get_workgroup_size();
+                let cluster_size = [
+                    wg_size[0] * grid_scale[0],
+                    wg_size[1] * grid_scale[1],
+                    wg_size[2],
+                ];
+                let clusters = self.surface_size.group_by(cluster_size);
+                [
+                    clusters[0] * grid_scale[0],
+                    clusters[1] * grid_scale[1],
+                    clusters[2],
+                ]
+            };
             let mut pc = pass.with(&self.main_pipeline);
-            let groups = self.main_pipeline.get_dispatch_for(self.surface_size);
             pc.bind(
                 0,
                 &MainData {
@@ -1163,9 +1123,10 @@ impl Renderer {
                         temporal_history: ray_config.temporal_history,
                         spatial_taps: ray_config.spatial_taps,
                         spatial_tap_history: ray_config.spatial_tap_history,
-                        spatial_radius: ray_config.spatial_radius,
+                        spatial_min_distance: ray_config.spatial_min_distance,
                         t_start: ray_config.t_start,
                         use_motion_vectors: (self.frame_scene_built == self.frame_index) as u32,
+                        grid_scale,
                     },
                     acc_struct: self.acceleration_structure,
                     prev_acc_struct: if self.frame_scene_built < self.frame_index
@@ -1176,20 +1137,24 @@ impl Renderer {
                     } else {
                         self.prev_acceleration_structure
                     },
+                    hit_entries: self.hit_buffer.into(),
+                    index_buffers: &self.index_buffers,
+                    vertex_buffers: &self.vertex_buffers,
+                    textures: &self.textures,
                     sampler_linear: self.samplers.linear,
                     sampler_nearest: self.samplers.nearest,
                     env_map: self.env_map.main_view,
                     env_weights: self.env_map.weight_view,
-                    t_depth: self.targets.depth.views[cur],
                     t_prev_depth: self.targets.depth.views[prev],
-                    t_basis: self.targets.basis.views[cur],
                     t_prev_basis: self.targets.basis.views[prev],
-                    t_flat_normal: self.targets.flat_normal.views[cur],
                     t_prev_flat_normal: self.targets.flat_normal.views[prev],
-                    t_motion: self.targets.motion.views[0],
                     debug_buf: self.debug.buffer_resource(),
-                    reservoirs: self.targets.reservoir_buf[cur].into(),
-                    prev_reservoirs: self.targets.reservoir_buf[prev].into(),
+                    reservoirs: self.targets.reservoir_buf.into(),
+                    out_depth: self.targets.depth.views[cur],
+                    out_basis: self.targets.basis.views[cur],
+                    out_flat_normal: self.targets.flat_normal.views[cur],
+                    out_albedo: self.targets.albedo.views[0],
+                    out_motion: self.targets.motion.views[0],
                     out_diffuse: self.targets.light_diffuse.views[cur],
                     out_debug: self.targets.debug.views[0],
                 },
diff --git a/examples/scene/main.rs b/examples/scene/main.rs
index f09de24a..903a30ca 100644
--- a/examples/scene/main.rs
+++ b/examples/scene/main.rs
@@ -264,7 +264,8 @@ impl Example {
                 temporal_history: 10,
                 spatial_taps: 1,
                 spatial_tap_history: 5,
-                spatial_radius: 10,
+                spatial_min_distance: 4,
+                group_mixer: 10,
                 t_start: 0.1,
             },
             denoiser_enabled: true,
@@ -582,6 +583,8 @@ impl Example {
             return;
         }
 
+        ui.checkbox(&mut self.track_hot_reloads, "Hot reloading");
+
         let mut selection = blade_render::SelectionInfo::default();
         if self.debug.mouse_pos.is_some() {
             selection = self.renderer.read_debug_selection_info();
diff --git a/src/lib.rs b/src/lib.rs
index 5c8320b6..b14b8ebe 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -489,7 +489,8 @@ impl Engine {
                 temporal_history: 10,
                 spatial_taps: 1,
                 spatial_tap_history: 5,
-                spatial_radius: 10,
+                spatial_min_distance: 4,
+                group_mixer: 10,
                 t_start: 0.01,
             },
             denoiser_enabled: true,