From 70a9233f4c0a77c202409dfcfa95e1957b7e3ce6 Mon Sep 17 00:00:00 2001
From: Chris Lewin <clewin@ea.com>
Date: Thu, 8 Aug 2024 17:03:20 +0100
Subject: [PATCH] G2P2G optimization

---
 shaders/bukkit.inc.wgsl          |  23 ++
 shaders/bukkitAllocate.wgsl      |  58 +++++
 shaders/bukkitCount.wgsl         |  36 ++++
 shaders/bukkitInsert.wgsl        |  42 ++++
 shaders/dispatch.inc.wgsl        |   5 +
 shaders/g2p2g.wgsl               | 349 +++++++++++++++++++++++++++++++
 shaders/gridToParticle.wgsl      |  88 --------
 shaders/gridUpdate.wgsl          |  94 ---------
 shaders/gridZero.wgsl            |  25 ---
 shaders/particleEmit.wgsl        |   2 +-
 shaders/particleIntegrate.wgsl   |   2 +-
 shaders/particleToGrid.wgsl      |  66 ------
 shaders/particleUpdatePBMPM.wgsl |  88 --------
 shaders/setIndirectArgs.wgsl     |   5 -
 src/gpu.js                       |  69 ++++--
 src/main.js                      |   9 +-
 src/shader.js                    |  12 +-
 src/sim.js                       | 150 ++++++++++---
 src/time.js                      |   5 +-
 src/ui.js                        |   6 +-
 20 files changed, 703 insertions(+), 431 deletions(-)
 create mode 100644 shaders/bukkit.inc.wgsl
 create mode 100644 shaders/bukkitAllocate.wgsl
 create mode 100644 shaders/bukkitCount.wgsl
 create mode 100644 shaders/bukkitInsert.wgsl
 create mode 100644 shaders/g2p2g.wgsl
 delete mode 100644 shaders/gridToParticle.wgsl
 delete mode 100644 shaders/gridUpdate.wgsl
 delete mode 100644 shaders/gridZero.wgsl
 delete mode 100644 shaders/particleToGrid.wgsl
 delete mode 100644 shaders/particleUpdatePBMPM.wgsl
diff --git a/shaders/bukkit.inc.wgsl b/shaders/bukkit.inc.wgsl
new file mode 100644
index 0000000..99812eb
--- /dev/null
+++ b/shaders/bukkit.inc.wgsl
@@ -0,0 +1,23 @@
+//-----------------------------------------------------------------------------
+// Copyright (c) 2024 Electronic Arts.  All rights reserved.
+//-----------------------------------------------------------------------------
+
+//!include simConstants.inc
+// Flattens a 2D bukkit address into a row-major linear index (row stride = bukkitCountX).
+fn bukkitAddressToIndex(address: vec2u, bukkitCountX: u32) -> u32
+{
+    return address.y*bukkitCountX + address.x;
+}
+// Bukkit address containing `position`. floor() keeps slightly negative coordinates at -1 (so callers reject them) instead of truncating them to 0.
+fn positionToBukkitId(position: vec2f) -> vec2i
+{
+    return vec2i(floor(position / f32(BukkitSize)));
+}
+
+struct BukkitThreadData // One entry per g2p2g workgroup, written by bukkitAllocate
+{
+    rangeStart: u32, // Index of this workgroup's first entry in the bukkit particle-index buffer
+    rangeCount: u32, // Number of particles this workgroup processes
+    bukkitX: u32, // X address of the bukkit the particles belong to
+    bukkitY: u32, // Y address of the bukkit the particles belong to
+};
\ No newline at end of file
diff --git a/shaders/bukkitAllocate.wgsl b/shaders/bukkitAllocate.wgsl
new file mode 100644
index 0000000..83135ee
--- /dev/null
+++ b/shaders/bukkitAllocate.wgsl
@@ -0,0 +1,58 @@
+//-----------------------------------------------------------------------------
+// Copyright (c) 2024 Electronic Arts.  All rights reserved.
+//-----------------------------------------------------------------------------
+
+//!include dispatch.inc
+//!include simConstants.inc
+//!include bukkit.inc
+
+@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
+@group(0) @binding(1) var<storage> g_bukkitCounts : array<u32>;
+@group(0) @binding(2) var<storage, read_write> g_bukkitIndirectDispatch : array<atomic<u32>>;
+@group(0) @binding(3) var<storage, read_write> g_bukkitThreadData : array<BukkitThreadData>;
+@group(0) @binding(4) var<storage, read_write> g_bukkitParticleAlloctor : array<atomic<u32>>;
+@group(0) @binding(5) var<storage, read_write> g_bukkitIndexStart : array<u32>;
+// One thread per bukkit: reserves a contiguous particle range plus one BukkitThreadData entry per ParticleDispatchSize-sized chunk of the bukkit's particles.
+@compute @workgroup_size(GridDispatchSize, GridDispatchSize)
+fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
+{
+    if(id.x >= g_simConstants.bukkitCountX || id.y >= g_simConstants.bukkitCountY)
+    {
+        return;
+    }
+
+    let bukkitIndex = bukkitAddressToIndex(id.xy, g_simConstants.bukkitCountX);
+
+    let bukkitCount = g_bukkitCounts[bukkitIndex];
+    let bukkitCountResidual = bukkitCount % ParticleDispatchSize;
+
+    if(bukkitCount == 0)
+    {
+        return;
+    }
+    // Number of ParticleDispatchSize-sized chunks needed to cover every particle in this bukkit
+    let dispatchCount = divUp(bukkitCount, ParticleDispatchSize);
+    // Atomically reserve dispatchCount consecutive thread-data entries and bukkitCount consecutive particle slots
+    let dispatchStartIndex = atomicAdd(&g_bukkitIndirectDispatch[0], dispatchCount);
+    let particleStartIndex = atomicAdd(&g_bukkitParticleAlloctor[0], bukkitCount);
+
+    g_bukkitIndexStart[bukkitIndex] = particleStartIndex;
+
+    for(var i: u32 = 0; i < dispatchCount; i++)
+    {
+        // Group count is equal to ParticleDispatchSize except for the final dispatch for this
+        // bukkit in which case it's equal to the residual count
+        var groupCount : u32 = ParticleDispatchSize;
+        if(bukkitCountResidual != 0 && i == dispatchCount - 1)
+        {
+            groupCount = bukkitCountResidual;
+        }
+
+        g_bukkitThreadData[i + dispatchStartIndex] = BukkitThreadData(
+            particleStartIndex + i * ParticleDispatchSize,
+            groupCount,
+            id.x,
+            id.y
+        );
+    }
+}
\ No newline at end of file
diff --git a/shaders/bukkitCount.wgsl b/shaders/bukkitCount.wgsl
new file mode 100644
index 0000000..6d3eef5
--- /dev/null
+++ b/shaders/bukkitCount.wgsl
@@ -0,0 +1,36 @@
+//-----------------------------------------------------------------------------
+// Copyright (c) 2024 Electronic Arts.  All rights reserved.
+//-----------------------------------------------------------------------------
+
+//!include dispatch.inc
+//!include simConstants.inc
+//!include particle.inc
+//!include bukkit.inc
+
+@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
+@group(0) @binding(1) var<storage> g_particleCount : array<u32>;
+@group(0) @binding(2) var<storage> g_particles : array<Particle>;
+@group(0) @binding(3) var<storage, read_write> g_bukkitCounts : array<atomic<u32>>;
+// One thread per particle: increments the counter of the bukkit that contains the particle's position.
+@compute @workgroup_size(ParticleDispatchSize)
+fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
+{
+    if(id.x >= g_particleCount[0])
+    {
+        return;
+    }
+
+    let particle = g_particles[id.x];
+    let position = particle.position;
+
+    let particleBukkit = positionToBukkitId(position);
+    // Particles whose bukkit address lies outside the bukkit grid are not counted
+    if(any(particleBukkit < vec2i(0)) || u32(particleBukkit.x) >= g_simConstants.bukkitCountX || u32(particleBukkit.y) >= g_simConstants.bukkitCountY)
+    {
+        return;
+    }
+
+    let bukkitIndex = bukkitAddressToIndex(vec2u(particleBukkit), g_simConstants.bukkitCountX);
+
+    atomicAdd(&g_bukkitCounts[bukkitIndex], 1);    
+}
\ No newline at end of file
diff --git a/shaders/bukkitInsert.wgsl b/shaders/bukkitInsert.wgsl
new file mode 100644
index 0000000..3fb4eef
--- /dev/null
+++ b/shaders/bukkitInsert.wgsl
@@ -0,0 +1,42 @@
+//-----------------------------------------------------------------------------
+// Copyright (c) 2024 Electronic Arts.  All rights reserved.
+//-----------------------------------------------------------------------------
+
+//!include dispatch.inc
+//!include simConstants.inc
+//!include bukkit.inc
+//!include particle.inc
+
+@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
+@group(0) @binding(1) var<storage> g_particleCount : array<u32>;
+@group(0) @binding(2) var<storage, read_write> g_particleInsertCounters : array<atomic<u32>>;
+@group(0) @binding(3) var<storage> g_particles : array<Particle>;
+@group(0) @binding(4) var<storage, read_write> g_particleData : array<u32>;
+@group(0) @binding(5) var<storage> g_bukkitIndexStart : array<u32>;
+// One thread per particle: writes the particle's index into the range that was allocated for its bukkit.
+@compute @workgroup_size(ParticleDispatchSize)
+fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
+{
+    if(id.x >= g_particleCount[0])
+    {
+        return;
+    }
+
+    let particle = g_particles[id.x];
+    let position = particle.position;
+
+    let particleBukkit = positionToBukkitId(position);
+    // Particles whose bukkit address lies outside the bukkit grid are not inserted
+    if(any(particleBukkit < vec2i(0)) || u32(particleBukkit.x) >= g_simConstants.bukkitCountX || u32(particleBukkit.y) >= g_simConstants.bukkitCountY)
+    {
+        return;
+    }
+
+    let bukkitIndex = bukkitAddressToIndex(vec2u(particleBukkit), g_simConstants.bukkitCountX);
+    let bukkitIndexStart = g_bukkitIndexStart[bukkitIndex];
+
+    let particleInsertCounter = atomicAdd(&g_particleInsertCounters[bukkitIndex], 1u);
+
+    // The value returned by atomicAdd is this particle's slot within its bukkit's range
+    g_particleData[particleInsertCounter + bukkitIndexStart] = id.x;
+}
\ No newline at end of file
diff --git a/shaders/dispatch.inc.wgsl b/shaders/dispatch.inc.wgsl
index c0ac63e..126369b 100644
--- a/shaders/dispatch.inc.wgsl
+++ b/shaders/dispatch.inc.wgsl
@@ -3,3 +3,8 @@
 //-----------------------------------------------------------------------------
 
 //!insert DispatchSizes
+// Integer division of threadCount by divisor, rounded up to the next whole number.
+fn divUp(threadCount : u32, divisor : u32) -> u32
+{
+    return (threadCount + divisor - 1) / divisor;
+}
diff --git a/shaders/g2p2g.wgsl b/shaders/g2p2g.wgsl
new file mode 100644
index 0000000..e799d9e
--- /dev/null
+++ b/shaders/g2p2g.wgsl
@@ -0,0 +1,349 @@
+//-----------------------------------------------------------------------------
+// Copyright (c) 2024 Electronic Arts.  All rights reserved.
+//-----------------------------------------------------------------------------
+
+//!include dispatch.inc
+//!include simConstants.inc
+//!include bukkit.inc
+//!include particle.inc
+//!include shapes.inc
+
+@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
+@group(0) @binding(1) var<storage, read_write> g_particles : array<Particle>;
+@group(0) @binding(2) var<storage> g_gridSrc : array<i32>;
+@group(0) @binding(3) var<storage, read_write> g_gridDst : array<atomic<i32>>;
+@group(0) @binding(4) var<storage> g_bukkitThreadData : array<BukkitThreadData>;
+@group(0) @binding(5) var<storage> g_bukkitParticleData : array<u32>;
+@group(0) @binding(6) var<storage> g_shapes : array<SimShape>;
+
+const TotalBukkitEdgeLength = BukkitSize + BukkitHaloSize*2;
+const TileDataSizePerEdge = TotalBukkitEdgeLength * 4;
+const TileDataSize = TileDataSizePerEdge*TileDataSizePerEdge;
+var<workgroup> s_tileData: array<atomic<i32>, TileDataSize>;
+// Accumulation target for the P2G atomicAdds. The WebGPU spec zero-initializes workgroup
+// memory, but csMain also clears these entries explicitly before its first barrier.
+var<workgroup> s_tileDataDst: array<atomic<i32>, TileDataSize>;
+// Offset of a tile-local vertex's first channel in s_tileData / s_tileDataDst (row-major, 4 i32 channels per vertex).
+fn localGridIndex(index: vec2u) -> u32
+{
+    return (index.y * TotalBukkitEdgeLength + index.x)*4;
+}
+// Fused G2P -> particle update -> P2G pass: one workgroup per BukkitThreadData entry, staging the bukkit's grid tile (plus halo) in workgroup memory.
+@compute @workgroup_size(ParticleDispatchSize)
+fn csMain( @builtin(local_invocation_index) indexInGroup: u32, @builtin(workgroup_id) groupId: vec3<u32> )
+{
+    let threadData = g_bukkitThreadData[groupId.x];
+
+    // Load grid
+    let localGridOrigin = BukkitSize*vec2i(vec2u(threadData.bukkitX, threadData.bukkitY)) - vec2i(BukkitHaloSize);
+    let idInGroup = vec2i(i32(indexInGroup) % TotalBukkitEdgeLength, i32(indexInGroup) / TotalBukkitEdgeLength);
+    let gridVertex = idInGroup + localGridOrigin;
+    let gridPosition = vec2f(gridVertex);
+
+    var dx = 0.0;
+    var dy = 0.0;
+    var w = 0.0;
+    var v = 0.0;
+    // Valid vertex indices are [0, gridSize) per axis (exclusive upper bound), so tile vertices hanging past the grid edge are skipped
+    let gridVertexIsValid = all(gridVertex >= vec2i(0)) && all(gridVertex < vec2i(g_simConstants.gridSize));
+
+    var appliedGridCorrection = false; // NOTE: written below but never read in this shader
+
+    if(gridVertexIsValid)
+    {
+        let gridVertexAddress = gridVertexIndex(vec2u(gridVertex), g_simConstants.gridSize);
+
+        // Load from grid
+        dx = decodeFixedPoint(g_gridSrc[gridVertexAddress + 0], g_simConstants.fixedPointMultiplier);
+        dy = decodeFixedPoint(g_gridSrc[gridVertexAddress + 1], g_simConstants.fixedPointMultiplier);
+        w = decodeFixedPoint(g_gridSrc[gridVertexAddress + 2],  g_simConstants.fixedPointMultiplier);
+        v = decodeFixedPoint(g_gridSrc[gridVertexAddress + 3],  g_simConstants.fixedPointMultiplier);
+
+        // Grid update
+        if(w < 1e-5f)
+        {
+            dx = 0;
+            dy = 0;
+        }
+        else
+        {
+            // Perform mass weighting to get grid displacement
+            dx = dx / w;
+            dy = dy / w;
+        }
+
+        //dx = dx * 0.995;
+        //dy = dy * 0.995;
+
+
+
+
+        var gridDisplacement = vec2f(dx, dy);
+
+        // Collision detection against collider shapes (NOTE: currently disabled - the loop body below is commented out)
+        for(var shapeIndex = 0u; shapeIndex < g_simConstants.shapeCount; shapeIndex++)
+        {
+            let shape = g_shapes[shapeIndex];
+
+            // if(shape.functionality != ShapeFunctionCollider)
+            // {
+            //     continue;
+            // }
+
+            // let displacedGridPosition = gridPosition + gridDisplacement;
+
+            // let collideResult = collide(shape, displacedGridPosition);
+
+            // if(collideResult.collides)
+            // {
+            //     let gap = min(0, dot(collideResult.normal, collideResult.pointOnCollider - gridPosition));
+            //     let penetration = dot(collideResult.normal, gridDisplacement) - gap;
+
+            //     // Prevent any further penetration in radial direction
+            //     let radialImpulse = max(penetration, 0);
+            //     gridDisplacement -= radialImpulse*collideResult.normal;
+            // }
+        }
+
+        // Collision detection against guardian shape
+
+        // Grid vertices near or inside the guardian region should have their displacement values
+        // corrected in order to prevent particles moving into the guardian.
+        // We do this by finding whether a grid vertex would be inside the guardian region after displacement
+        // with the current velocity and, if it is, setting the displacement so that no further penetration can occur.
+        let displacedGridPosition = gridPosition + gridDisplacement;
+        let projectedGridPosition = projectInsideGuardian(displacedGridPosition, g_simConstants.gridSize, GuardianSize+1);
+        let projectedDifference = projectedGridPosition - displacedGridPosition;
+
+        if(projectedDifference.x != 0)
+        {
+            gridDisplacement.x = 0;
+            gridDisplacement.y = 0;
+            appliedGridCorrection = true;
+        }
+
+        if(projectedDifference.y != 0)
+        {
+            gridDisplacement.x = 0;
+            gridDisplacement.y = 0;
+            appliedGridCorrection = true;
+        }
+
+        dx = gridDisplacement.x;
+        dy = gridDisplacement.y;
+    }
+
+    // Save grid to local memory
+    let tileDataIndex = localGridIndex(vec2u(idInGroup));
+    atomicStore(&s_tileData[tileDataIndex], encodeFixedPoint(dx, g_simConstants.fixedPointMultiplier));
+    atomicStore(&s_tileData[tileDataIndex+1], encodeFixedPoint(dy, g_simConstants.fixedPointMultiplier));
+    atomicStore(&s_tileData[tileDataIndex+2], encodeFixedPoint(w, g_simConstants.fixedPointMultiplier));
+    atomicStore(&s_tileData[tileDataIndex+3], encodeFixedPoint(v, g_simConstants.fixedPointMultiplier));
+
+    atomicStore(&s_tileDataDst[tileDataIndex], 0);
+    atomicStore(&s_tileDataDst[tileDataIndex+1], 0);
+    atomicStore(&s_tileDataDst[tileDataIndex+2], 0);
+    atomicStore(&s_tileDataDst[tileDataIndex+3], 0);
+
+    workgroupBarrier();
+
+    if(indexInGroup < threadData.rangeCount)
+    {
+        // Load Particle
+        let myParticleIndex = g_bukkitParticleData[threadData.rangeStart + indexInGroup];
+        //let myParticleIndex = indexInGroup + groupId.x*ParticleDispatchSize;
+        var particle = g_particles[myParticleIndex];
+
+        if(particle.enabled != 0.0)
+        {
+            var p = particle.position;
+            let weightInfo = quadraticWeightInit(p);
+
+            if(g_simConstants.iteration != 0)
+            {
+                // G2P
+                var B = ZeroMatrix;
+                var d = vec2f(0);
+                var volume = 0.0;
+                // Iterate over local 3x3 neighbourhood
+                for(var i = 0; i < 3; i++)
+                {
+                    for(var j = 0; j < 3; j++)
+                    {
+                        // Weight corresponding to this neighbourhood cell
+                        let weight = weightInfo.weights[i].x * weightInfo.weights[j].y;
+
+                        // 2d index of this cell in the grid
+                        let neighbourCellIndex = vec2i(weightInfo.cellIndex) + vec2i(i,j);
+
+                        // 2d index relative to the corner of the local grid
+                        let neighbourCellIndexLocal = neighbourCellIndex - localGridOrigin;
+
+                        // Linear index in the local grid
+                        let gridVertexIdx = localGridIndex(vec2u(neighbourCellIndexLocal));
+
+                        let weightedDisplacement = weight * vec2f(
+                            decodeFixedPoint(atomicLoad(&s_tileData[gridVertexIdx + 0]), g_simConstants.fixedPointMultiplier),
+                            decodeFixedPoint(atomicLoad(&s_tileData[gridVertexIdx + 1]), g_simConstants.fixedPointMultiplier)
+                        );
+
+                        let offset = vec2f(neighbourCellIndex) - p + 0.5;
+                        B += outerProduct(weightedDisplacement, offset);
+                        d += weightedDisplacement;
+
+                        // This is only required if we are going to mix in the grid volume to the liquid volume
+                        if(g_simConstants.useGridVolumeForLiquid != 0)
+                        {
+                            volume += weight * decodeFixedPoint(atomicLoad(&s_tileData[gridVertexIdx + 3]), g_simConstants.fixedPointMultiplier);
+                        }
+                    }
+                }
+
+                // Using standard MPM volume integration for liquids can lead to slow volume loss over time
+                // especially when particles are undergoing a lot of shearing motion.
+                // We can recover an objective measure of volume from the grid directly.
+                // Here we mix it in to the integrated volume, but only if the liquid is compressed.
+                // This is because the behaviour in tension of the grid volume and mpm volume is quite different.
+                // Note this runs every iteration in the PBMPM solver but we only really require it to happen occasionally
+                // because the MPM integration doesn't lose volume very fast.
+                if(g_simConstants.useGridVolumeForLiquid != 0)
+                {
+                    volume = 1.0/volume;
+                    if(volume < 1)
+                    {
+                        particle.liquidDensity = mix(particle.liquidDensity, volume, 0.1);
+                    }
+                }
+
+
+                particle.deformationDisplacement = B * 4.0;
+                particle.displacement = d;
+
+                // Save particle
+                g_particles[myParticleIndex] = particle;
+            }
+
+
+
+
+
+            //if(g_simConstants.iteration != g_simConstants.iterationCount-1)
+            {
+                // Particle update
+                if(particle.material == MaterialLiquid)
+                {
+                    // Simple liquid viscosity: just remove deviatoric part of the deformation displacement
+                    let deviatoric = -1.0*(particle.deformationDisplacement + transpose(particle.deformationDisplacement));
+                    particle.deformationDisplacement += g_simConstants.liquidViscosity*0.5*deviatoric;
+
+                    // Volume preservation constraint:
+                    // we want to generate hydrostatic impulses with the form alpha*I
+                    // and we want the liquid volume integration (see particleIntegrate) to yield 1 = (1+tr(alpha*I + D))*det(F) at the end of the timestep.
+                    // where det(F) is stored as particle.liquidDensity.
+                    // Rearranging, we get the below expression that drives the deformation displacement towards preserving the volume.
+                    let alpha = 0.5*(1.0/particle.liquidDensity - tr(particle.deformationDisplacement) - 1.0);
+                    particle.deformationDisplacement += g_simConstants.liquidRelaxation*alpha*Identity;
+                }
+                else if(particle.material == MaterialElastic || particle.material == MaterialVisco)
+                {
+                    let F =  (Identity + particle.deformationDisplacement) * particle.deformationGradient;
+
+                    var svdResult = svd(F);
+
+                    // Closest matrix to F with det == 1
+                    let df = det(F);
+                    let cdf = clamp(abs(df), 0.1, 1000);
+                    let Q = (1.0f/(sign(df)*sqrt(cdf)))*F;
+                    // Interpolate between the two target shapes
+                    let alpha = g_simConstants.elasticityRatio;
+                    let tgt = alpha*(svdResult.U*svdResult.Vt) + (1.0-alpha)*Q;
+
+                    let diff = (tgt*inverse(particle.deformationGradient) - Identity) - particle.deformationDisplacement;
+                    particle.deformationDisplacement += g_simConstants.elasticRelaxation*diff;
+
+                }
+                else if(particle.material == MaterialSand)
+                {
+                    let F =  (Identity + particle.deformationDisplacement) * particle.deformationGradient;
+
+                    var svdResult = svd(F);
+
+                    if(particle.logJp == 0)
+                    {
+                        svdResult.Sigma = clamp(svdResult.Sigma, vec2f(1, 1), vec2f(1000, 1000));
+                    }
+
+                    // Closest matrix to F with det == 1
+                    let df = det(F);
+                    let cdf = clamp(abs(df), 0.1, 1);
+                    let Q = (1.0f/(sign(df)*sqrt(cdf)))*F;
+                    // Interpolate between the two target shapes
+                    let alpha = g_simConstants.elasticityRatio;
+                    let tgt = alpha*(svdResult.U*mat2x2f(svdResult.Sigma.x, 0, 0, svdResult.Sigma.y)*svdResult.Vt) + (1.0-alpha)*Q;
+
+                    let diff = (tgt*inverse(particle.deformationGradient) - Identity) - particle.deformationDisplacement;
+                    particle.deformationDisplacement += g_simConstants.elasticRelaxation*diff;
+
+                    let deviatoric = -1.0*(particle.deformationDisplacement + transpose(particle.deformationDisplacement));
+                    particle.deformationDisplacement += g_simConstants.liquidViscosity*0.5*deviatoric;
+                }
+
+                // P2G
+
+
+                // Iterate over local 3x3 neighbourhood
+                for(var i = 0; i < 3; i++)
+                {
+                    for(var j = 0; j < 3; j++)
+                    {
+                        // Weight corresponding to this neighbourhood cell
+                        let weight = weightInfo.weights[i].x * weightInfo.weights[j].y;
+
+                        // 2d index of this cell in the grid
+                        let neighbourCellIndex = vec2i(weightInfo.cellIndex) + vec2i(i,j);
+
+                        // 2d index relative to the corner of the local grid
+                        let neighbourCellIndexLocal = neighbourCellIndex - localGridOrigin;
+
+                        // Linear index in the local grid
+                        let gridVertexIdx = localGridIndex(vec2u(neighbourCellIndexLocal));
+
+                        let offset = vec2f(neighbourCellIndex) - p + 0.5;
+
+                        let weightedMass = weight * particle.mass;
+                        let momentum = weightedMass * (particle.displacement +  particle.deformationDisplacement * offset);
+
+                        atomicAdd(&s_tileDataDst[gridVertexIdx + 0], encodeFixedPoint(momentum.x, g_simConstants.fixedPointMultiplier));
+                        atomicAdd(&s_tileDataDst[gridVertexIdx + 1], encodeFixedPoint(momentum.y, g_simConstants.fixedPointMultiplier));
+                        atomicAdd(&s_tileDataDst[gridVertexIdx + 2], encodeFixedPoint(weightedMass, g_simConstants.fixedPointMultiplier));
+
+                        // This is only required if we are going to mix in the grid volume to the liquid volume
+                        if(g_simConstants.useGridVolumeForLiquid != 0)
+                        {
+                            atomicAdd(&s_tileDataDst[gridVertexIdx + 3], encodeFixedPoint(particle.volume * weight, g_simConstants.fixedPointMultiplier));
+                        }
+                    }
+                }
+            }
+
+        }
+    }
+
+    workgroupBarrier();
+
+    // Save Grid
+    if(gridVertexIsValid)
+    {
+        let gridVertexAddress = gridVertexIndex(vec2u(gridVertex), g_simConstants.gridSize);
+
+        let dxi = atomicLoad(&s_tileDataDst[tileDataIndex]);
+        let dyi = atomicLoad(&s_tileDataDst[tileDataIndex+1]);
+        let wi = atomicLoad(&s_tileDataDst[tileDataIndex+2]);
+        let vi = atomicLoad(&s_tileDataDst[tileDataIndex+3]);
+
+        atomicAdd(&g_gridDst[gridVertexAddress + 0], dxi);
+        atomicAdd(&g_gridDst[gridVertexAddress + 1], dyi);
+        atomicAdd(&g_gridDst[gridVertexAddress + 2], wi);
+        atomicAdd(&g_gridDst[gridVertexAddress + 3], vi);
+    }
+}
\ No newline at end of file
diff --git a/shaders/gridToParticle.wgsl b/shaders/gridToParticle.wgsl
deleted file mode 100644
index 859e0e5..0000000
--- a/shaders/gridToParticle.wgsl
+++ /dev/null
@@ -1,88 +0,0 @@
-//-----------------------------------------------------------------------------
-// Copyright (c) 2024 Electronic Arts.  All rights reserved.
-//-----------------------------------------------------------------------------
-
-//!include dispatch.inc
-//!include particle.inc
-//!include simConstants.inc
-//!include matrix.inc
-
-@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
-@group(0) @binding(1) var<storage> g_particleCount : array<u32>;
-@group(0) @binding(2) var<storage, read_write> g_particles : array<Particle>;
-@group(0) @binding(3) var<storage> g_grid : array<i32>;
-
-@compute @workgroup_size(ParticleDispatchSize, 1, 1)
-fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
-{
-    if(id.x >= g_particleCount[0])
-    {
-        return;
-    }
-
-    var particle = g_particles[id.x];
-
-    if(particle.enabled == 0)
-    {
-        return;
-    }
-
-    var p = particle.position;
-
-    let weightInfo = quadraticWeightInit(p);
-
-    var B = ZeroMatrix;
-    var d = vec2f(0);
-    var volume = 0.0;
-    // Iterate over local 3x3 neigbourhood
-    for(var i = 0; i < 3; i++)
-    {
-        for(var j = 0; j < 3; j++)
-        {
-            // Weight corresponding to this neighbourhood cell
-            let weight = weightInfo.weights[i].x * weightInfo.weights[j].y;
-
-            // 2d index of this cell in the grid
-            let neighbourCellIndex = vec2u(vec2i(weightInfo.cellIndex) + vec2i(i,j));
-
-            // Linear index in the buffer
-            let gridVertexIdx = gridVertexIndex(neighbourCellIndex, g_simConstants.gridSize);
-
-            let weightedDisplacement = weight * vec2f(
-                decodeFixedPoint(g_grid[gridVertexIdx + 0], g_simConstants.fixedPointMultiplier),
-                decodeFixedPoint(g_grid[gridVertexIdx + 1], g_simConstants.fixedPointMultiplier)
-            );
-            
-            let offset = vec2f(neighbourCellIndex) - p + 0.5;
-            B += outerProduct(weightedDisplacement, offset);
-            d += weightedDisplacement;
-
-            // This is only required if we are going to mix in the grid volume to the liquid volume
-            if(g_simConstants.useGridVolumeForLiquid != 0)
-            {
-                volume += weight * decodeFixedPoint(g_grid[gridVertexIdx + 3], g_simConstants.fixedPointMultiplier);
-            }
-        }
-    }
-
-    particle.deformationDisplacement = B * 4.0;
-    particle.displacement = d;
-    
-    // Using standard MPM volume integration for liquids can lead to slow volume loss over time
-    // especially when particles are undergoing a lot of shearing motion.
-    // We can recover an objective measure of volume from the grid directly.
-    // Here we mix it in to the integrated volume, but only if the liquid is compressed.
-    // This is because the behaviour in tension of the grid volume and mpm volume is quite different.
-    // Note this runs every iteration in the PBMPM solver but we only really require it to happen occasionally
-    // because the MPM integration doesn't lose volume very fast.
-    if(g_simConstants.useGridVolumeForLiquid != 0)
-    {
-        volume = 1.0/volume;
-        if(volume < 1)
-        {
-            particle.liquidDensity = mix(particle.liquidDensity, volume, 0.1);
-        }
-    }
-
-    g_particles[id.x] = particle;
-}
\ No newline at end of file
diff --git a/shaders/gridUpdate.wgsl b/shaders/gridUpdate.wgsl
deleted file mode 100644
index 6a53977..0000000
--- a/shaders/gridUpdate.wgsl
+++ /dev/null
@@ -1,94 +0,0 @@
-//-----------------------------------------------------------------------------
-// Copyright (c) 2024 Electronic Arts.  All rights reserved.
-//-----------------------------------------------------------------------------
-
-//!include dispatch.inc
-//!include simConstants.inc
-//!include particle.inc
-//!include shapes.inc
-
-@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
-@group(0) @binding(1) var<storage, read_write> g_grid : array<i32>;
-@group(0) @binding(2) var<storage> g_shapes : array<SimShape>;
-
-@compute @workgroup_size(GridDispatchSize, GridDispatchSize)
-fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
-{
-    if(any(id.xy >= g_simConstants.gridSize))
-    {
-        return;
-    }
-
-    let gridVertexAddress = gridVertexIndex(id.xy, g_simConstants.gridSize);
-
-    // Load from grid
-    var dx = decodeFixedPoint(g_grid[gridVertexAddress + 0], g_simConstants.fixedPointMultiplier);
-    var dy = decodeFixedPoint(g_grid[gridVertexAddress + 1], g_simConstants.fixedPointMultiplier);
-    var w = decodeFixedPoint(g_grid[gridVertexAddress + 2], g_simConstants.fixedPointMultiplier);
-
-    if(w < 1e-5f)
-    {
-        dx = 0;
-        dy = 0;
-    }
-
-    // Perform mass weighting to get grid displacement
-    dx = dx / w;
-    dy = dy / w;
-
-    var gridDisplacement = vec2f(dx, dy);
-
-
-    // Collision detection against collider shapes
-    for(var shapeIndex = 0u; shapeIndex < g_simConstants.shapeCount; shapeIndex++)
-    {
-        let shape = g_shapes[shapeIndex];
-
-        if(shape.functionality != ShapeFunctionCollider)
-        {
-            continue;
-        }
-
-        let gridPosition = vec2f(id.xy);
-        let displacedGridPosition = gridPosition + gridDisplacement;
-
-        let collideResult = collide(shape, displacedGridPosition);
-
-        if(collideResult.collides)
-        {
-            let gap = min(0, dot(collideResult.normal, collideResult.pointOnCollider - gridPosition));
-            let penetration = dot(collideResult.normal, gridDisplacement) - gap;
-
-            // Prevent any further penetration in radial direction
-            let radialImpulse = max(penetration, 0);
-            gridDisplacement -= radialImpulse*collideResult.normal;
-        }
-    }
-
-    // Collision detection against guardian shape
-
-    // Grid vertices near or inside the guardian region should have their displacenent values
-    // corrected in order to prevent particles moving into the guardian.
-    // We do this by finding whether a grid vertex would be inside the guardian region after displacement
-    // with the current velocity and, if it is, setting the displacement so that no further penetration can occur.
-    let gridPosition = vec2f(id.xy);
-    let displacedGridPosition = gridPosition + gridDisplacement;
-    let projectedGridPosition = projectInsideGuardian(displacedGridPosition, g_simConstants.gridSize, GuardianSize+1);
-    let projectedDifference = projectedGridPosition - displacedGridPosition;
-
-    if(projectedDifference.x != 0)
-    {
-        gridDisplacement.x = 0;
-        gridDisplacement.y = 0;
-    }
-
-    if(projectedDifference.y != 0)
-    {
-        gridDisplacement.x = 0;
-        gridDisplacement.y = 0;
-    }
-
-    // Save back to grid
-    g_grid[gridVertexAddress + 0] = encodeFixedPoint(gridDisplacement.x , g_simConstants.fixedPointMultiplier);
-    g_grid[gridVertexAddress + 1] = encodeFixedPoint(gridDisplacement.y , g_simConstants.fixedPointMultiplier);
-}
\ No newline at end of file
diff --git a/shaders/gridZero.wgsl b/shaders/gridZero.wgsl
deleted file mode 100644
index 62d8fd3..0000000
--- a/shaders/gridZero.wgsl
+++ /dev/null
@@ -1,25 +0,0 @@
-//-----------------------------------------------------------------------------
-// Copyright (c) 2024 Electronic Arts.  All rights reserved.
-//-----------------------------------------------------------------------------
-
-//!include dispatch.inc
-//!include simConstants.inc
-
-@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
-@group(0) @binding(1) var<storage, read_write> g_grid : array<i32>;
-
-@compute @workgroup_size(GridDispatchSize, GridDispatchSize)
-fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
-{
-    if(any(id.xy >= g_simConstants.gridSize))
-    {
-        return;
-    }
-
-    let gridVertexAddress = gridVertexIndex(id.xy, g_simConstants.gridSize);
-
-    g_grid[gridVertexAddress + 0] = 0;
-    g_grid[gridVertexAddress + 1] = 0;
-    g_grid[gridVertexAddress + 2] = 0;
-    g_grid[gridVertexAddress + 3] = 0;
-}
\ No newline at end of file
diff --git a/shaders/particleEmit.wgsl b/shaders/particleEmit.wgsl
index e82fb15..29a107d 100644
--- a/shaders/particleEmit.wgsl
+++ b/shaders/particleEmit.wgsl
@@ -108,7 +108,7 @@ fn csMain( @builtin(global_invocation_id) id: vec3u )
             continue;
         }
 
-        let particleCountPerCellAxis = select(u32(g_simConstants.particlesPerCellAxis), 1, shape.emitMaterial == MaterialLiquid || shape.emitMaterial == MaterialSand);
+        let particleCountPerCellAxis = u32(g_simConstants.particlesPerCellAxis);
         let volumePerParticle = 1.0f / f32(particleCountPerCellAxis*particleCountPerCellAxis);
 
         var c = collide(shape, pos);
diff --git a/shaders/particleIntegrate.wgsl b/shaders/particleIntegrate.wgsl
index a663f4f..b3a40c4 100644
--- a/shaders/particleIntegrate.wgsl
+++ b/shaders/particleIntegrate.wgsl
@@ -143,7 +143,7 @@ fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
 
     // Gravity acceleration is normalized to the vertical size of the window.
     particle.displacement.y -= f32(g_simConstants.gridSize.y)*g_simConstants.gravityStrength*g_simConstants.deltaTime*g_simConstants.deltaTime;
-
+    
     // Free count may be negative because of emission. So make sure it is at last zero before incrementing.
     atomicMax(&g_freeCount[0], 0i);
 
diff --git a/shaders/particleToGrid.wgsl b/shaders/particleToGrid.wgsl
deleted file mode 100644
index 9e1190e..0000000
--- a/shaders/particleToGrid.wgsl
+++ /dev/null
@@ -1,66 +0,0 @@
-//-----------------------------------------------------------------------------
-// Copyright (c) 2024 Electronic Arts.  All rights reserved.
-//-----------------------------------------------------------------------------
-
-//!include dispatch.inc
-//!include particle.inc
-//!include simConstants.inc
-
-@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
-@group(0) @binding(1) var<storage> g_particleCount : array<u32>;
-@group(0) @binding(2) var<storage> g_particles : array<Particle>;
-@group(0) @binding(3) var<storage, read_write> g_grid : array<atomic<i32>>;
-
-@compute @workgroup_size(ParticleDispatchSize, 1, 1)
-fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
-{
-    if(id.x >= g_particleCount[0])
-    {
-        return;
-    }
-
-    let particle = g_particles[id.x];
-
-    if(particle.enabled == 0)
-    {
-        return;
-    }
-
-    var p = particle.position;
-    let d = particle.displacement;
-    let D = particle.deformationDisplacement;
-    let m = particle.mass;
-
-    let weightInfo = quadraticWeightInit(p);
-
-    // Iterate over local 3x3 neigbourhood
-    for(var i = 0; i < 3; i++)
-    {
-        for(var j = 0; j < 3; j++)
-        {
-            // Weight corresponding to this neighbourhood cell
-            let weight = weightInfo.weights[i].x * weightInfo.weights[j].y;
-            
-            // 2d index of this cell in the grid
-            let neighbourCellIndex = vec2u(vec2i(weightInfo.cellIndex) + vec2i(i,j));
-
-            // Linear index in the buffer
-            let gridVertexIdx = gridVertexIndex(neighbourCellIndex, g_simConstants.gridSize);
-            
-            let offset = vec2f(neighbourCellIndex) - p + 0.5;
-
-            let weightedMass = weight * m;
-            let momentum = weightedMass * (d +  D * offset);
-
-            atomicAdd(&g_grid[gridVertexIdx + 0], encodeFixedPoint(momentum.x, g_simConstants.fixedPointMultiplier));
-            atomicAdd(&g_grid[gridVertexIdx + 1], encodeFixedPoint(momentum.y, g_simConstants.fixedPointMultiplier));
-            atomicAdd(&g_grid[gridVertexIdx + 2], encodeFixedPoint(weightedMass, g_simConstants.fixedPointMultiplier));
-
-            // This is only required if we are going to mix in the grid volume to the liquid volume
-            if(g_simConstants.useGridVolumeForLiquid != 0)
-            {
-                atomicAdd(&g_grid[gridVertexIdx + 3], encodeFixedPoint(particle.volume * weight, g_simConstants.fixedPointMultiplier));
-            }
-        }
-    }
-}
\ No newline at end of file
diff --git a/shaders/particleUpdatePBMPM.wgsl b/shaders/particleUpdatePBMPM.wgsl
deleted file mode 100644
index 974cdc1..0000000
--- a/shaders/particleUpdatePBMPM.wgsl
+++ /dev/null
@@ -1,88 +0,0 @@
-//-----------------------------------------------------------------------------
-// Copyright (c) 2024 Electronic Arts.  All rights reserved.
-//-----------------------------------------------------------------------------
-
-//!include dispatch.inc
-//!include particle.inc
-//!include simConstants.inc
-//!include matrix.inc
-
-@group(0) @binding(0) var<uniform> g_simConstants : SimConstants;
-@group(0) @binding(1) var<storage> g_particleCount : array<u32>;
-@group(0) @binding(2) var<storage, read_write> g_particles : array<Particle>;
-
-@compute @workgroup_size(ParticleDispatchSize, 1, 1)
-fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
-{
-    if(id.x >= g_particleCount[0])
-    {
-        return;
-    }
-
-    var particle = g_particles[id.x];
-
-    if(particle.enabled == 0)
-    {
-        return;
-    }
-
-    if(particle.material == MaterialLiquid)
-    {
-        // Simple liquid viscosity: just remove deviatoric part of the deformation displacement
-        let deviatoric = -1.0*(particle.deformationDisplacement + transpose(particle.deformationDisplacement));
-        particle.deformationDisplacement += g_simConstants.liquidViscosity*0.5*deviatoric;
-
-        // Volume preservation constraint:
-        // we want to generate hydrostatic impulses with the form alpha*I
-        // and we want the liquid volume integration (see particleIntegrate) to yield 1 = (1+tr(alpha*I + D))*det(F) at the end of the timestep.
-        // where det(F) is stored as particle.liquidDensity.
-        // Rearranging, we get the below expression that drives the deformation displacement towards preserving the volume.
-        let alpha = 0.5*(1.0/particle.liquidDensity - tr(particle.deformationDisplacement) - 1.0);
-        particle.deformationDisplacement += g_simConstants.liquidRelaxation*alpha*Identity; 
-    }
-    else if(particle.material == MaterialElastic || particle.material == MaterialVisco)
-    {
-        let F =  (Identity + particle.deformationDisplacement) * particle.deformationGradient;
-
-        var svdResult = svd(F);
-        
-        // Closest matrix to F with det == 1
-        let df = det(F);
-        let cdf = clamp(abs(df), 0.1, 1000);
-        let Q = (1.0f/(sign(df)*sqrt(cdf)))*F;
-        // Interpolate between the two target shapes
-        let alpha = g_simConstants.elasticityRatio;
-        let tgt = alpha*(svdResult.U*svdResult.Vt) + (1.0-alpha)*Q;
-
-        let diff = (tgt*inverse(particle.deformationGradient) - Identity) - particle.deformationDisplacement;
-        particle.deformationDisplacement += g_simConstants.elasticRelaxation*diff;
-
-    }
-    else if(particle.material == MaterialSand)
-    {
-        let F =  (Identity + particle.deformationDisplacement) * particle.deformationGradient;
-
-        var svdResult = svd(F);
-
-        if(particle.logJp == 0)
-        {
-            svdResult.Sigma = clamp(svdResult.Sigma, vec2f(1, 1), vec2f(1000, 1000));
-        }
-
-        // Closest matrix to F with det == 1
-        let df = det(F);
-        let cdf = clamp(abs(df), 0.1, 1);
-        let Q = (1.0f/(sign(df)*sqrt(cdf)))*F;
-        // Interpolate between the two target shapes
-        let alpha = g_simConstants.elasticityRatio;
-        let tgt = alpha*(svdResult.U*mat2x2f(svdResult.Sigma.x, 0, 0, svdResult.Sigma.y)*svdResult.Vt) + (1.0-alpha)*Q;
-
-        let diff = (tgt*inverse(particle.deformationGradient) - Identity) - particle.deformationDisplacement;
-        particle.deformationDisplacement += g_simConstants.elasticRelaxation*diff;
-
-        let deviatoric = -1.0*(particle.deformationDisplacement + transpose(particle.deformationDisplacement));
-        particle.deformationDisplacement += g_simConstants.liquidViscosity*0.5*deviatoric;
-    }
-
-    g_particles[id.x] = particle;
-}
\ No newline at end of file
diff --git a/shaders/setIndirectArgs.wgsl b/shaders/setIndirectArgs.wgsl
index 65450f5..1df8053 100644
--- a/shaders/setIndirectArgs.wgsl
+++ b/shaders/setIndirectArgs.wgsl
@@ -8,11 +8,6 @@
 @group(0) @binding(1) var<storage, read_write> g_simIndirectArgs : array<u32>;
 @group(0) @binding(2) var<storage, read_write> g_renderIndirectArgs : array<u32>;
 
-fn divUp(threadCount : u32, divisor : u32) -> u32
-{
-    return (threadCount + divisor - 1) / divisor;
-}
-
 @compute @workgroup_size(1)
 fn csMain( @builtin(global_invocation_id) id: vec3<u32> )
 {
diff --git a/src/gpu.js b/src/gpu.js
index f74b735..369bc6d 100644
--- a/src/gpu.js
+++ b/src/gpu.js
@@ -29,7 +29,7 @@ export function getGpuContext() {return context;}
 
 export function divUp(threadCount, divisor)
 {
-    return Math.ceil(threadCount / divisor);
+    return Math.floor((threadCount + divisor - 1) / divisor);
 }
 
 export function createBindGroup(name, shaderName, resources)
@@ -47,29 +47,32 @@ export function createBindGroup(name, shaderName, resources)
     });
 }
 
-export function resetBuffers()
+export function construct4IntBuffer(name, usage, values)
 {
-    // Constructs a buffer containing 4 integers of the given values
-    function construct4IntBuffer(name, usage, values)
-    {
-        context[name] = context.device.createBuffer({
-            name: name, 
-            size: 16,
-            usage: usage
-        })
+    const buf = context.device.createBuffer({
+        label: name, // the descriptor member is "label"; an unknown "name" key is silently ignored
+        size: 16,
+        usage: usage
+    })
 
-        const valueArray = new Int32Array(4);
-        valueArray.set(values);
-        context.device.queue.writeBuffer(context[name], 0, valueArray);
-    }
+    const valueArray = new Int32Array(4);
+    valueArray.set(values);
+    context.device.queue.writeBuffer(buf, 0, valueArray);
+
+    return buf;
+}
+
+export function resetBuffers(gridSize)
+{
+    // Rebuilds the GPU buffers stored on the shared context; gridSize determines the size of the grid buffers.
 
     // Construct various small buffers used for indirect dispatch, counting and staging
-    construct4IntBuffer('particleCountBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, [0,0,0,0]);
-    construct4IntBuffer('particleCountStagingBuffer', GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, [0,0,0,0]);
-    construct4IntBuffer('particleRenderDispatchBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.INDIRECT, [6,0,0,0]);
-    construct4IntBuffer('particleSimDispatchBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST, [0,1,1,0]);
-    construct4IntBuffer('particleFreeCountBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, [0,0,0,0]);
-    construct4IntBuffer('particleFreeCountStagingBuffer', GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, [0,0,0,0]);
+    context.particleCountBuffer = construct4IntBuffer('particleCountBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC, [0,0,0,0]);
+    context.particleCountStagingBuffer = construct4IntBuffer('particleCountStagingBuffer', GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, [0,0,0,0]);
+    context.particleRenderDispatchBuffer = construct4IntBuffer('particleRenderDispatchBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.INDIRECT, [6,0,0,0]);
+    context.particleSimDispatchBuffer = construct4IntBuffer('particleSimDispatchBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST, [0,1,1,0]);
+    context.particleFreeCountBuffer = construct4IntBuffer('particleFreeCountBuffer', GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST, [0,0,0,0]);
+    context.particleFreeCountStagingBuffer = construct4IntBuffer('particleFreeCountStagingBuffer', GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST, [0,0,0,0]);
 
     // Construct particle buffer.
     // Must be kept in sync with MPMParticle in particle.inc.wgsl
@@ -86,6 +89,21 @@ export function resetBuffers()
         size: context.maxParticleCount * 4,
         usage: GPUBufferUsage.STORAGE
     });
+
+    context.gridBuffer = context.device.createBuffer({
+        label: "gridBuffer",
+        size: gridSize[0] * gridSize[1] * 4 * 4,
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
+    });
+
+    context.gridBufferTmp = context.device.createBuffer({
+        label: "gridBufferTmp",
+        size: gridSize[0] * gridSize[1] * 4 * 4,
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
+    });
+
+    console.log(context.gridBuffer);
+    console.log(context.gridBufferTmp)
 }
 
 export function beginFrame()
@@ -149,6 +167,11 @@ export function computeDispatch(shaderName, resources, groupCount)
     let entries = []
     for(let i = 0; i < resources.length; ++i)
     {
+        if(!resources[i])
+        {
+            throw `Compute Dispatch [${shaderName}]: Resource at index ${i} was falsy!`
+        }
+
         entries.push({binding: i, resource: {buffer: resources[i]}});
     }
 
@@ -225,9 +248,9 @@ export async function init(insertHandlers)
     }
 
     context.device = await context.adapter.requestDevice({
-        requiredFeatures: [
-            context.canTimeStamp ? ['timestamp-query']: []
-        ]
+        requiredFeatures: context.canTimeStamp ? [
+             'timestamp-query' // feature names are plain strings, not nested arrays
+        ] : undefined
     });
     
     await shader.init(context.device, insertHandlers);
diff --git a/src/main.js b/src/main.js
index 6a8923f..23c7b9d 100644
--- a/src/main.js
+++ b/src/main.js
@@ -151,12 +151,15 @@ function mainUpdate(timeStamp)
     
         const gpuContext = gpu.getGpuContext();
     
-        if(inputs.doReset)
-            gpu.resetBuffers();
+
 
         updateDom(gpuContext, inputs);
 
         gpu.beginFrame();
+
+        if(inputs.doReset)
+            gpu.resetBuffers(inputs.gridSize);
+        
         sim.update(gpuContext, inputs);    
         render.update(gpuContext, inputs);
         gpu.endFrame();
@@ -176,6 +179,8 @@ function updateInputs()
         || inputs.addElastic != g_prevInputs.addElastic
         || inputs.particlesPerCellAxis != g_prevInputs.particlesPerCellAxis
         || (inputs.solverType === sim.SimEnums.SolverTypePositionBasedFluids && inputs.solverType != g_prevInputs.solverType)
+        || inputs.gridSize[0] != g_prevInputs.gridSize[0]
+        || inputs.gridSize[1] != g_prevInputs.gridSize[1]
     ) {
         ui.windowResize();
         g_reset = true;
diff --git a/src/shader.js b/src/shader.js
index 5188b76..e8563fd 100644
--- a/src/shader.js
+++ b/src/shader.js
@@ -7,13 +7,15 @@
 export let Shaders = {
 
     // MPM  shaders
-    gridToParticle: 'gridToParticle',
-    particleToGrid: 'particleToGrid',
-    gridUpdate: 'gridUpdate',
-    gridZero: 'gridZero',
-    particleUpdatePBMPM: 'particleUpdatePBMPM',
     mpmParticleIntegrate: 'particleIntegrate',
 
+    g2p2g: 'g2p2g',
+
+    // Bukkitizing shaders
+    bukkitCount: 'bukkitCount',
+    bukkitAllocate: 'bukkitAllocate',
+    bukkitInsert: 'bukkitInsert',
+
     // Other sim shaders
     particleEmit: 'particleEmit',
     setIndirectArgs: 'setIndirectArgs',
diff --git a/src/sim.js b/src/sim.js
index 9aeb283..3d05d14 100644
--- a/src/sim.js
+++ b/src/sim.js
@@ -13,6 +13,8 @@ import {Shaders} from "./shader.js"
 export const DispatchSizes = {
     ParticleDispatchSize: 64,
     GridDispatchSize: 8,
+    BukkitSize: 6,
+    BukkitHaloSize: 1,
 };
 
 export const SimEnums = {
@@ -79,6 +81,12 @@ export function init(insertHandlers)
 
     simFactory.add('shapeCount', buffer_factory.u32);
     simFactory.add('simFrame', buffer_factory.u32);
+    simFactory.add('bukkitCount', buffer_factory.u32);
+    simFactory.add('bukkitCountX', buffer_factory.u32);
+    simFactory.add('bukkitCountY', buffer_factory.u32);
+    simFactory.add('iteration', buffer_factory.u32);
+    simFactory.add('iterationCount', buffer_factory.u32);
+
     simFactory.compile();
 
     const shapeFactory = new buffer_factory.BufferFactory('SimShape', buffer_factory.Storage);
@@ -126,6 +134,27 @@ function doEmission(gpuContext, simUniformBuffer, inputs, shapeBuffer)
     gpu.computeDispatch(Shaders.setIndirectArgs, [gpuContext.particleCountBuffer, gpuContext.particleSimDispatchBuffer, gpuContext.particleRenderDispatchBuffer], [1,1,1]);
 }
 
+function bukkitizeParticles(gpuContext, simUniformBuffer, inputs, bukkitSystem)
+{
+    gpuContext.encoder.clearBuffer(bukkitSystem.countBuffer);
+    gpuContext.encoder.clearBuffer(bukkitSystem.countBuffer2);
+    gpuContext.encoder.clearBuffer(bukkitSystem.threadData);
+    gpuContext.encoder.clearBuffer(bukkitSystem.particleData);
+    gpuContext.encoder.clearBuffer(bukkitSystem.particleAllocator);
+    gpuContext.encoder.copyBufferToBuffer(bukkitSystem.blankDispatch, 0, bukkitSystem.dispatch, 0, bukkitSystem.dispatch.size);
+
+    gpu.computeDispatch(Shaders.bukkitCount, [simUniformBuffer, gpuContext.particleCountBuffer, gpuContext.particleBuffer, bukkitSystem.countBuffer], gpuContext.particleSimDispatchBuffer);
+
+    let bukkitDispatchSize = [
+        gpu.divUp(bukkitSystem.countX, DispatchSizes.GridDispatchSize),
+        gpu.divUp(bukkitSystem.countY, DispatchSizes.GridDispatchSize),
+        1
+    ];
+
+    gpu.computeDispatch(Shaders.bukkitAllocate, [simUniformBuffer, bukkitSystem.countBuffer, bukkitSystem.dispatch, bukkitSystem.threadData, bukkitSystem.particleAllocator, bukkitSystem.indexStart], bukkitDispatchSize);
+    gpu.computeDispatch(Shaders.bukkitInsert, [simUniformBuffer, gpuContext.particleCountBuffer, bukkitSystem.countBuffer2, gpuContext.particleBuffer, bukkitSystem.particleData, bukkitSystem.indexStart], gpuContext.particleSimDispatchBuffer);
+}
+
 export function update(gpuContext, inputs)
 {
     if(inputs.doReset)
@@ -134,47 +163,107 @@ export function update(gpuContext, inputs)
     }
 
     const shapeBuffer = constructShapeBuffer(gpuContext, inputs);
+    const bukkitSystem = constructBukkitSystem(gpuContext, inputs);
 
     const threadGroupCountX = gpu.divUp(inputs.gridSize[0], DispatchSizes.GridDispatchSize);
     const threadGroupCountY = gpu.divUp(inputs.gridSize[1], DispatchSizes.GridDispatchSize);
     const gridThreadGroupCounts = [threadGroupCountX, threadGroupCountY, 1];
 
     const substepCount = time.doTimeRegulation(inputs);
-    let gridBuffer;
-    if(substepCount > 0)
+    for(let substepIdx = 0; substepIdx < substepCount; ++substepIdx)
     {
-        gridBuffer = constructGridBuffer(gpuContext, inputs);
- 
-        for(let substepIdx = 0; substepIdx < substepCount; ++substepIdx)
+        var simUniformBuffer = constructSimUniformBuffer(gpuContext, inputs, bukkitSystem, 0);
+        doEmission(gpuContext, simUniformBuffer, inputs, shapeBuffer);
+
+        bukkitizeParticles(gpuContext, simUniformBuffer, inputs, bukkitSystem);
+
+        for(let iterationIdx = 0; iterationIdx < inputs.iterationCount; ++iterationIdx)
         {
-            const simUniformBuffer = constructSimUniformBuffer(gpuContext, inputs);
-            doEmission(gpuContext, simUniformBuffer, inputs, shapeBuffer);
-
-            for(let iterationIdx = 0; iterationIdx < inputs.iterationCount; ++iterationIdx)
-            {
-                gpu.computeDispatch(Shaders.particleUpdatePBMPM, [simUniformBuffer, gpuContext.particleCountBuffer, gpuContext.particleBuffer], gpuContext.particleSimDispatchBuffer);
-                gpu.computeDispatch(Shaders.gridZero, [simUniformBuffer, gridBuffer],  gridThreadGroupCounts);
-                gpu.computeDispatch(Shaders.particleToGrid, [simUniformBuffer, gpuContext.particleCountBuffer, gpuContext.particleBuffer, gridBuffer], gpuContext.particleSimDispatchBuffer);
-                gpu.computeDispatch(Shaders.gridUpdate, [simUniformBuffer, gridBuffer, shapeBuffer], gridThreadGroupCounts);
-                gpu.computeDispatch(Shaders.gridToParticle, [simUniformBuffer, gpuContext.particleCountBuffer, gpuContext.particleBuffer, gridBuffer], gpuContext.particleSimDispatchBuffer);
-            }
-
-            gpu.computeDispatch(Shaders.mpmParticleIntegrate, [simUniformBuffer, gpuContext.particleCountBuffer, gpuContext.particleBuffer, shapeBuffer, gpuContext.particleFreeCountBuffer, gpuContext.particleFreeIndicesBuffer], gpuContext.particleSimDispatchBuffer);
-            g_substepIndex = (g_substepIndex + 1);
-        }  
-    }
+            simUniformBuffer = constructSimUniformBuffer(gpuContext, inputs, bukkitSystem, iterationIdx);
+
+            gpuContext.encoder.clearBuffer(gpuContext.gridBufferTmp);
+            gpu.computeDispatch(Shaders.g2p2g, [simUniformBuffer, gpuContext.particleBuffer, gpuContext.gridBuffer, gpuContext.gridBufferTmp, bukkitSystem.threadData, bukkitSystem.particleData, shapeBuffer], bukkitSystem.dispatch)
+
+            const tmp = gpuContext.gridBufferTmp;
+            gpuContext.gridBufferTmp = gpuContext.gridBuffer;
+            gpuContext.gridBuffer = tmp;
+        }
+
+        gpu.computeDispatch(Shaders.mpmParticleIntegrate, [simUniformBuffer, gpuContext.particleCountBuffer, gpuContext.particleBuffer, shapeBuffer, gpuContext.particleFreeCountBuffer, gpuContext.particleFreeIndicesBuffer], gpuContext.particleSimDispatchBuffer);
+        g_substepIndex = (g_substepIndex + 1);
+    }  
 }
 
-function constructGridBuffer(gpuContext, inputs) {
-    return gpuContext.device.createBuffer({
-        label: "grid",
-        // square of grid vertices, each of which has 4 components of 4-byte integers
-        size: inputs.gridSize[0] * inputs.gridSize[1] * 4 * 4,
+function constructBukkitSystem(gpuContext, inputs) {
+    // One bukkit covers BukkitSize grid cells along each axis; round up so a partial
+    // tile at the grid edge still gets a bukkit.
+    const bukkitCountX = Math.ceil(inputs.gridSize[0] / DispatchSizes.BukkitSize);
+    const bukkitCountY = Math.ceil(inputs.gridSize[1] / DispatchSizes.BukkitSize);
+
+    const bukkitParticleCountBuffer = gpuContext.device.createBuffer({
+        label: "bukkitParticleCountBuffer",
+        size: bukkitCountX * bukkitCountY * 4, // One integer count per bukkit
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
+    });
+
+    const bukkitParticleCountBuffer2 = gpuContext.device.createBuffer({
+        label: "bukkitParticleCountBuffer2",
+        size: bukkitCountX * bukkitCountY * 4, // One integer count per bukkit
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
+    });
+
+    const bukkitParticleThreadData = gpuContext.device.createBuffer({
+        label: "bukkitParticleThreadData",
+        size: 10 * bukkitCountX * bukkitCountY * 4 * 4, // four integers per allocated bukkit thread group
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
+    });
+
+    const bukkitParticleData = gpuContext.device.createBuffer({
+        label: 'bukkitParticleData',
+        size: gpuContext.maxParticleCount * 4, // One integer per particle
+        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST
+    })
+
+    const bukkitParticleDispatchBuffer = gpu.construct4IntBuffer(
+        'bukkitParticleDispatchBuffer', 
+        GPUBufferUsage.STORAGE | GPUBufferUsage.INDIRECT | GPUBufferUsage.COPY_DST,
+        [0,1,1,0]
+    );
+
+    const blankDispatchBuffer = gpu.construct4IntBuffer(
+        'blankDispatchBuffer', 
+        GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST,
+        [0,1,1,0]
+    );
+
+    const bukkitParticleAllocator = gpu.construct4IntBuffer(
+        'bukkitParticleAllocator',
+        GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
+        [0, 0, 0, 0]
+    );
+
+    const bukkitIndexStart = gpuContext.device.createBuffer({
+        label: 'bukkitIndexStart',
+        size: bukkitCountX * bukkitCountY * 4,
         usage: GPUBufferUsage.STORAGE
     });
+
+    return {
+        countX: bukkitCountX,
+        countY: bukkitCountY,
+        count: bukkitCountX*bukkitCountY,
+        countBuffer: bukkitParticleCountBuffer,
+        countBuffer2: bukkitParticleCountBuffer2,
+        particleData: bukkitParticleData,
+        threadData: bukkitParticleThreadData,
+        dispatch: bukkitParticleDispatchBuffer,
+        blankDispatch: blankDispatchBuffer,
+        particleAllocator: bukkitParticleAllocator,
+        indexStart: bukkitIndexStart
+    }
 }
 
-function constructSimUniformBuffer(gpuContext, inputs)
+function constructSimUniformBuffer(gpuContext, inputs, bukkitSystem, iteration)
 {
     let mouseActivation = 0;
     if(inputs.isMouseDown)
@@ -207,7 +296,12 @@ function constructSimUniformBuffer(gpuContext, inputs)
         rho_zero: Math.pow(inputs.particlesPerCellAxis, 2)*inputs.rhoZeroMultiplier,
         mouseRadius: inputs.mouseRadius/inputs.simResDivisor,
         shapeCount: inputs.shapes.size,
-        simFrame: g_substepIndex
+        simFrame: g_substepIndex,
+        bukkitCount: bukkitSystem.count,
+        bukkitCountX: bukkitSystem.countX,
+        bukkitCountY: bukkitSystem.countY,
+        iteration: iteration,
+        iterationCount: inputs.iterationCount,
     };
 
     return g_simFactory.constructUniformBuffer(gpuContext.device, [inputs, setDirectlyValues, gpuContext]);
diff --git a/src/time.js b/src/time.js
index 4e4a330..a1e1502 100644
--- a/src/time.js
+++ b/src/time.js
@@ -10,7 +10,7 @@ let g_timeState = {
     estimatedRenderTimeStepMs: 1000.0/60.0,
     estimatedThrottlingRatio: 1,
     framesAboveTarget: 0,
-    simFrameCountCap: 100,
+    simFrameCountCap: 20,
 }
 
 export function doTimeRegulation(inputs)
@@ -19,7 +19,7 @@ export function doTimeRegulation(inputs)
     {
         g_timeState.estimatedRenderTimeStepMs = 1000.0/60.0;
         g_timeState.prevTimeMs = inputs.timeStamp - g_timeState.estimatedRenderTimeStepMs;   
-        g_timeState.simFrameCountCap = 100;
+        g_timeState.simFrameCountCap = 20;
         g_timeState.framesAboveTarget = 0;
         g_timeState.estimatedThrottlingRatio = 1;
         g_timeState.timeAccumulatorMs = 0;
@@ -68,6 +68,7 @@ export function doTimeRegulation(inputs)
 
     if(substepCount > g_timeState.simFrameCountCap)
     {
+        console.log(`Warning: Requested substep count ${substepCount} will be capped to ${g_timeState.simFrameCountCap}`)
         substepCount = g_timeState.simFrameCountCap;
     }
 
diff --git a/src/ui.js b/src/ui.js
index 19088c9..943b330 100644
--- a/src/ui.js
+++ b/src/ui.js
@@ -84,7 +84,7 @@ const g_uiElements =
     {type: Button,name: 'pauseButton', desc: 'Pause (Spacebar)'},
     {type: RawHTML, value: `<br>`},
     
-    {type: Combo, name: 'simResDivisor', desc:'Render Pixels per Sim Grid Cell', values:[1,2,4,8,16], default:8},
+    {type: Combo, name: 'simResDivisor', desc:'Render Pixels per Sim Grid Cell', values:[1,2,4,8,16], default:4},
 
     {type: Range, name: 'particlesPerCellAxis', desc: 'Particles per cell axis', default: 2, min: 1, max: 8, step: 1},
     {type: Combo, name: 'simRate', desc:"Sim Update Rate (Hz)", values:[15, 30, 60, 120, 240, 480, 600, 1200, 2400], default:480},
@@ -97,9 +97,9 @@ const g_uiElements =
         {desc:'Push', value: SimEnums.MouseFunctionPush},
     ]},
 
-    {type: Range, name: 'iterationCount', desc: 'Iteration Count', default: 2, min: 1, max: 10, step: 1},
+    {type: Range, name: 'iterationCount', desc: 'Iteration Count', default: 5, min: 1, max: 100, step: 1},
     {type: Range, name: 'elasticityRatio', desc: 'Elasticity Ratio', default: 1, min: 0, max: 1, step: 0.01},
-    {type: Range, name: 'liquidRelaxation', desc: 'Liquid Relaxation', default: 2, min: 0, max: 10, step: 0.01},
+    {type: Range, name: 'liquidRelaxation', desc: 'Liquid Relaxation', default: 1.5, min: 0, max: 10, step: 0.01},
     {type: Range, name: 'elasticRelaxation', desc: 'Elastic Relaxation', default: 1.5, min: 0, max: 10, step: 0.01},
     {type: Range, name: 'frictionAngle', desc: 'Sand Friction Angle', default: 30, min: 0, max: 45, step: 0.1},
     {type: Range, name: 'plasticity', desc: 'Visco Plasticity', default: 0, min: 0, max: 1, step: 0.01},