From 78b0b28b289a4a7878340195bcbbc7f74f74247d Mon Sep 17 00:00:00 2001 From: Gregg Tavares Date: Tue, 14 Nov 2023 18:45:58 +0900 Subject: [PATCH] add timing article --- toc.hanson | 1 + webgpu/lessons/resources/euclidean-modulo.svg | 98 ++ webgpu/lessons/resources/modulo.svg | 101 ++ .../webgpu-compute-shaders-histogram.md | 8 +- webgpu/lessons/webgpu-timing.md | 873 ++++++++++++++++++ webgpu/resources/js/timing-helper.js | 29 +- ...ers-histogram-4ch-javascript-w-timing.html | 52 +- ...shaders-histogram-4ch-optimized-16x16.html | 4 +- ...-4ch-optimized-more-gpu-draw-w-timing.html | 4 +- ...histogram-4ch-optimized-more-gpu-draw.html | 2 +- ...histogram-4ch-optimized-more-w-timing.html | 61 +- ...-shaders-histogram-4ch-optimized-more.html | 2 +- ...ders-histogram-4ch-optimized-w-timing.html | 51 +- ...mpute-shaders-histogram-4ch-optimized.html | 2 +- ...togram-4ch-race-fixed-fragment-shader.html | 2 +- ...ogram-4ch-race-fixed-w-timing-w-inner.html | 2 +- ...ers-histogram-4ch-race-fixed-w-timing.html | 28 +- ...shaders-histogram-4ch-slow-draw-in-js.html | 2 +- ...ute-shaders-histogram-4ch-slow-w-draw.html | 2 +- ...pu-compute-shaders-histogram-4ch-slow.html | 2 +- ...-shaders-histogram-equalization-video.html | 4 +- ...haders-histogram-invocations-w-timing.html | 2 +- ...shaders-histogram-javascript-w-timing.html | 4 +- ...ute-shaders-histogram-optimized-16x16.html | 4 +- ...ers-histogram-optimized-more-w-timing.html | 92 +- ...pute-shaders-histogram-optimized-more.html | 2 +- ...-shaders-histogram-optimized-w-timing.html | 58 +- ...-histogram-race-fixed-fragment-shader.html | 2 +- ...histogram-race-fixed-w-timing-w-inner.html | 2 +- ...shaders-histogram-race-fixed-w-timing.html | 21 +- ...ute-shaders-histogram-slow-draw-in-js.html | 2 +- ...compute-shaders-histogram-slow-w-draw.html | 2 +- ...mpute-shaders-histogram-slow-w-timing.html | 31 +- ...webgpu-compute-shaders-histogram-slow.html | 2 +- ...pute-shaders-histogram-video-w-timing.html | 74 +- 
...ebgpu-compute-shaders-histogram-video.html | 2 +- webgpu/webgpu-timing-animated.html | 336 +++++++ webgpu/webgpu-timing-with-fps-js-time.html | 357 +++++++ ...ebgpu-timing-with-timestamp-w-average.html | 432 +++++++++ webgpu/webgpu-timing-with-timestamp.html | 406 ++++++++ webgpu/webgpu-timing-with-timing-helper.html | 496 ++++++++++ 41 files changed, 3379 insertions(+), 278 deletions(-) create mode 100644 webgpu/lessons/resources/euclidean-modulo.svg create mode 100644 webgpu/lessons/resources/modulo.svg create mode 100644 webgpu/lessons/webgpu-timing.md create mode 100644 webgpu/webgpu-timing-animated.html create mode 100644 webgpu/webgpu-timing-with-fps-js-time.html create mode 100644 webgpu/webgpu-timing-with-timestamp-w-average.html create mode 100644 webgpu/webgpu-timing-with-timestamp.html create mode 100644 webgpu/webgpu-timing-with-timing-helper.html diff --git a/toc.hanson b/toc.hanson index 901bf56c..d7df3def 100644 --- a/toc.hanson +++ b/toc.hanson @@ -17,6 +17,7 @@ 'webgpu-memory-layout.md', 'webgpu-copying-data.md', 'webgpu-limits-and-features.md', + 'webgpu-timing.md', 'webgpu-wgsl.md', 'webgpu-how-it-works.md', ], diff --git a/webgpu/lessons/resources/euclidean-modulo.svg b/webgpu/lessons/resources/euclidean-modulo.svg new file mode 100644 index 00000000..37c3dfb1 --- /dev/null +++ b/webgpu/lessons/resources/euclidean-modulo.svg @@ -0,0 +1,98 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + 2 + + + 4 + + + 6 + + + -2 + + + -4 + + + -6 + + + -8 + + + + 0 + + + -2 + + + 2 + + + diff --git a/webgpu/lessons/resources/modulo.svg b/webgpu/lessons/resources/modulo.svg new file mode 100644 index 00000000..4b537122 --- /dev/null +++ b/webgpu/lessons/resources/modulo.svg @@ -0,0 +1,101 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 + + + 2 + + + 4 + + + 6 + + + -2 + + + -4 + + + -6 + + + 
-8 + + + + + 0 + + + -2 + + + 2 + + + + + + + diff --git a/webgpu/lessons/webgpu-compute-shaders-histogram.md b/webgpu/lessons/webgpu-compute-shaders-histogram.md index 05702305..42810b93 100644 --- a/webgpu/lessons/webgpu-compute-shaders-histogram.md +++ b/webgpu/lessons/webgpu-compute-shaders-histogram.md @@ -488,7 +488,7 @@ We can now setup the commands to run the compute shader ```js const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); pass.setPipeline(pipeline); pass.setBindGroup(0, bindGroup); pass.dispatchWorkgroups(1); @@ -499,7 +499,7 @@ We need to copy the histogram buffer to the result buffer ```js const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); pass.setPipeline(pipeline); pass.setBindGroup(0, bindGroup); pass.dispatchWorkgroups(1); @@ -512,7 +512,7 @@ and then execute the commands ```js const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); pass.setPipeline(pipeline); pass.setBindGroup(0, bindGroup); pass.dispatchWorkgroups(1); @@ -1308,7 +1308,7 @@ dispatches until we've reduced things to 1 chunk - pass.setBindGroup(0, chunkSumBindGroup); - pass.dispatchWorkgroups(1); + // reduce the chunks -+ const pass = encoder.beginComputePass(encoder); ++ const pass = encoder.beginComputePass(); + pass.setPipeline(chunkSumPipeline); + let chunksLeft = numChunks; + sumBindGroups.forEach(bindGroup => { diff --git a/webgpu/lessons/webgpu-timing.md b/webgpu/lessons/webgpu-timing.md new file mode 100644 index 00000000..b63e3200 --- /dev/null +++ b/webgpu/lessons/webgpu-timing.md @@ -0,0 +1,873 @@ +Title: WebGPU Timing +Description: Timing operations in WebGPU +TOC: Timing + +
The `'timestamp-query'` feature used in this article +should be available in Chrome 121 or 122. If it's not available you can +turn it on by enabling on enable-webgpu-developer-features in about:flags. +
+ +Let's go over various things you might want +to time for performance. We'll time 3 things + +* The frame rate in frames per second (fps) +* The time spent in JavaScript per frame +* The time spent on the GPU per frame + +First, let's take our circle example from +[the article on vertex buffers](webgpu-vertex-buffers.html) +and lets and animate them so we have something that's easy +to see changes in how much time things take + +In that example we had 3 vertex buffers. One was for +the positions and brightness of a the vertices for a circle. +One was for things that are per instance but static +which included the circle's offset and color. And the last +one was for things that change reach time we render, in this +case it was the scale so we could keep the aspect ratio of +the circles correct so they stayed circles and no ellipses. + +We want to animate them moving so let's move the offset +to the same buffer as the scale. First we'll change the +render pipeline to move the offset to the same buffer +as the scale. 
+ +```js + const pipeline = device.createRenderPipeline({ + label: 'per vertex color', + layout: 'auto', + vertex: { + module, + entryPoint: 'vs', + buffers: [ + { + arrayStride: 2 * 4 + 4, // 2 floats, 4 bytes each + 4 bytes + attributes: [ + {shaderLocation: 0, offset: 0, format: 'float32x2'}, // position + {shaderLocation: 4, offset: 8, format: 'unorm8x4'}, // perVertexColor + ], + }, + { +- arrayStride: 4 + 2 * 4, // 4 bytes + 2 floats, 4 bytes each ++ arrayStride: 4, // 4 bytes + stepMode: 'instance', + attributes: [ + {shaderLocation: 1, offset: 0, format: 'unorm8x4'}, // color +- {shaderLocation: 2, offset: 4, format: 'float32x2'}, // offset + ], + }, + { +- arrayStride: 2 * 4, // 2 floats, 4 bytes each ++ arrayStride: 4 * 4, // 4 floats, 4 bytes each + stepMode: 'instance', + attributes: [ +- {shaderLocation: 3, offset: 0, format: 'float32x2'}, // scale ++ {shaderLocation: 2, offset: 0, format: 'float32x2'}, // offset + {shaderLocation: 3, offset: 0, format: 'float32x2'}, // scale ++ {shaderLocation: 3, offset: 8, format: 'float32x2'}, // scale + ], + }, + ], + }, + fragment: { + module, + entryPoint: 'fs', + targets: [{ format: presentationFormat }], + }, + }); +``` + +Then we'll change the part that sets up the vertex buffers +to move the offsets together with the scale. 
+ +```js + // create 2 vertex buffers + const staticUnitSize = +- 4 + // color is 4 bytes +- 2 * 4; // offset is 2 32bit floats (4bytes each) ++ 4; // color is 4 bytes + const changingUnitSize = +- 2 * 4; // scale is 2 32bit floats (4bytes each) ++ 2 * 4 + // offset is 2 32bit floats (4bytes each) ++ 2 * 4; // scale is 2 32bit floats (4bytes each) + const staticVertexBufferSize = staticUnitSize * kNumObjects; + const changingVertexBufferSize = changingUnitSize * kNumObjects; + + const staticVertexBuffer = device.createBuffer({ + label: 'static vertex for objects', + size: staticVertexBufferSize, + usage: GPUBufferUsage.VERTEX | GPUBufferUsage.COPY_DST, + }); + + const changingVertexBuffer = device.createBuffer({ + label: 'changing storage for objects', + size: changingVertexBufferSize, + usage: GPUBufferUsage.VERTEX | GPUBufferUsage.COPY_DST, + }); + + // offsets to the various uniform values in float32 indices + const kColorOffset = 0; ++ const kOffsetOffset = 1; ++ ++ const kScaleOffset = 0; ++ const kOffsetOffset = 0; ++ const kScaleOffset = 2; + + { + const staticVertexValuesU8 = new Uint8Array(staticVertexBufferSize); +- const staticVertexValuesF32 = new Float32Array(staticVertexValuesU8.buffer); + for (let i = 0; i < kNumObjects; ++i) { + const staticOffsetU8 = i * staticUnitSize; +- const staticOffsetF32 = staticOffsetU8 / 4; + + // These are only set once so set them now + staticVertexValuesU8.set( // set the color + [rand() * 255, rand() * 255, rand() * 255, 255], + staticOffsetU8 + kColorOffset); + +- staticVertexValuesF32.set( // set the offset +- [rand(-0.9, 0.9), rand(-0.9, 0.9)], +- staticOffsetF32 + kOffsetOffset); + + objectInfos.push({ + scale: rand(0.2, 0.5), ++ offset: [rand(-0.9, 0.9), rand(-0.9, 0.9)], ++ velocity: [rand(-0.1, 0.1), rand(-0.1, 0.1)], + }); + } +- device.queue.writeBuffer(staticVertexBuffer, 0, staticVertexValuesF32); ++ device.queue.writeBuffer(staticVertexBuffer, 0, staticVertexValuesU8); + } +``` + +At render time we can 
update the offsets of the circles based on their velocity and then upload those to the GPU. + +```js ++ const euclideanModulo = (x, a) => x - a * Math.floor(x / a); + ++ let then = 0; +- function render() { + function render(now) { ++ now *= 0.001; // convert to seconds ++ const deltaTime = now - then; ++ then = now; + +... + // set the scales for each object +- objectInfos.forEach(({scale}, ndx) => { +- const offset = ndx * (changingUnitSize / 4); +- vertexValues.set([scale / aspect, scale], offset + kScaleOffset); // set the scale ++ objectInfos.forEach(({scale, offset, veloctiy}, ndx) => { ++ // -1.5 to 1.5 ++ offset[0] = euclideanModulo(offset[0] + velocity[0] * deltaTime + 1.5, 3) - 1.5; ++ offset[1] = euclideanModulo(offset[1] + velocity[1] * deltaTime + 1.5, 3) - 1.5; + ++ const off = ndx * (changingUnitSize / 4); ++ vertexValues.set(offset, off + kOffsetOffset); + vertexValues.set([scale / aspect, scale], off + kScaleOffset); + }); + +... + ++ requestAnimationFrame(render); + } ++ requestAnimationFrame(render); + + const observer = new ResizeObserver(entries => { + for (const entry of entries) { + const canvas = entry.target; + const width = entry.contentBoxSize[0].inlineSize; + const height = entry.contentBoxSize[0].blockSize; + canvas.width = Math.max(1, Math.min(width, device.limits.maxTextureDimension2D)); + canvas.height = Math.max(1, Math.min(height, device.limits.maxTextureDimension2D)); +- // re-render +- render(); + } + }); + observer.observe(canvas); +``` + +We also switched to a rAF loop. + +The code above uses `euclideanModulo` to update the offset. +`euclideanModulo` returns the remainder of a division where +the remainder always is always positive. For example + +
+
+
+
+
+
modulo 2 of % vs euclideanModulo
+
+ +To put it another way, here's a graph of `%` vs `euclideanModulo` + +
+ +
euclideanModule(v, 2)
+
+
+ +
v % 2
+
+ +So, the code above takes the offset, which is in clip space, and adds 1.5. It then takes the euclideanModulo +by 3 which will give us a number between 0.0 and 3.0 +and then subtracts 1.5. This gives us numbers +that stay between -1.5 and +1.5 and lets them wrap +around to the other side. We use -1.5 to +1.5 so that +the circles don't wrap until they are off the screen. [^offscreen] + +[^offscreen]: This only works if the radius of the circle is less than 0.5 + +To give us something to adjust, lets make it so we can +set how many circles to draw. + +```js +- const kNumObjects = 100; ++ const kNumObjects = 10000; + + +... + + const settings = { + numObjects: 100, + }; + + const gui = new GUI(); + gui.add(settings, 'numObjects', 0, kNumObjects, 1); + + ... + + // set the scale and offset for each object +- objectInfos.forEach(({scale, offset, veloctiy}, ndx) => { ++ for (let ndx = 0; ndx < settings.numObjects; ++ndx) { ++ const {scale, offset, velocity} = objectInfos[ndx]; + + // -1.5 to 1.5 + offset[0] = euclideanModulo(offset[0] + velocity[0] * deltaTime + 1.5, 3) - 1.5; + offset[1] = euclideanModulo(offset[1] + velocity[1] * deltaTime + 1.5, 3) - 1.5; + + const off = ndx * (changingUnitSize / 4); + vertexValues.set(offset, off + kOffsetOffset); + vertexValues.set([scale / aspect, scale], off + kScaleOffset); +- }); ++ } + + // upload all offsets and scales at once +- device.queue.writeBuffer(changingVertexBuffer, 0, vertexValues); ++ device.queue.writeBuffer( + changingVertexBuffer, 0, + vertexValues, 0, settings.numObjects * changingUnitSize / 4); + +- pass.draw(numVertices, kNumObjects); ++ pass.draw(numVertices, settings.numObjects); +``` + +So now we should have something that animates +and we can adjust how much work is done by setting +the number of circles. 
+ +{{{example url="../webgpu-timing-animated.html"}}} + +To that, let's add frames per second (fps) and +time spent in JavaScript + +First we need a way to display this info so lets +add a div positions on top of the canvas. + +```html + + ++

+  
+```
+
+```css
+html, body {
+  margin: 0;       /* remove the default margin          */
+  height: 100%;    /* make the html,body fill the page   */
+}
+canvas {
+  display: block;  /* make the canvas act like a block   */
+  width: 100%;     /* make the canvas fill its container */
+  height: 100%;
+}
++#info {
++  position: absolute;
++  top: 0;
++  left: 0;
++  margin: 0;
++  padding: 0.5em;
++  background-color: rgba(0, 0, 0, 0.8);
++  color: white;
++}
+```
+
+We already have the data needed to display
+frames per second. It's the `deltaTime` we
+computed above.
+
+For JavaScript time we can record the time
+our requestAnimationFrame started and the
+time it ended 
+
+```js
+  let then = 0;
+  function render(now) {
+    now *= 0.001;  // convert to seconds
+    const deltaTime = now - then;
+    then = now;
+
++    const startTime = performance.now();
+
+    ...
+
++    const jsTime = performance.now() - startTime;
+
++    infoElem.textContent = `\
++fps: ${(1 / deltaTime).toFixed(1)}
++js: ${jsTime.toFixed(1)}ms
++`;
+
+    requestAnimationFrame(render);
+  }
+  requestAnimationFrame(render);
+```
+
+And that gives us our first 2 timing measurements.
+
+{{{example url="../webgpu-timing-with-fps-js-time.html"}}}
+
+
+WebGPU provides **optional** `'timestamp-query'` feature for checking how long an operation takes on the GPU.
+Since it's an optional feature we need to see if it
+exists and request it like we covered in [the article on limits and features](webgpu-limits-and-features.html).
+
+```js
+async function main() {
+  const adapter = await navigator.gpu?.requestAdapter();
+-  const device = await adapter?.requestDevice();
++  const canTimestamp = adapter.features.has('timestamp-query');
++  const device = await adapter?.requestDevice({
++    requiredFeatures: [
++      ...(canTimestamp ? ['timestamp-query'] : []),
++     ],
++  });
+  if (!device) {
+    fail('need a browser that supports WebGPU');
+    return;
+  }
+```
+
+Above, we set `canTimestamp` to true or false
+based on whether the adapter supports the `'timestamp-query'` feature. If it does we require
+that feature when we create our device.
+
+With the feature enabled we can ask WebGPU for *timestamps* for a render pass or compute
+pass. You do this by making a `GPUQuerySet` and adding it
+to your compute or render pass. A `GPUQuerySet` is effectively
+an array of query results. You tell WebGPU where in the array to
+record the time the pass started and where in the array to record
+when the pass ended. You can then copy those timestamps to
+a buffer and map the buffer to read the results.
+
+So, first we create a query set
+
+
+```js
+  const querySet = device.createQuerySet({
+     type: 'timestamp',
+     count: 2,
+  });
+```
+
+We need count to be at least 2 so we can write
+both a start and end timestamp.
+
+We need a buffer to convert the querySet info
+into data we can access
+
+```js
+  const resolveBuffer = device.createBuffer({
+    size: querySet.count * 8,
+    usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
+  });
+```
+
+Each element in a querySet takes 8 bytes.
+We need to give it a usage of `QUERY_RESOLVE`
+and, if we want to be able to read the results
+back in JavaScript we need the `COPY_SRC` usage
+so we can copy the result to a mappable buffer.
+
+Finally we create a mappable buffer to read the
+results
+
+```js
+  const resultBuffer = device.createBuffer({
+    size: resolveBuffer.size,
+    usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+  });
+```
+
+We need to wrap this code in a way that only
+creates these things if the feature exists
+
+```js
++  const { querySet, resolveBuffer, resultBuffer } = (() => {
++    if (!canTimestamp) {
++      return {};
++    }
+
+    const querySet = device.createQuerySet({
+       type: 'timestamp',
+       count: 2,
+    });
+    const resolveBuffer = device.createBuffer({
+      size: querySet.count * 8,
+      usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
+    });
+    const resultBuffer = device.createBuffer({
+      size: resolveBuffer.size,
+      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+    });
++    return {querySet, resolveBuffer, resultBuffer };
++  })();
+```
+
+In our render pass descriptor we tell it the
+querySet to use and the index of the elements
+in the querySet to write the start and ending
+timestamps.
+
+```js
+  const renderPassDescriptor = {
+    label: 'our basic canvas renderPass with timing',
+    colorAttachments: [
+      {
+        // view: <- to be filled out when we render
+        clearValue: [0.3, 0.3, 0.3, 1],
+        loadOp: 'clear',
+        storeOp: 'store',
+      },
+    ],
+    ...(canTimestamp && {
+      timestampWrites: {
+        querySet,
+        beginningOfPassWriteIndex: 0,
+        endOfPassWriteIndex: 1,
+      },
+    }),
+  };
+```
+
+Above, if the feature exists, we add a `timestampWrites` section to our
+renderPassDescriptor and pass in the querySet
+and tell it to write the start to element 0
+of the set and the end to element 1.
+
+After we end the pass we need to call `resolveQuerySet`. This takes the results
+of the query and puts them in a buffer. We pass it the querySet, the first index
+in the query set where to start resolving, the number of entries to resolve,
+a buffer to resolve to, and an offset in that buffer where to store the result.
+
+```
+    pass.end();
+
++    if (canTimestamp) {
++      encoder.resolveQuerySet(querySet, 0, querySet.count, resolveBuffer, 0);
++    }
+```
+
+We also want to copy the `resolveBuffer` to our 
+`resultBuffer` so we can map it and look at
+the results in JavaScript. We have an issue
+though. We can not copy to our `resultBuffer`
+while it's mapped. Fortunately buffers have
+a `mapState` property we can check. If it's
+set to `unmapped` then it's safe to copy to it.
+
+```js
+    if (canTimestamp) {
+      encoder.resolveQuerySet(querySet, 0, 2, resolveBuffer, 0);
++      if (resultBuffer.mapState === 'unmapped') {
++        encoder.copyBufferToBuffer(resolveBuffer, 0, resultBuffer, 0, resultBuffer.size);
++      }
+    }
+```
+
+After we've submitted the command buffer we
+can map the `resultBuffer`. Like above, we only want to map it if it's `'unmapped'`
+
+```js
++  let gpuTime = 0;
+
+   ...
+
+   function render(now) {
+
+    ...
+
+    const commandBuffer = encoder.finish();
+    device.queue.submit([commandBuffer]);
+
++    if (canTimestamp && resultBuffer.mapState === 'unmapped') {
++      resultBuffer.mapAsync(GPUMapMode.READ).then(() => {
++        const times = new BigInt64Array(resultBuffer.getMappedRange());
++        gpuTime = Number(times[1] - times[0]);
++        resultBuffer.unmap();
++      });
++    }
+```
+
+Query set results are nanoseconds stored in 64bit integers. To read them in JavaScript we can use a `BigInt64Array` typedarray view.
+Using `BigInt64Array` requires special care. When
+you read an element from a `BigInt64Array` the type
+is a `bigint`, not a `number`. If you convert them
+to numbers they'll lose precision because a number
+can only hold integers of 53 bits in size. So, first
+we subtract the 2 `bigint`s and then convert them to
+a number so we can use them as normal.
+
+In the code above, because we are only copying the results to `resultBuffer` sometimes, when it's
+not mapped, we'll only be reading the time on
+some frames. Most likely every other frame but
+there is no strict guarantee how long it will
+take until `mapAsync` resolves. Because of that,
+we update `gpuTime` which we can use at anytime
+to get the last recorded time.
+
+```js
+    infoElem.textContent = `\
+fps: ${(1 / deltaTime).toFixed(1)}
+js: ${jsTime.toFixed(1)}ms
++gpu: ${canTimestamp ? `${(gpuTime / 1000).toFixed(1)}µs` : 'N/A'}
+`;
+```
+
+And with that we get a GPU time from WebGPU
+
+{{{example url="../webgpu-timing-with-timestamp.html"}}}
+
+For me, the numbers change too often to see anything
+useful. One way to fix that is to compute a rolling
+average. Here's a class to help compute a rolling
+average.
+
+```js
+class RollingAverage {
+  #total = 0;
+  #samples = [];
+  #cursor = 0;
+  #numSamples;
+  constructor(numSamples = 30) {
+    this.#numSamples = numSamples;
+  }
+  addSample(v) {
+    this.#total += v - (this.#samples[this.#cursor] || 0);
+    this.#samples[this.#cursor] = v;
+    this.#cursor = (this.#cursor + 1) % this.#numSamples;
+  }
+  get() {
+    return this.#total / this.#samples.length;
+  }
+}
+```
+
+Which we can use like this
+
+```js
++const fpsAverage = new RollingAverage();
++const jsAverage = new RollingAverage();
++const gpuAverage = new RollingAverage();
+
+function render(now) {
+  ...
+
+    const commandBuffer = encoder.finish();
+    device.queue.submit([commandBuffer]);
+
+    if (canTimestamp && resultBuffer.mapState === 'unmapped') {
+      resultBuffer.mapAsync(GPUMapMode.READ).then(() => {
+        const times = new BigInt64Array(resultBuffer.getMappedRange());
+        gpuTime = Number(times[1] - times[0]);
++        gpuAverage.addSample(gpuTime / 1000);
+        resultBuffer.unmap();
+      });
+    }
+
+    const jsTime = performance.now() - startTime;
+
++    fpsAverage.addSample(1 / deltaTime);
++    jsAverage.addSample(jsTime);
+
+    infoElem.textContent = `\
+-fps: ${(1 / deltaTime).toFixed(1)}
+-js: ${jsTime.toFixed(1)}ms
+-gpu: ${canTimestamp ? `${(gpuTime / 1000).toFixed(1)}µs` : 'N/A'}
++fps: ${fpsAverage.get().toFixed(1)}
++js: ${jsAverage.get().toFixed(1)}ms
++gpu: ${canTimestamp ? `${gpuAverage.get().toFixed(1)}µs` : 'N/A'}
+`;
+
+    requestAnimationFrame(render);
+  }
+  requestAnimationFrame(render);
+}
+```
+
+And now the numbers are a little more stable.
+
+{{{example url="../webgpu-timing-with-timestamp-w-average.html"}}}
+
+##  Using a helper
+
+For me, I find all of this a little tedious
+and probably easy to get something wrong.
+One way to fix this would be to make a class
+to help us do the timing. For example,
+after we make a pass with a timestamp we
+have to resolve it and copy it to a mappable
+buffer[^use-on-gpu]
+
+[^use-on-gpu]: If somehow we were going to
+use the result only on the GPU itself we might
+not need to copy it.
+
+Here's one example of a helper that might
+help with some of these issues.
+
+```js
+function assert(cond, msg = '') {
+  if (!cond) {
+    throw new Error(msg);
+  }
+}
+
+class TimingHelper {
+  #canTimestamp;
+  #device;
+  #querySet;
+  #resolveBuffer;
+  #resultBuffer;
+  #resultBuffers = [];
+  // state can be 'free', 'need resolve', 'wait for result'
+  #state = 'free';
+
+  constructor(device) {
+    this.#device = device;
+    this.#canTimestamp = device.features.has('timestamp-query');
+    this.#querySet = device.createQuerySet({
+       type: 'timestamp',
+       count: 2,
+    });
+    this.#resolveBuffer = device.createBuffer({
+      size: this.#querySet.count * 8,
+      usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
+    });
+  }
+
+  #beginTimestampPass(encoder, fnName, descriptor) {
+    if (this.#canTimestamp) {
+      assert(this.#state === 'free', 'state not free');
+      this.#state = 'need resolve';
+
+      const pass = encoder[fnName]({
+        ...descriptor,
+        ...{
+          timestampWrites: {
+            querySet: this.#querySet,
+            beginningOfPassWriteIndex: 0,
+            endOfPassWriteIndex: 1,
+          },
+        },
+      });
+
+      const resolve = () => this.#resolveTiming(encoder);
+      pass.end = (function(origFn) {
+        return function() {
+          origFn.call(this);
+          resolve();
+        };
+      })(pass.end);
+
+      return pass;
+    } else {
+      return encoder[fnName](descriptor);
+    }
+  }
+
+  beginRenderPass(encoder, descriptor = {}) {
+    return this.#beginTimestampPass(encoder, 'beginRenderPass', descriptor);
+  }
+
+  beginComputePass(encoder, descriptor = {}) {
+    return this.#beginTimestampPass(encoder, 'beginComputePass', descriptor);
+  }
+
+  #resolveTiming(encoder) {
+    if (!this.#canTimestamp) {
+      return;
+    }
+    assert(this.#state === 'need resolve', 'must call addTimestampToPass');
+    this.#state = 'wait for result';
+
+    this.#resultBuffer = this.#resultBuffers.pop() || this.#device.createBuffer({
+      size: this.#resolveBuffer.size,
+      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+    });
+
+    encoder.resolveQuerySet(this.#querySet, 0, this.#querySet.count, this.#resolveBuffer, 0);
+    encoder.copyBufferToBuffer(this.#resolveBuffer, 0, this.#resultBuffer, 0, this.#resultBuffer.size);
+  }
+
+  async getResult() {
+    if (!this.#canTimestamp) {
+      return 0;
+    }
+    assert(this.#state === 'wait for result', 'must call resolveTiming');
+    this.#state = 'free';
+
+    const resultBuffer = this.#resultBuffer;
+    await resultBuffer.mapAsync(GPUMapMode.READ);
+    const times = new BigInt64Array(resultBuffer.getMappedRange());
+    const duration = Number(times[1] - times[0]);
+    resultBuffer.unmap();
+    this.#resultBuffers.push(resultBuffer);
+    return duration;
+  }
+}
+```
+
+The asserts are there to help us not use this class wrong. For example, if we end a pass but don't resolve it. Or if we resolve it but don't read the result.
+
+With this class, we can remove much of the code
+we had before. 
+
+```js
+async function main() {
+  const adapter = await navigator.gpu?.requestAdapter();
+  const canTimestamp = adapter.features.has('timestamp-query');
+  const device = await adapter?.requestDevice({
+    requiredFeatures: [
+      ...(canTimestamp ? ['timestamp-query'] : []),
+     ],
+  });
+  if (!device) {
+    fail('need a browser that supports WebGPU');
+    return;
+  }
+
++  const timingHelper = new TimingHelper(device);
+
+  ...
+
+-  const { querySet, resolveBuffer, resultBuffer } = (() => {
+-    if (!canTimestamp) {
+-      return {};
+-    }
+-
+-    const querySet = device.createQuerySet({
+-       type: 'timestamp',
+-       count: 2,
+-    });
+-    const resolveBuffer = device.createBuffer({
+-      size: querySet.count * 8,
+-      usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC,
+-    });
+-    const resultBuffer = device.createBuffer({
+-      size: resolveBuffer.size,
+-      usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ,
+-    });
+-    return {querySet, resolveBuffer, resultBuffer };
+-  })();
+
+  ...
+
+  function render(now) {
+
+    ...
+
+    pass.end();
+
+    -if (canTimestamp) {
+    -  encoder.resolveQuerySet(querySet, 0, querySet.count, resolveBuffer, 0);
+    -  if (resultBuffer.mapState === 'unmapped') {
+    -    encoder.copyBufferToBuffer(resolveBuffer, 0, resultBuffer, 0, resultBuffer.size);
+    -  }
+    -}
+
+    const commandBuffer = encoder.finish();
+    device.queue.submit([commandBuffer]);
+
++    timingHelper.getResult().then(gpuTime => {
++        gpuAverage.addSample(gpuTime / 1000);
++    });
+
+    ...
+```
+
+A few points about the `TimingHelper` class
+
+* You still have to manually request the `'timestamp-query'` feature when you
+create your device but the class handles whether it exists or not on the device.
+
+* When you call `timingHelper.beginRenderPass` or `timingHelper.beginComputePass`
+  it automatically adds the appropriate properties to the pass descriptor. It
+  also returns a pass encoder whose `end` function automatically resolves the
+  queries.
+
+* It's designed so if you use it wrong it will complain
+
+* It only handles 1 pass.
+
+  There are a bunch of tradeoffs here and without more exploration it's not
+  clear what would be best.
+
+  A class that handles multiple passes could be useful but, ideally, you'd use a
+  single `GPUQuerySet` that has enough space for all of your passes, rather than
+  1 `GPUQuerySet` per pass.
+
+  But, in order to do that you'd either need to have the user tell you up front
+  the maximum number of passes they'll use. Or, you need to make the code more
+  complicated where it starts with a small `GPUQuerySet` and deletes it and
+  makes a new larger one if you use more. But then, at least for 1 frame, you'd
+  need to handle having multiple `GPUQuerySet`s
+
+  All of that seemed overkill so for now it seemed best to make it handle one
+  pass and you can build on top of it until you decide it needs to be changed.
+
+In any case, I've used this class to time the various
+examples from [the articles on using compute shaders to compute image histograms](webgpu-compute-shaders-histogram.html). Here's
+a list of them. Since only the video example runs continuously it's probably
+the best example
+
+* 4 channel video histogram
+
+The rest just run once and print their result to the JavaScript console.
+
+* 4 channel workgroup per chunk histogram with reduce
+* 4 channel workgroup per pixel histogram
+* 4 channel JavaScript histogram
+* 1 channel workgroup per chunk histogram with reduce
+* 1 channel workgroup per chunk histogram with sum
+* 1 channel workgroup per pixel histogram 
+* 1 channel single core histogram
+* 1 channel JavaScript histogram
+
+
By default the `'timestamp-query'` time values +are quantized to 100µ seconds. In Chrome, if you enable ["enable-webgpu-developer-features"](chrome://flags/#enable-webgpu-developer-features) in [about:flags]((chrome://flags/#enable-webgpu-developer-features) the time values may not be quantized. This would +theoretically give you more accurate timings. That said, normally 100µ second quantized values should be enough for you to compare shaders or passes. +
diff --git a/webgpu/resources/js/timing-helper.js b/webgpu/resources/js/timing-helper.js index b4e0b734..306fb32f 100644 --- a/webgpu/resources/js/timing-helper.js +++ b/webgpu/resources/js/timing-helper.js @@ -5,25 +5,28 @@ function assert(cond, msg = '') { } export default class TimingHelper { - #device; #canTimestamp; + #device; + #querySet; + #resolveBuffer; #resultBuffer; #resultBuffers = []; + // state can be 'free', 'need resolve', 'wait for result' #state = 'free'; - #querySet; - #resolveBuffer; constructor(device) { this.#device = device; this.#canTimestamp = device.features.has('timestamp-query'); - this.#querySet = device.createQuerySet({ - type: 'timestamp', - count: 2, - }); - this.#resolveBuffer = device.createBuffer({ - size: 2 * 8, - usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC, - }); + if (this.#canTimestamp) { + this.#querySet = device.createQuerySet({ + type: 'timestamp', + count: 2, + }); + this.#resolveBuffer = device.createBuffer({ + size: this.#querySet.count * 8, + usage: GPUBufferUsage.QUERY_RESOLVE | GPUBufferUsage.COPY_SRC, + }); + } } #beginTimestampPass(encoder, fnName, descriptor) { @@ -72,11 +75,11 @@ export default class TimingHelper { this.#state = 'wait for result'; this.#resultBuffer = this.#resultBuffers.pop() || this.#device.createBuffer({ - size: 2 * 8, + size: this.#resolveBuffer.size, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ, }); - encoder.resolveQuerySet(this.#querySet, 0, 2, this.#resolveBuffer, 0); + encoder.resolveQuerySet(this.#querySet, 0, this.#querySet.count, this.#resolveBuffer, 0); encoder.copyBufferToBuffer(this.#resolveBuffer, 0, this.#resultBuffer, 0, this.#resultBuffer.size); } diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-javascript-w-timing.html b/webgpu/webgpu-compute-shaders-histogram-4ch-javascript-w-timing.html index 671bd766..eae69ab4 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-javascript-w-timing.html +++ 
b/webgpu/webgpu-compute-shaders-histogram-4ch-javascript-w-timing.html @@ -21,51 +21,53 @@ const imgBitmap = await loadImageBitmap('resources/images/pexels-chevanon-photography-1108099.jpg'); /* webgpufundamentals: url */ const imgData = getImageData(imgBitmap); -const start = performance.now(); + const start = performance.now(); const numBins = 256; const histogram = computeHistogram(numBins, imgData); -const elapsed = performance.now() - start; -console.log(Math.floor(elapsed * 1000 * 1000)); + const elapsed = performance.now() - start; + console.log(`duration: ${Math.floor(elapsed * 1000 * 1000)}ns`); showImageBitmap(imgBitmap); // draw the red, green, and blue channels - drawHistogram(histogram, [0, 1, 2]); + const numEntries = imgData.width * imgData.height; + drawHistogram(histogram, numEntries, [0, 1, 2]); // draw the luminosity channel - drawHistogram(histogram, [3]); + drawHistogram(histogram, numEntries, [3]); } function computeHistogram(numBins, imgData) { const {width, height, data} = imgData; - const histogram = new Array(numBins * 4).fill(0); + const bins = new Array(numBins * 4).fill(0); for (let y = 0; y < height; ++y) { for (let x = 0; x < width; ++x) { const offset = (y * width + x) * 4; + for (let ch = 0; ch < 4; ++ch) { const v = ch < 3 ? 
data[offset + ch] / 255 - : luminance(data, offset); + : srgbLuminance(data[offset + 0] / 255, + data[offset + 1] / 255, + data[offset + 2] / 255); const bin = Math.min(numBins - 1, v * numBins) | 0; - ++histogram[bin * 4 + ch]; + ++bins[bin * 4 + ch]; } } } - return histogram; + return bins; } -function drawHistogram(histogram, channels, height = 100) { +function drawHistogram(histogram, numEntries, channels, height = 100) { // find the highest value for each channel + const numBins = histogram.length / 4; const max = [0, 0, 0, 0]; - const total = [0, 0, 0, 0]; - histogram.forEach((v, i) => { - const ch = i % 4; + histogram.forEach((v, ndx) => { + const ch = ndx % 4; max[ch] = Math.max(max[ch], v); - total[ch] += v; - }); - console.log('total:', total); + }); + const scale = max.map(max => Math.max(1 / max, 0.2 * numBins / numEntries)); - const numBins = histogram.length / 4; const canvas = document.createElement('canvas'); canvas.width = numBins; canvas.height = height; @@ -84,8 +86,7 @@ for (let x = 0; x < numBins; ++x) { const offset = x * 4; for (const ch of channels) { - const scale = 0.2 * numBins / total[ch]; - const v = histogram[offset + ch] * scale * height; + const v = histogram[offset + ch] * scale[ch] * height; ctx.fillStyle = colors[ch]; ctx.fillRect(x, height - v, 1, v); } @@ -93,15 +94,10 @@ } // from: https://www.w3.org/WAI/GL/wiki/Relative_luminance -function luminance(data, offset) { - const r = data[offset + 0]; - const g = data[offset + 1]; - const b = data[offset + 2]; - - const l = r * 0.2126 / 255 + - g * 0.7152 / 255 + - b * 0.0722 / 255; - return l; +function srgbLuminance(r, g, b) { + return r * 0.2126 + + g * 0.7152 + + b * 0.0722; } function getImageData(imgBitmap) { diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-16x16.html b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-16x16.html index 66b5fed0..54e64579 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-16x16.html +++ 
b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-16x16.html @@ -197,10 +197,10 @@ device.queue.submit([commandBuffer]); timingHelper.getResult().then(duration => { - console.log('duration 1:', duration); + console.log(`duration 1: ${duration}ns`); }); timingHelper2.getResult().then(duration => { - console.log('duration 2:', duration); + console.log(`duration 2: ${duration}ns`); }); showImageBitmap(imgBitmap); diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw-w-timing.html b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw-w-timing.html index 0c5b742c..d72553dd 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw-w-timing.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw-w-timing.html @@ -310,7 +310,7 @@ const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); { - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); pass.setPipeline(histogramChunkPipeline); pass.setBindGroup(0, histogramBindGroup); pass.dispatchWorkgroups(chunksAcross, chunksDown); @@ -437,7 +437,7 @@ } timingHelper.getResult().then(duration => { - console.log('duration 1:', duration); + console.log(`duration 1: ${duration}ns`); }); } diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw.html b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw.html index a23e3c01..8fcd4375 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-gpu-draw.html @@ -287,7 +287,7 @@ } const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); // create a histogram for each chunk pass.setPipeline(histogramChunkPipeline); diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-w-timing.html 
b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-w-timing.html index 0611cd44..d0169fee 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-w-timing.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more-w-timing.html @@ -49,8 +49,8 @@ code: ` ${sharedConstants} const chunkSize = chunkWidth * chunkHeight; - var histogram: array, 4>, chunkSize>; - @group(0) @binding(0) var histogramChunks: array>; + var bins: array, 4>, chunkSize>; + @group(0) @binding(0) var chunks: array>; @group(0) @binding(1) var ourTexture: texture_2d; const kSRGBLuminanceFactors = vec3f(0.2126, 0.7152, 0.0722); @@ -72,8 +72,9 @@ var channels = textureLoad(ourTexture, position, 0); channels.w = srgbLuminance(channels.rgb); for (var ch = 0; ch < 4; ch++) { - let bin = min(u32(channels[ch] * numBins), lastBinIndex); - atomicAdd(&histogram[bin][ch], 1u); + let v = channels[ch]; + let bin = min(u32(v * numBins), lastBinIndex); + atomicAdd(&bins[bin][ch], 1u); } } @@ -83,11 +84,11 @@ let chunk = workgroup_id.y * chunksAcross + workgroup_id.x; let bin = local_invocation_id.y * chunkWidth + local_invocation_id.x; - histogramChunks[chunk][bin] = vec4u( - atomicLoad(&histogram[bin][0]), - atomicLoad(&histogram[bin][1]), - atomicLoad(&histogram[bin][2]), - atomicLoad(&histogram[bin][3]), + chunks[chunk][bin] = vec4u( + atomicLoad(&bins[bin][0]), + atomicLoad(&bins[bin][1]), + atomicLoad(&bins[bin][2]), + atomicLoad(&bins[bin][3]), ); } `, @@ -103,20 +104,19 @@ stride: u32, }; - @group(0) @binding(0) var histogramChunks: array>; + @group(0) @binding(0) var chunks: array>; @group(0) @binding(1) var uni: Uniforms; @compute @workgroup_size(chunkSize, 1, 1) fn cs( @builtin(local_invocation_id) local_invocation_id: vec3u, @builtin(workgroup_id) workgroup_id: vec3u, ) { - var sum = vec4u(0); let chunk0 = workgroup_id.x * uni.stride * 2; let chunk1 = chunk0 + uni.stride; - histogramChunks[chunk0][local_invocation_id.x] = - 
histogramChunks[chunk0][local_invocation_id.x] + - histogramChunks[chunk1][local_invocation_id.x]; + let sum = chunks[chunk0][local_invocation_id.x] + + chunks[chunk1][local_invocation_id.x]; + chunks[chunk0][local_invocation_id.x] = sum; } `, }); @@ -165,13 +165,9 @@ }); const sumBindGroups = []; - for (let i = 0; ; ++i) { + const numSteps = Math.ceil(Math.log2(numChunks)); + for (let i = 0; i < numSteps; ++i) { const stride = 2 ** i; - if (stride >= numChunks) { - break; - } - console.log(i, 'stride:', stride); - const uniformBuffer = device.createBuffer({ size: 4, usage: GPUBufferUsage.UNIFORM, @@ -219,10 +215,10 @@ device.queue.submit([commandBuffer]); timingHelper.getResult().then(duration => { - console.log('duration 1:', duration); + console.log(`duration histogram: ${duration}ns`); }); timingHelper2.getResult().then(duration => { - console.log('duration 2:', duration); + console.log(`duration reduce: ${duration}ns`); }); await resultBuffer.mapAsync(GPUMapMode.READ); @@ -231,27 +227,25 @@ showImageBitmap(imgBitmap); // draw the red, green, and blue channels - drawHistogram(histogram, [0, 1, 2]); + const numEntries = texture.width * texture.height; + drawHistogram(histogram, numEntries, [0, 1, 2]); // draw the luminosity channel - drawHistogram(histogram, [3]); + drawHistogram(histogram, numEntries, [3]); resultBuffer.unmap(); } - -function drawHistogram(histogram, channels, height = 100) { +function drawHistogram(histogram, numEntries, channels, height = 100) { // find the highest value for each channel + const numBins = histogram.length / 4; const max = [0, 0, 0, 0]; - const total = [0, 0, 0, 0]; - histogram.forEach((v, i) => { - const ch = i % 4; + histogram.forEach((v, ndx) => { + const ch = ndx % 4; max[ch] = Math.max(max[ch], v); - total[ch] += v; }); - console.log('total:', total); + const scale = max.map(max => Math.max(1 / max, 0.2 * numBins / numEntries)); - const numBins = histogram.length / 4; const canvas = document.createElement('canvas'); 
canvas.width = numBins; canvas.height = height; @@ -270,8 +264,7 @@ for (let x = 0; x < numBins; ++x) { const offset = x * 4; for (const ch of channels) { - const scale = 0.2 * numBins / total[ch]; - const v = histogram[offset + ch] * scale * height; + const v = histogram[offset + ch] * scale[ch] * height; ctx.fillStyle = colors[ch]; ctx.fillRect(x, height - v, 1, v); } diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more.html b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more.html index c17127af..678145c9 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-more.html @@ -178,7 +178,7 @@ } const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); // create a histogram for each chunk pass.setPipeline(histogramChunkPipeline); diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-w-timing.html b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-w-timing.html index bf120ec2..10aeec13 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-w-timing.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized-w-timing.html @@ -137,22 +137,16 @@ const imgBitmap = await loadImageBitmap('resources/images/pexels-chevanon-photography-1108099.jpg'); /* webgpufundamentals: url */ const texture = createTextureFromSource(device, imgBitmap); - const histogramBuffer = device.createBuffer({ - size: chunkSize * 4 * 4, // 256 entries * 4 (rgba) * 4 bytes per (u32) - usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC, - }); - const chunksAcross = Math.ceil(texture.width / k.chunkWidth); const chunksDown = Math.ceil(texture.height / k.chunkHeight); const numChunks = chunksAcross * chunksDown; - console.log(numChunks, chunksAcross, chunksDown); const chunksBuffer = device.createBuffer({ size: numChunks * chunkSize * 4 * 4, usage: 
GPUBufferUsage.STORAGE, }); const resultBuffer = device.createBuffer({ - size: histogramBuffer.size, + size: chunkSize * 4 * 4, usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ, }); @@ -168,40 +162,37 @@ layout: chunkSumPipeline.getBindGroupLayout(0), entries: [ { binding: 0, resource: { buffer: chunksBuffer }}, - { binding: 1, resource: { buffer: histogramBuffer }}, ], }); - const encoder = device.createCommandEncoder({ - label: 'histogram encoder', - }); + const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); { - const pass = timingHelper.beginComputePass(encoder); - pass.setPipeline(histogramChunkPipeline); - pass.setBindGroup(0, histogramBindGroup); - pass.dispatchWorkgroups(chunksAcross, chunksDown); - pass.end(); + const pass = timingHelper.beginComputePass(encoder); + pass.setPipeline(histogramChunkPipeline); + pass.setBindGroup(0, histogramBindGroup); + pass.dispatchWorkgroups(chunksAcross, chunksDown); + pass.end(); } { - const pass = timingHelper2.beginComputePass(encoder); - pass.setPipeline(chunkSumPipeline); - pass.setBindGroup(0, chunkSumBindGroup); - pass.dispatchWorkgroups(1); - pass.end(); + const pass = timingHelper2.beginComputePass(encoder); + pass.setPipeline(chunkSumPipeline); + pass.setBindGroup(0, chunkSumBindGroup); + pass.dispatchWorkgroups(1); + pass.end(); } - encoder.copyBufferToBuffer(histogramBuffer, 0, resultBuffer, 0, resultBuffer.size); + encoder.copyBufferToBuffer(chunksBuffer, 0, resultBuffer, 0, resultBuffer.size); const commandBuffer = encoder.finish(); device.queue.submit([commandBuffer]); timingHelper.getResult().then(duration => { - console.log('duration 1:', duration); + console.log(`duration histogram: ${duration}ns`); }); timingHelper2.getResult().then(duration => { - console.log('duration 2:', duration); + console.log(`duration sum: ${duration}ns`); }); await resultBuffer.mapAsync(GPUMapMode.READ); @@ -215,17 +206,7 @@ // draw the luminosity channel drawHistogram(histogram, [3]); - const 
to3 = v => v.toString().padStart(3); - - const sum = [0, 0, 0, 0]; - for (let i = 0; i < chunkSize; ++i) { - const off = i * 4; - console.log(to3(i), to3(histogram[off]), to3(histogram[off + 1]), to3(histogram[off + 2]), to3(histogram[off + 3])); - for (let j = 0; j < 4; ++j) { - sum[j] += histogram[off + j]; - } - } - console.log('sum:', sum); + resultBuffer.unmap(); } function drawHistogram(histogram, channels, height = 100) { diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized.html b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized.html index a79ea21a..88da140c 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-optimized.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-optimized.html @@ -162,7 +162,7 @@ }); const encoder = device.createCommandEncoder({ label: 'histogram encoder' }); - const pass = encoder.beginComputePass(encoder); + const pass = encoder.beginComputePass(); // create a histogram for each area pass.setPipeline(histogramChunkPipeline); diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-fragment-shader.html b/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-fragment-shader.html index 9e54607d..96ad6898 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-fragment-shader.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-fragment-shader.html @@ -140,7 +140,7 @@ device.queue.submit([commandBuffer]); timingHelper.getResult().then(duration => { - console.log('duration:', duration); + console.log(`duration: ${duration}ns`); }); await resultBuffer.mapAsync(GPUMapMode.READ); diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing-w-inner.html b/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing-w-inner.html index 93c40f27..ca52a467 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing-w-inner.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing-w-inner.html @@ -107,7 +107,7 @@ 
device.queue.submit([commandBuffer]); timingHelper.getResult().then(duration => { - console.log('duration:', duration); + console.log(`duration: ${duration}ns`); }); await resultBuffer.mapAsync(GPUMapMode.READ); diff --git a/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing.html b/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing.html index 627fc5f1..2370175a 100644 --- a/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing.html +++ b/webgpu/webgpu-compute-shaders-histogram-4ch-race-fixed-w-timing.html @@ -1,7 +1,7 @@ - WebGPU Compute Shaders - Histogram + WebGPU Compute Shaders - Histogram, race fixed, draw with JavaScript +
+
+    
▶️
+ diff --git a/webgpu/webgpu-timing-with-fps-js-time.html b/webgpu/webgpu-timing-with-fps-js-time.html new file mode 100644 index 00000000..db543d49 --- /dev/null +++ b/webgpu/webgpu-timing-with-fps-js-time.html @@ -0,0 +1,357 @@ + + + + WebGPU Timing - Step 2 - FPS/JS Time + + + + +

+  
+  
+
diff --git a/webgpu/webgpu-timing-with-timestamp-w-average.html b/webgpu/webgpu-timing-with-timestamp-w-average.html
new file mode 100644
index 00000000..9abae0a8
--- /dev/null
+++ b/webgpu/webgpu-timing-with-timestamp-w-average.html
@@ -0,0 +1,432 @@
+
+
+  
+    WebGPU Timing - Step 1 - Animated
+    
+  
+  
+    
+    

+  
+  
+
diff --git a/webgpu/webgpu-timing-with-timestamp.html b/webgpu/webgpu-timing-with-timestamp.html
new file mode 100644
index 00000000..27e03f0a
--- /dev/null
+++ b/webgpu/webgpu-timing-with-timestamp.html
@@ -0,0 +1,406 @@
+
+
+  
+    WebGPU Timing - Step 1 - Animated
+    
+  
+  
+    
+    

+  
+  
+
diff --git a/webgpu/webgpu-timing-with-timing-helper.html b/webgpu/webgpu-timing-with-timing-helper.html
new file mode 100644
index 00000000..b0ef11fb
--- /dev/null
+++ b/webgpu/webgpu-timing-with-timing-helper.html
@@ -0,0 +1,496 @@
+
+
+  
+    WebGPU Timing - Step 1 - Animated
+    
+  
+  
+    
+    

+  
+  
+