diff --git a/content/apple/docs/metal-argument-buffers-and-residency/DOC.md b/content/apple/docs/metal-argument-buffers-and-residency/DOC.md new file mode 100644 index 00000000..59e0c7a4 --- /dev/null +++ b/content/apple/docs/metal-argument-buffers-and-residency/DOC.md @@ -0,0 +1,80 @@ +--- +name: metal-argument-buffers-and-residency +description: "Apple Metal argument buffer patterns: encoding resource tables, residency requirements, and useResource or useHeap rules for compute workloads." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,argument-buffers,residency,useresource,useheap,resource-binding,indirect-access,compute,mtlargumentencoder" +--- + +# Metal Argument Buffers And Residency + +Use this page when a Metal compute kernel needs many resources, indirect resource access, or lower CPU-side binding overhead. + +## What Argument Buffers Solve + +Argument buffers let you encode resource references into a buffer-backed table instead of rebinding many individual buffers or textures for every dispatch. + +This is useful when: + +- one kernel reads many buffers or textures +- the set of resources changes per dispatch +- the resource table is reused across many dispatches +- GPU-driven or indirection-heavy workflows need resource tables + +## Core Host-Side Pattern + +The usual structure is: + +1. create an `MTLArgumentEncoder` +2. allocate a backing `MTLBuffer` +3. encode resource references into that buffer +4. bind the argument buffer to the kernel +5. make indirect resources resident before dispatch + +The last step is the part people miss most often. + +## Residency Rules Matter + +If a kernel reaches resources through an argument buffer, those resources must be resident for the duration of the compute pass. 
+ +In practice this means: + +- call `useResource(_:usage:)` for resources reached indirectly through an argument buffer +- call `useHeap(_:)` when residency is managed through a heap +- do this before the encoded dispatch that consumes those resources + +If you bind a resource directly to a kernel argument, you do not need the extra residency call for that direct binding path. + +## Good Usage Pattern + +- keep the argument buffer layout stable across many dispatches +- separate "table rebuild" work from "per-dispatch scalar parameter" work +- prefer argument buffers when the binding count is the CPU bottleneck +- keep residency calls explicit and near the dispatch site + +## Common Failure Modes + +- a resource is encoded into the argument buffer but never made resident +- the argument buffer is updated but a stale resource table is still reused +- CPU-side code mutates resident resources during the compute pass +- argument buffers are introduced for tiny fixed-bind workloads that did not need them + +## Review Checklist + +- Does the kernel really access resources indirectly? +- Are all indirectly referenced buffers or textures marked resident? +- Is the argument buffer rebuilt only when the table actually changes? +- Is the performance goal CPU submission overhead rather than kernel ALU time? 
+ +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Improving CPU performance by using argument buffers: https://developer.apple.com/documentation/metal/improving-cpu-performance-by-using-argument-buffers +- `useResource(_:usage:)`: https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/useresource%28_%3Ausage%3A%29 +- Metal shader converter binding model and argument buffer notes: https://developer.apple.com/metal/shader-converter/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-broadcast-kernel-patterns/DOC.md b/content/apple/docs/metal-broadcast-kernel-patterns/DOC.md new file mode 100644 index 00000000..2940513c --- /dev/null +++ b/content/apple/docs/metal-broadcast-kernel-patterns/DOC.md @@ -0,0 +1,56 @@ +--- +name: metal-broadcast-kernel-patterns +description: "Apple Metal broadcast kernel patterns: scalar or vector expansion, shape alignment, masked edges, and correctness checks for broadcast-heavy compute code." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,broadcast,shape-alignment,elementwise,tensor-shapes,masked-edges,compute" +--- + +# Metal Broadcast Kernel Patterns + +Use this page when a Metal kernel combines tensors of different logical shapes using broadcast semantics. + +## Why Broadcast Bugs Are Common + +Broadcast kernels look simple because the math is usually elementwise. + +The real complexity is shape alignment: + +- which dimensions are expanded +- which dimensions are equal +- whether one operand is scalar, row-wise, column-wise, or channel-wise + +If the shape contract is vague, the kernel may appear correct on square or fully dense cases while failing on realistic shapes. 
+ +## Safe Baseline Pattern + +- align shapes explicitly in host code before launch +- pass logical sizes needed for each broadcasted dimension +- write one straightforward reference kernel +- test cases where only one dimension is broadcast, then several + +## What To Verify + +- broadcasted dimensions reuse the intended source index +- non-broadcasted dimensions advance normally +- output shape is derived from the broadcast rule, not copied from one operand blindly +- masked tails or rounded dispatches cannot write past the real output shape + +## Common Failure Modes + +- one dimension is broadcast on the host side but advanced in the kernel +- scalar and vector broadcast paths behave differently +- the kernel passes tests where all dimensions match, hiding broken broadcast logic +- the output allocation follows one input shape instead of the broadcasted result shape + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Calculating threadgroup and grid sizes: https://developer.apple.com/documentation/metal/calculating-threadgroup-and-grid-sizes +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-buffer-layout-and-alignment/DOC.md b/content/apple/docs/metal-buffer-layout-and-alignment/DOC.md new file mode 100644 index 00000000..1db35d45 --- /dev/null +++ b/content/apple/docs/metal-buffer-layout-and-alignment/DOC.md @@ -0,0 +1,76 @@ +--- +name: metal-buffer-layout-and-alignment +description: "Apple Metal buffer layout and alignment: resource sizing, texture-from-buffer alignment, and host/kernel layout discipline." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,buffer,alignment,layout,mtlbuffer,minimumtexturebufferalignment,minimumlineartexturealignment,bytesperrow,heap-size-align" +--- + +# Metal Buffer Layout And Alignment + +Use this page when host-side data layout, buffer sizing, or buffer-backed texture creation is part of the kernel path. + +## Why This Matters + +Many Metal failures that look like "bad math" are actually layout problems: + +- host structs and shader expectations do not match +- offsets are aligned incorrectly +- buffer-backed textures use invalid row pitch or offset values +- heap sizing and alignment are estimated incorrectly + +## Layout Discipline + +- define host and kernel-visible struct layouts explicitly +- keep element size, stride, and offset calculations centralized +- treat texture-from-buffer paths as alignment-sensitive, not as generic byte blobs + +## Alignment APIs Apple Exposes + +Apple documents alignment helpers on `MTLDevice`, including: + +- `minimumTextureBufferAlignment(for:)` +- `minimumLinearTextureAlignment(for:)` +- `heapBufferSizeAndAlign(length:options:)` +- `heapTextureSizeAndAlign(descriptor:)` + +Use these APIs instead of guessing alignment from prior hardware experience. + +## Buffer-Backed Texture Rules + +When creating textures from buffers, values such as: + +- buffer offset +- bytes per row +- pixel format alignment + +must satisfy the device's documented alignment constraints. + +Apple's documentation explicitly ties alignment values to texture creation parameters. + +## Common Failure Modes + +- Buffer length is correct in bytes, but element stride is wrong. +- Struct fields are logically correct but host/kernel padding expectations differ. +- Texture buffer offset is not aligned to the device minimum. +- `bytesPerRow` is computed from logical width only and ignores required alignment. 
+- Heap size estimates ignore size-and-align APIs and under-allocate. + +## Safe Practice + +1. Compute all offsets from element-size and alignment helpers. +2. Reuse one layout definition across host and shader-facing code. +3. Validate resource creation parameters before debugging kernel math. + +## Official Source Links (Fact Check) + +- `minimumTextureBufferAlignment(for:)`: https://developer.apple.com/documentation/metal/mtldevice/minimumtexturebufferalignment%28for%3A%29 +- `minimumLinearTextureAlignment(for:)`: https://developer.apple.com/documentation/metal/mtldevice/minimumlineartexturealignment%28for%3A%29 +- `heapBufferSizeAndAlign(length:options:)`: https://developer.apple.com/documentation/metal/mtldevice/heapbuffersizeandalign%28length%3Aoptions%3A%29 +- `bufferBytesPerRow`: https://developer.apple.com/documentation/metal/mtltexture/bufferbytesperrow + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-command-buffer-reuse-and-batching/DOC.md b/content/apple/docs/metal-command-buffer-reuse-and-batching/DOC.md new file mode 100644 index 00000000..94445a7e --- /dev/null +++ b/content/apple/docs/metal-command-buffer-reuse-and-batching/DOC.md @@ -0,0 +1,73 @@ +--- +name: metal-command-buffer-reuse-and-batching +description: "Apple Metal command buffer reuse and batching guidance: transient versus persistent objects, submission frequency, and indirect command buffer tradeoffs." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,command-buffer,batching,submission,indirect-command-buffer,persistent-objects,command-queue,reuse,icb" +--- + +# Metal Command Buffer Reuse And Batching + +Use this page when CPU submission overhead starts to matter as much as the Metal kernel itself. + +## First Principle + +Apple distinguishes between transient and persistent objects. 
+ +Persistent objects should be created early and reused: + +- `MTLDevice` +- `MTLCommandQueue` +- buffers +- textures +- pipeline states + +Command buffers themselves are transient single-use objects. + +That means: + +- reuse pipeline and resource objects +- do not try to reuse committed command buffers +- reduce submission overhead by batching work intelligently + +## Batching Guidance + +Apple's best-practices material emphasizes submitting as few command buffers as practical without starving the GPU. + +This usually means: + +- group related work into fewer submissions +- avoid over-fragmenting compute work into many tiny command buffers +- profile CPU/GPU overlap before changing submission policy + +## When Indirect Command Buffers Matter + +Apple documents indirect command buffers (ICBs) as a way to reduce CPU overhead for repeated command patterns. + +Use them when: + +- command structure is repeated +- CPU encoding cost is significant +- the workload benefits from reusing encoded command structure + +Do not reach for ICBs before validating that ordinary command submission is actually the bottleneck. 
+ +## Common Failure Modes + +- wrapper code "optimizes" by caching the wrong objects and leaves command-buffer churn untouched +- work is split into too many tiny submissions +- submission count is reduced blindly and introduces dependency or latency issues +- ICB complexity is introduced before measuring CPU encoding cost + +## Official Source Links (Fact Check) + +- Command Organization and Execution Model: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Cmd-Submiss/Cmd-Submiss.html +- Metal Best Practices Guide: Persistent Objects: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/PersistentObjects.html +- Metal Best Practices Guide: Command Buffers: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/CommandBuffers.html +- Encoding indirect command buffers on the CPU: https://developer.apple.com/documentation/metal/encoding-indirect-command-buffers-on-the-cpu + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-compute-launch-patterns/DOC.md b/content/apple/docs/metal-compute-launch-patterns/DOC.md new file mode 100644 index 00000000..bee596a0 --- /dev/null +++ b/content/apple/docs/metal-compute-launch-patterns/DOC.md @@ -0,0 +1,93 @@ +--- +name: metal-compute-launch-patterns +description: "Apple Metal compute launch patterns: MTLDevice, pipeline creation, buffers, encoders, and dispatch sizing." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,compute,mtldevice,mtlcommandqueue,mtlcommandbuffer,mtlcomputecommandencoder,dispatchthreads,threadsperthreadgroup,metallib" +--- + +# Metal Compute Launch Patterns + +Use this page for the host-side structure of launching Metal compute work: device discovery, pipeline creation, resource binding, and dispatch. 
+
+## Core Host Objects
+
+The standard compute path is built around:
+
+- `MTLDevice`: the GPU device handle
+- `MTLCommandQueue`: source of command buffers
+- `MTLCommandBuffer`: unit of submitted GPU work
+- `MTLComputePipelineState`: compiled compute kernel state
+- `MTLComputeCommandEncoder`: binds resources and dispatches a compute kernel
+
+## Minimal Host Flow
+
+1. Get a `MTLDevice`.
+2. Create or load a library containing the kernel function.
+3. Build a `MTLComputePipelineState` from the kernel.
+4. Allocate buffers/textures.
+5. Create a command buffer and compute encoder.
+6. Bind resources with `setBuffer`, `setTexture`, and related APIs.
+7. Dispatch threads or threadgroups.
+8. End encoding, commit the command buffer, and wait only when the CPU truly needs completion.
+
+## Dispatch Sizing Rule
+
+There are two separate choices:
+
+- total work size: how many threads should run overall
+- threadgroup size: how many threads cooperate locally
+
+The host must choose both consistently with the kernel's indexing logic and any threadgroup-memory usage.
+
+## Practical Example Shape
+
+```cpp
+id<MTLCommandBuffer> cb = [queue commandBuffer];
+id<MTLComputeCommandEncoder> enc = [cb computeCommandEncoder];
+
+[enc setComputePipelineState:pso];
+[enc setBuffer:inBuffer offset:0 atIndex:0];
+[enc setBuffer:outBuffer offset:0 atIndex:1];
+
+MTLSize grid = MTLSizeMake(n, 1, 1);
+MTLSize tpg = MTLSizeMake(256, 1, 1);
+[enc dispatchThreads:grid threadsPerThreadgroup:tpg];
+
+[enc endEncoding];
+[cb commit];
+```
+
+## Buffer And Binding Discipline
+
+- buffer index values must match kernel `[[buffer(i)]]` attributes
+- host-side buffer sizes must cover the kernel's full access range
+- threadgroup memory declarations require matching dispatch assumptions
+- command-buffer completion only guarantees GPU completion for that buffer, not correctness of your indexing logic
+
+## Common Failure Modes
+
+- Binding order does not match kernel buffer indices.
+- Dispatch shape changes but kernel index math is left unchanged. +- Threadgroup size exceeds hardware or pipeline limits. +- CPU waits on every command buffer and destroys overlap unnecessarily. +- Library and pipeline creation are done inside a hot loop instead of being cached + +## Profiling And Debugging Guidance + +- Use Xcode Metal debugging tools to inspect resource bindings and dispatch layout. +- Use runtime validation to catch invalid API usage early. +- Treat incorrect output and poor throughput separately: one is often indexing or binding, the other is often sizing or memory behavior. + +## Official Source Links (Fact Check) + +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal Programming Guide (archive): https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Compute-Ctx/Compute-Ctx.html +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-convolution-and-stencil-patterns/DOC.md b/content/apple/docs/metal-convolution-and-stencil-patterns/DOC.md new file mode 100644 index 00000000..6055654e --- /dev/null +++ b/content/apple/docs/metal-convolution-and-stencil-patterns/DOC.md @@ -0,0 +1,79 @@ +--- +name: metal-convolution-and-stencil-patterns +description: "Apple Metal convolution and stencil kernel patterns: neighborhood loads, edge handling, temporary storage, and multi-pass image filtering structure." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,convolution,stencil,image-filter,neighborhood,texture,threadgroup,image-processing,gaussian-blur,compute" +--- + +# Metal Convolution And Stencil Patterns + +Use this page when implementing image filters, local neighborhood operators, or stencil-style kernels in Metal. + +## Typical Workloads + +This pattern covers: + +- box blur or Gaussian blur +- Sobel or edge filters +- local stencil updates on 2D grids +- any kernel where each output depends on nearby input elements + +## Core Design Questions + +Before tuning, decide: + +- is the source better represented as a texture or a linear buffer? +- how are border pixels handled? +- is a one-pass kernel enough, or is a separable multi-pass filter better? +- does threadgroup staging reduce enough repeated neighborhood reads to be worth the complexity? + +## Safe Baseline Pattern + +- start with a correct kernel that loads neighbors directly +- make border handling explicit +- validate against a CPU reference on tiny inputs +- only then consider threadgroup staging or separable decomposition + +This keeps correctness separate from optimization. + +## Edge Handling Rules + +Stencil kernels fail most often on boundaries. + +Choose and document one border policy: + +- clamp to edge +- zero pad +- mirror +- skip out-of-range contributions + +Do not leave edge behavior implicit between host and kernel code. 
+ +## Optimization Path + +For larger filters, common improvements are: + +- separable two-pass decomposition where mathematically valid +- threadgroup staging for reused neighborhoods +- batching several filter passes while minimizing temporary resource churn + +## Common Failure Modes + +- border handling is inconsistent with the reference implementation +- staging layout is correct for interior tiles but wrong on halo regions +- texture versus buffer choice is made late and forces wrapper rewrites +- a multi-pass filter is encoded without a clean intermediate-resource plan + +## Official Source Links (Fact Check) + +- Processing a texture in a compute function: https://developer.apple.com/documentation/metal/processing-a-texture-in-a-compute-function +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-convolution-tiling-playbook/DOC.md b/content/apple/docs/metal-convolution-tiling-playbook/DOC.md new file mode 100644 index 00000000..790af41c --- /dev/null +++ b/content/apple/docs/metal-convolution-tiling-playbook/DOC.md @@ -0,0 +1,69 @@ +--- +name: metal-convolution-tiling-playbook +description: "Apple Metal convolution tiling playbook: halo regions, threadgroup staging, separable decomposition, and tile-size tradeoffs for filter kernels." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,convolution,tiling,stencil,halo,threadgroup,image-filter,separable,compute" +--- + +# Metal Convolution Tiling Playbook + +Use this page when a convolution or stencil kernel has moved past the naive baseline and needs a more structured tiling plan. + +## What Tiling Adds + +Compared with a direct neighborhood-read kernel, tiling tries to reuse nearby input data across threads in the same threadgroup. + +For convolution-style kernels, this usually means: + +- loading a tile of source data into `threadgroup` memory +- including halo elements around the core output region +- synchronizing before computation +- writing only the core output region + +## The Halo Problem + +The hardest part is rarely the multiply-accumulate loop itself. + +The hard part is defining: + +- which threads load the halo +- how the halo is clipped at image borders +- which staged region corresponds to each output pixel + +If halo ownership is vague, the kernel is usually wrong at edges. + +## Optimization Order + +1. correct direct convolution baseline +2. explicit border policy +3. separable decomposition when mathematically valid +4. threadgroup tiling with halo handling +5. tile-size tuning only after correctness is stable + +## Common Failure Modes + +- the staged tile omits halo pixels needed by the filter radius +- border handling in the tiled kernel no longer matches the untiled reference +- tile size improves reuse but increases synchronization or threadgroup-memory cost too much +- dispatch geometry and output-core tile dimensions drift apart + +## Review Checklist + +- What filter radius determines halo width? +- Is the border policy identical between reference and optimized kernels? +- Does each output pixel read only staged data that was initialized? +- Is the separable path available, and is it actually mathematically valid for this filter? 
+ +## Official Source Links (Fact Check) + +- Processing a texture in a compute function: https://developer.apple.com/documentation/metal/processing-a-texture-in-a-compute-function +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-double-buffered-pipeline-patterns/DOC.md b/content/apple/docs/metal-double-buffered-pipeline-patterns/DOC.md new file mode 100644 index 00000000..f205b2c8 --- /dev/null +++ b/content/apple/docs/metal-double-buffered-pipeline-patterns/DOC.md @@ -0,0 +1,58 @@ +--- +name: metal-double-buffered-pipeline-patterns +description: "Apple Metal double-buffered pipeline patterns: alternating resources, producer-consumer overlap, and synchronization discipline for staged compute workflows." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,double-buffering,pipeline,producer-consumer,overlap,staging,synchronization,compute" +--- + +# Metal Double-Buffered Pipeline Patterns + +Use this page when a compute pipeline benefits from alternating between two resource sets so that one stage can produce while another later stage consumes the previous result. + +## What Double Buffering Solves + +Double buffering is useful when: + +- one stage writes data that a later stage reads +- the next iteration can start producing into a different buffer +- command scheduling can overlap work without read-write hazards + +The goal is controlled overlap, not just owning two copies of the same resource. + +## Core Structure + +The usual pattern is: + +1. allocate buffer set A and buffer set B +2. iteration `n` writes into one set +3. 
iteration `n + 1` writes into the other set +4. each consumer stage reads only the completed set for its iteration + +This requires explicit ownership of which stage may read or write each slot. + +## What To Verify + +- producer and consumer never touch the same slot at the same time +- slot selection logic matches the iteration index exactly +- the first and last iterations handle warm-up and drain correctly +- synchronization is at the real producer-consumer boundary, not inserted everywhere + +## Common Failure Modes + +- slot parity logic is off by one and consumers read partially written data +- double buffering is introduced but both stages still serialize on one command buffer boundary +- warm-up or flush iterations are omitted, so the first or last result is wrong +- resource reuse starts before the previous iteration completed + +## Official Source Links (Fact Check) + +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Synchronizing passes with a fence: https://developer.apple.com/documentation/metal/synchronizing-passes-with-a-fence +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-edge-tile-and-bounds-check-playbook/DOC.md b/content/apple/docs/metal-edge-tile-and-bounds-check-playbook/DOC.md new file mode 100644 index 00000000..88f53dfd --- /dev/null +++ b/content/apple/docs/metal-edge-tile-and-bounds-check-playbook/DOC.md @@ -0,0 +1,72 @@ +--- +name: metal-edge-tile-and-bounds-check-playbook +description: "Apple Metal edge-tile and bounds-check playbook: rounded dispatches, partial tiles, and safe handling of non-divisible problem sizes." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,bounds-check,edge-tile,partial-tile,dispatchthreads,thread_position_in_grid,rounded-dispatch,2d-kernel" +--- + +# Metal Edge-Tile And Bounds-Check Playbook + +Use this page when a Metal kernel is correct on neat shapes but fails on odd sizes, edge tiles, or rounded-up dispatches. + +## Why This Matters + +Metal dispatches are often rounded up so that threadgroup sizing remains simple. + +That means: + +- some launched threads correspond to no real output element +- edge tiles may only partially overlap valid input and output ranges +- a kernel must separate "dispatched thread exists" from "logical element exists" + +## Baseline Rule + +Every kernel that can be launched on rounded or tiled problem sizes should make its validity checks explicit. + +Typical checks include: + +- global index < logical extent +- row < height and col < width +- tile-local reads are masked if source coverage is partial + +## Edge-Tile Strategy + +For tiled kernels: + +1. compute global coordinates +2. test whether each source or destination coordinate is logically valid +3. stage only valid data or substitute neutral values where appropriate +4. guard final writes with output bounds checks + +This avoids out-of-range reads and writes without requiring perfectly divisible problem dimensions. 
+ +## Common Failure Modes + +- bounds checks guard writes but not staged reads +- elementwise kernels are protected, but tiled kernels still read invalid neighbor data +- 2D kernels use width bounds correctly but forget height bounds +- host dispatch changes after resize, but kernel checks still assume old extents + +## Good Validation Set + +Always test: + +- one exact multiple of the threadgroup shape +- one size smaller than the shape +- one off-by-one larger than the shape +- non-square 2D shapes + +If a kernel only works on clean multiples, the bounds logic is not finished. + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Processing a texture in a compute function: https://developer.apple.com/documentation/metal/processing-a-texture-in-a-compute-function + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-gather-scatter-and-indirect-access-patterns/DOC.md b/content/apple/docs/metal-gather-scatter-and-indirect-access-patterns/DOC.md new file mode 100644 index 00000000..c71a5ed2 --- /dev/null +++ b/content/apple/docs/metal-gather-scatter-and-indirect-access-patterns/DOC.md @@ -0,0 +1,70 @@ +--- +name: metal-gather-scatter-and-indirect-access-patterns +description: "Apple Metal gather, scatter, and indirect access patterns: index-buffer workflows, resource tables, irregular memory traffic, and correctness-first design." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,gather,scatter,indirect-access,indexing,argument-buffers,resource-table,irregular-memory,compute" +--- + +# Metal Gather, Scatter, And Indirect Access Patterns + +Use this page when a Metal kernel reads or writes through index arrays, indirection tables, or argument-buffer-backed resource tables. 
+ +## Why Indirect Access Is Different + +Indirect kernels usually lose the regular memory behavior that makes dense kernels easy to optimize. + +Typical examples include: + +- gather from a source buffer using an index list +- scatter updates to output positions selected at runtime +- table-driven resource access through argument buffers +- sparse or graph-like traversal patterns + +## Good Baseline Strategy + +- first prove that index validity and output ownership are correct +- separate gather-only cases from scatter-with-conflicts cases +- keep the resource table or indirection structure explicit in host code +- add performance tuning only after bounds and ownership rules are stable + +## Gather Versus Scatter + +Gather is usually easier: + +- each thread reads from an indirect source location +- writes go to a predictable output slot + +Scatter is riskier: + +- many threads may target the same destination +- update ordering may matter +- atomics or staged conflict-resolution may be required + +## Resource-Table Cases + +If the kernel reaches resources indirectly through argument buffers: + +- keep the table layout stable +- make all indirectly referenced resources resident +- verify that the host and shader agree on table indexing + +## Common Failure Modes + +- the index buffer contains out-of-range values and validation is too weak to catch them +- a scatter kernel assumes one-writer ownership when the workload has write conflicts +- the host rebuilds a resource table but the dispatch still uses stale bindings +- indirect access is tuned for speed before correctness on duplicate indices is understood + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Improving CPU performance by using argument buffers: https://developer.apple.com/documentation/metal/improving-cpu-performance-by-using-argument-buffers +- Encoding indirect command buffers on the GPU: 
https://developer.apple.com/documentation/metal/encoding-indirect-command-buffers-on-the-gpu +- Metal shader converter: https://developer.apple.com/metal/shader-converter/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-gpu-driven-work-generation-patterns/DOC.md b/content/apple/docs/metal-gpu-driven-work-generation-patterns/DOC.md new file mode 100644 index 00000000..abd09ecc --- /dev/null +++ b/content/apple/docs/metal-gpu-driven-work-generation-patterns/DOC.md @@ -0,0 +1,65 @@ +--- +name: metal-gpu-driven-work-generation-patterns +description: "Apple Metal GPU-driven work generation patterns: indirect command buffers, indirect arguments, residency, and when GPU-generated work reduces CPU round trips." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,gpu-driven,indirect-command-buffer,indirect-dispatch,argument-buffers,work-generation,submission,compute" +--- + +# Metal GPU-Driven Work Generation Patterns + +Use this page when the GPU can determine later work more efficiently than round-tripping those decisions through the CPU. + +## When This Pattern Helps + +GPU-driven work generation matters when: + +- one pass decides which later tasks are necessary +- the CPU does not need to inspect intermediate results +- repeated CPU readback and resubmission would stall the pipeline + +This pattern is common in visibility-driven rendering, indirect work compaction, and resource-table-driven pipelines. + +## What To Reach For + +The relevant Metal building blocks are: + +- indirect command buffers for reusable or GPU-generated command streams +- indirect argument patterns when draw or dispatch parameters are produced later +- argument buffers when resource tables are reached indirectly + +The goal is to eliminate unnecessary CPU-GPU round trips, not to make every workload indirect by default. + +## Good Usage Pattern + +1. 
keep the producer pass entirely on the GPU +2. write indirect arguments or command state into GPU-visible memory +3. make indirectly referenced resources resident +4. execute the generated work in a later pass without CPU inspection + +## Common Failure Modes + +- the CPU still waits on generated results, defeating the point of the indirect path +- indirect resources are not marked resident +- indirect execution is introduced for trivial workloads where direct encoding was cheaper +- the generated work format is hard to validate, so correctness debugging becomes opaque + +## Review Checklist + +- Does the CPU truly need to read the intermediate decision data? +- Is the generated work reused enough to justify the setup cost? +- Are residency and synchronization rules explicit? +- Can the indirect path be validated against a direct baseline? + +## Official Source Links (Fact Check) + +- Encoding indirect command buffers on the CPU: https://developer.apple.com/documentation/metal/encoding-indirect-command-buffers-on-the-cpu +- Encoding indirect command buffers on the GPU: https://developer.apple.com/documentation/metal/encoding-indirect-command-buffers-on-the-gpu +- Indirect buffers best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/IndirectBuffers.html +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-heaps-fences-and-events/DOC.md b/content/apple/docs/metal-heaps-fences-and-events/DOC.md new file mode 100644 index 00000000..f0cfc078 --- /dev/null +++ b/content/apple/docs/metal-heaps-fences-and-events/DOC.md @@ -0,0 +1,73 @@ +--- +name: metal-heaps-fences-and-events +description: "Apple Metal heap and synchronization patterns: temporary resource reuse, aliasing, fences, and events for multi-stage compute workflows." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,heap,heaps,fence,event,aliasing,synchronization,multistage,pipeline,temporary-resources,compute" +--- + +# Metal Heaps, Fences, And Events + +Use this page when a compute workflow spans multiple passes and needs temporary resources, explicit dependency tracking, or tighter memory reuse. + +## What Heaps Change + +Heaps give you more control over resource allocation and lifetime. + +They are useful when: + +- many temporary textures or buffers are created per frame +- intermediate resources can alias or be recycled +- a multi-stage pipeline has predictable lifetime boundaries + +## What Fences And Events Solve + +When multiple encoders or command buffers produce and consume shared resources, synchronization must be explicit. + +- use fences when ordering access within one command queue or closely related submission flow +- use events when the dependency needs broader or more explicit cross-encoder coordination + +The right goal is not "add synchronization everywhere." The goal is to add only the synchronization required to protect data hazards. + +## Good Multi-Stage Pattern + +1. allocate reusable temporary resources from a heap +2. group passes that produce and consume those resources +3. place synchronization at real producer/consumer boundaries +4. reuse or alias intermediate resources only after the previous use is complete + +This is especially useful for filter graphs, staged image processing, and pipelines with several intermediate textures. 
+ +## Common Failure Modes + +- heap-backed resources are reused before the earlier encoder finished with them +- fences or events are missing at a producer/consumer boundary +- temporary resources are allocated individually even though their lifetime is short and repetitive +- aliasing is introduced before correctness is stable, making data hazards hard to diagnose + +## Practical Guidance + +- start with a non-aliased but correct pipeline first +- introduce heaps to reduce allocation churn and memory footprint +- add fences or events only where reuse boundaries actually exist +- validate correctness before optimizing with aliasing + +## Review Checklist + +- Which pass writes each intermediate resource? +- Which later pass consumes it? +- Can that resource be reused only after an explicit completion boundary? +- Is heap usage reducing real allocation pressure, or just adding complexity? + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Metal Programming Guide archive: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Introduction/Introduction.html +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-histogram-and-binning-patterns/DOC.md b/content/apple/docs/metal-histogram-and-binning-patterns/DOC.md new file mode 100644 index 00000000..f68d6cca --- /dev/null +++ b/content/apple/docs/metal-histogram-and-binning-patterns/DOC.md @@ -0,0 +1,68 @@ +--- +name: metal-histogram-and-binning-patterns +description: "Apple Metal histogram and binning patterns: local accumulation, conflict management, multi-pass merge structure, and validation strategy for bucketed kernels." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,histogram,binning,bucket,counting,atomic,threadgroup,aggregation,compute" +--- + +# Metal Histogram And Binning Patterns + +Use this page when implementing histograms, bucket counting, or coarse binning passes in Metal. + +## Why Histograms Need Their Own Pattern + +Histogram kernels are usually dominated by update conflicts, not arithmetic cost. + +Many threads may try to update the same bin at nearly the same time, so the design question is not only "how many bins exist?" but also "where does each accumulation happen first?" + +## Safe Structure + +A practical approach is: + +1. each thread reads input elements +2. the threadgroup accumulates into local temporary bins when possible +3. local bins are merged into global output bins + +This reduces direct contention on the final destination. + +## Design Questions + +- how many bins exist? +- can the threadgroup hold a local partial histogram? +- are updates sparse, uniform, or highly skewed? +- does the final output need only counts, or also offsets for later scattering? + +## Validation Strategy + +- test highly skewed inputs where nearly all values map to one bin +- test uniform distributions +- test bin counts that are not powers of two +- compare against a CPU reference on small arrays + +Skewed inputs are the fastest way to expose contention-sensitive logic errors. 
+ +## Common Failure Modes + +- every input directly updates global bins and contention dominates +- local bin initialization is incomplete across the threadgroup +- merge logic drops bins on edge cases or partial workgroups +- the kernel writes counts correctly but later stages assume prefix offsets instead of raw counts + +## Performance Notes + +- threadgroup-local accumulation is often the first optimization step +- the best binning strategy depends on contention shape, not just total input size +- if the next stage needs stable positions, histogram alone is not enough; it usually pairs with a scan or prefix-sum stage + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-host-device-synchronization-checklist/DOC.md b/content/apple/docs/metal-host-device-synchronization-checklist/DOC.md new file mode 100644 index 00000000..6e5c539a --- /dev/null +++ b/content/apple/docs/metal-host-device-synchronization-checklist/DOC.md @@ -0,0 +1,55 @@ +--- +name: metal-host-device-synchronization-checklist +description: "Apple Metal host-device synchronization checklist: command completion, CPU visibility, resource lifetime, and debugging rules for wrapper-level correctness." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,host-device-synchronization,completion,resource-lifetime,cpu-visibility,command-buffer,wrapper,compute" +--- + +# Metal Host-Device Synchronization Checklist + +Use this page when the kernel itself may be correct, but the wrapper reads results too early, overwrites resources too soon, or otherwise mishandles CPU-GPU coordination. 
+ +## First Principles + +The CPU and GPU do not execute in lockstep. + +The wrapper must answer: + +- when the GPU has finished writing a resource +- when the CPU is allowed to read it +- when a resource may be reused or destroyed +- whether multiple command buffers overlap on the same resource + +## Safe Wrapper Rules + +- treat command-buffer completion as the authoritative completion boundary +- keep resource lifetime longer than every in-flight use +- do not overwrite upload buffers or readback targets before the GPU is done with them +- make synchronization points explicit in code review and debugging + +## What To Verify + +- completion handlers or waits match the true resource dependency +- CPU reads only happen after GPU writes are complete +- reused buffers are not still in flight +- debug and benchmark modes use the same correctness-critical synchronization + +## Common Failure Modes + +- the CPU reads a buffer after encoding but before GPU completion +- a temporary resource is freed or reused while still referenced by an in-flight command buffer +- benchmark code removes synchronization that correctness silently depended on +- several command buffers touch the same resource and ownership is assumed rather than stated + +## Official Source Links (Fact Check) + +- Command organization and execution model: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Cmd-Submiss/Cmd-Submiss.html +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-host-wrapper-patterns/DOC.md b/content/apple/docs/metal-host-wrapper-patterns/DOC.md new file mode 100644 index 00000000..2a9213cc --- /dev/null +++ b/content/apple/docs/metal-host-wrapper-patterns/DOC.md @@ -0,0 +1,98 @@ +--- +name: metal-host-wrapper-patterns +description: "Apple Metal host wrapper 
patterns: library loading, pipeline caching, buffer binding, command-buffer lifecycle, and reusable launch structure."
+metadata:
+  languages: "cpp"
+  versions: "4.0"
+  revision: 1
+  updated-on: "2026-03-21"
+  source: official
+  tags: "apple,metal,host-wrapper,objective-c++,cpp,mtldevice,mtlcommandqueue,mtlcommandbuffer,mtlcomputecommandencoder,pipeline-cache"
+---
+
+# Metal Host Wrapper Patterns
+
+Use this page when you need the host-side structure around a Metal kernel, not just the kernel itself.
+
+## Wrapper Responsibilities
+
+A practical host wrapper usually owns:
+
+- device and queue initialization
+- library and function loading
+- compute pipeline creation and reuse
+- input/output buffer allocation
+- encoder setup and resource binding
+- dispatch geometry selection
+- command-buffer submission and optional synchronization
+
+This is the layer that decides whether a correct kernel is actually runnable.
+
+## Recommended Object Lifetime
+
+Create once and reuse:
+
+- `MTLDevice`
+- `MTLCommandQueue`
+- `MTLComputePipelineState`
+- long-lived reusable buffers when shapes are stable
+
+Create per launch:
+
+- command buffers
+- compute encoders
+- temporary shape-specific resources when required
+
+## Minimal Wrapper Shape
+
+```cpp
+class MetalKernelRunner {
+ public:
+  void init();
+  void run(const void* in0, const void* in1, void* out, size_t n);
+
+ private:
+  id<MTLDevice> device_ = nil;
+  id<MTLCommandQueue> queue_ = nil;
+  id<MTLComputePipelineState> pso_ = nil;
+};
+```
+
+The key engineering point is separation:
+
+- initialization path prepares durable GPU objects
+- run path binds data and dispatches work only
+
+## Binding Discipline
+
+- keep wrapper-side buffer indices aligned with kernel `[[buffer(i)]]`
+- centralize binding order instead of scattering `setBuffer` calls
+- keep threadgroup size logic next to dispatch logic, not buried in kernel-selection code
+
+## Synchronization Strategy
+
+- do not block on every command buffer unless the caller truly needs immediate CPU visibility
+- 
expose synchronous and asynchronous wrapper modes only if the caller actually needs both +- document when output memory is safe to read on the host + +## Common Failure Modes + +- pipeline creation is accidentally repeated inside the steady-state run path +- wrapper binds buffers in a different order than the kernel expects +- shape-dependent dispatch logic is duplicated in multiple call sites +- command buffer completion is used as a substitute for correct resource and indexing logic + +## Good Wrapper Design + +- one wrapper owns one kernel family or one coherent fused op +- shape and dispatch calculations are explicit +- pipeline and resource reuse are intentional +- correctness checks happen before performance tuning + +## Official Source Links (Fact Check) + +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal Programming Guide (archive): https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Compute-Ctx/Compute-Ctx.html + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-image-and-2d-kernel-patterns/DOC.md b/content/apple/docs/metal-image-and-2d-kernel-patterns/DOC.md new file mode 100644 index 00000000..baa15775 --- /dev/null +++ b/content/apple/docs/metal-image-and-2d-kernel-patterns/DOC.md @@ -0,0 +1,70 @@ +--- +name: metal-image-and-2d-kernel-patterns +description: "Apple Metal image and 2D kernel patterns: texture-based indexing, 2D dispatch layout, read/write texture rules, and bounds handling." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,image,texture,2d-kernel,texture2d,thread-position-in-grid,read-write-texture,bytesperrow,compute" +--- + +# Metal Image And 2D Kernel Patterns + +Use this page when the kernel operates on 2D textures, image grids, or other width-height data layouts. + +## 2D Dispatch Model + +For texture and image kernels, use a 2D mapping between dispatched threads and output coordinates. + +Typical pattern: + +- `thread_position_in_grid` is interpreted as a 2D coordinate +- the kernel reads from one or more textures or buffers +- the kernel writes to an output texture or buffer-backed layout + +## Why This Is Different From 1D Kernels + +2D kernels usually combine: + +- texture coordinate rules +- row/column bounds logic +- format and layout constraints +- potentially different read and write access modes + +That makes them more sensitive to layout mistakes than simple 1D elementwise kernels. + +## Read/Write Texture Rule + +Apple documents explicit constraints for read-write textures in Metal. + +For example: + +- read-write textures use `access::read_write` +- they have usage restrictions compared with sampled textures +- not every texture type supports every access pattern + +Treat read-write texture kernels as a separate class from sampled-image code. 
+ +## Practical Checklist + +- dispatch grid matches output width and height +- bounds checks cover partial edge dispatches +- texture access mode matches the kernel's actual reads and writes +- host-side texture layout and row pitch are validated before kernel debugging + +## Common Failure Modes + +- 1D indexing logic is reused in a 2D dispatch +- bytes-per-row or texture layout assumptions are wrong on the host side +- read-write texture mode is chosen but later code assumes sampled-texture behavior +- output dimensions and dispatch dimensions drift apart after a resize path + +## Official Source Links (Fact Check) + +- Processing a texture in a compute function: https://developer.apple.com/documentation/metal/processing-a-texture-in-a-compute-function +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal Programming Guide (archive): https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/WhatsNewiniOS10tvOS10andOSX1012/WhatsNewiniOS10tvOS10andOSX1012.html + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-kernel-basics/DOC.md b/content/apple/docs/metal-kernel-basics/DOC.md new file mode 100644 index 00000000..e429b444 --- /dev/null +++ b/content/apple/docs/metal-kernel-basics/DOC.md @@ -0,0 +1,95 @@ +--- +name: metal-kernel-basics +description: "Apple Metal kernel basics: Metal Shading Language entry points, grid/threadgroup indexing, and compute-kernel structure." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,msl,metal-shading-language,compute,kernel,threadgroup,grid,simdgroup,thread-position-in-grid" +--- + +# Metal Kernel Basics + +Use this page when you need the Metal Shading Language view of compute kernels: entry-point syntax, built-in indices, and the mapping between grid work and threadgroup work. 
+
+## What A Compute Kernel Looks Like
+
+In Metal, a compute kernel is a `kernel` function written in Metal Shading Language (MSL).
+
+- `kernel` marks a compute entry point
+- parameters include buffers/textures plus built-in execution coordinates
+- execution is organized over a grid of threads, grouped into threadgroups
+
+## Minimal Kernel Shape
+
+```cpp
+#include <metal_stdlib>
+using namespace metal;
+
+kernel void add_arrays(device const float* a [[buffer(0)]],
+                       device const float* b [[buffer(1)]],
+                       device float* out [[buffer(2)]],
+                       uint gid [[thread_position_in_grid]]) {
+  out[gid] = a[gid] + b[gid];
+}
+```
+
+## Built-In Indices That Matter
+
+The official Metal documentation and MSL specification expose built-ins that map work to threads:
+
+- `thread_position_in_grid`: global linear or multidimensional position
+- `thread_position_in_threadgroup`: local index inside one threadgroup
+- `threadgroup_position_in_grid`: which threadgroup is executing
+- `threads_per_threadgroup`: threadgroup shape selected by the host
+
+Use global indices for final data addressing and local indices for threadgroup-shared staging patterns.
+
+## Mental Model
+
+For most data-parallel kernels:
+
+1. Host code defines a grid size.
+2. Host code chooses a threadgroup size.
+3. Metal launches one kernel instance per thread in the grid.
+4. Each thread derives the element or tile it owns from built-in indices.
+
+This is conceptually close to CUDA block/thread indexing, but the naming and API boundaries are different.
+
+## Practical Mapping Rules
+
+- Treat `thread_position_in_grid` as the default index for elementwise kernels.
+- Use `thread_position_in_threadgroup` only when coordinating through `threadgroup` memory.
+- Keep threadgroup dimensions explicit when moving from 1D kernels to 2D or 3D image/tile kernels.
+- Do not assume CUDA warp semantics; when subgroup behavior matters, use Metal SIMD-group concepts explicitly. 
+ +## Common Failure Modes + +- Using local threadgroup indices as if they were global output indices. +- Choosing a threadgroup shape that does not match the kernel's indexing math. +- Porting CUDA code and assuming block/thread identifiers map one-to-one to Metal names. +- Forgetting bounds checks when the dispatched grid is rounded up past tensor or image extent. + +## When To Escalate + +Stay at this layer for: + +- basic compute kernels +- elementwise operations +- tiled kernels with straightforward threadgroup staging + +Jump to more specific docs when you need: + +- threadgroup memory and barriers +- host-side pipeline creation and dispatch +- PyTorch `mps` integration or custom op wiring + +## Official Source Links (Fact Check) + +- Metal resources hub: https://developer.apple.com/metal/resources/ +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-kernel-debugging-checklist/DOC.md b/content/apple/docs/metal-kernel-debugging-checklist/DOC.md new file mode 100644 index 00000000..3c70c040 --- /dev/null +++ b/content/apple/docs/metal-kernel-debugging-checklist/DOC.md @@ -0,0 +1,82 @@ +--- +name: metal-kernel-debugging-checklist +description: "Apple Metal kernel debugging checklist: validation, binding mismatches, dispatch errors, and shader-level inspection." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,debugging,validation,shader-debugger,compute,resource-binding,dispatch,threadgroup,xcode" +--- + +# Metal Kernel Debugging Checklist + +Use this page when a Metal compute kernel builds but behaves incorrectly, produces zeros, or performs far below expectation. 
+ +## First Checks + +- confirm the expected kernel function is actually compiled into the loaded library +- confirm host buffer bindings match `[[buffer(i)]]` indices +- confirm dispatch shape matches kernel indexing assumptions +- confirm output buffers are large enough for the full access range + +These catch a large fraction of real integration bugs. + +## Validation Path + +Apple's runtime validation and Metal debugging tools should be enabled early in debugging. + +Use them to catch: + +- invalid API usage +- resource binding mistakes +- synchronization mistakes +- shader execution issues + +## Kernel-Side Checks + +- verify bounds checks for rounded-up dispatches +- verify threadgroup barriers are present where shared local memory is reused +- verify local and global indices are not mixed accidentally +- verify any subgroup-sensitive logic is tested independently from the threadgroup baseline + +## Host-Side Checks + +- verify `dispatchThreads` / `threadsPerThreadgroup` or threadgroup counts are computed correctly +- verify pipeline state corresponds to the intended kernel entry point +- verify command-buffer ordering matches read/write dependencies between kernels + +## Tooling Workflow + +1. Reproduce with validation enabled. +2. Inspect resource bindings in the Metal debugger. +3. Inspect shader variable values at failing points. +4. Check whether the bug is indexing, synchronization, or data-layout related. +5. Only then start performance debugging. + +## Common Failure Modes + +- Wrong buffer index with otherwise valid kernel code. +- Bounds checks omitted on rounded dispatch sizes. +- Threadgroup memory is reused without a barrier. +- Host code writes CPU data but forgets resource synchronization requirements on relevant macOS paths. +- A performance issue is misdiagnosed as a correctness issue, or vice versa. + +## Debugging Principle + +Separate these questions: + +- Is the kernel producing the right result? 
+- Is the launch path binding the right resources? +- Is the performance bottleneck in the shader, memory system, or host submission path? + +Do not try to solve all three at once. + +## Official Source Links (Fact Check) + +- Metal developer tools: https://developer.apple.com/metal/tools/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-kernel-fusion-tradeoff-checklist/DOC.md b/content/apple/docs/metal-kernel-fusion-tradeoff-checklist/DOC.md new file mode 100644 index 00000000..ecfc8eb1 --- /dev/null +++ b/content/apple/docs/metal-kernel-fusion-tradeoff-checklist/DOC.md @@ -0,0 +1,64 @@ +--- +name: metal-kernel-fusion-tradeoff-checklist +description: "Apple Metal kernel fusion tradeoff checklist: bandwidth savings, intermediate lifetime reduction, occupancy risks, and debugging costs when fusing stages." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,kernel-fusion,fusion,tradeoff,bandwidth,occupancy,intermediate-resources,performance,compute" +--- + +# Metal Kernel Fusion Tradeoff Checklist + +Use this page when deciding whether two or more Metal compute stages should remain separate or be fused into one kernel. + +## What Fusion Can Improve + +Fusion can help when it: + +- removes intermediate reads and writes +- reduces temporary resource lifetime +- avoids extra submission overhead +- keeps data in registers or threadgroup memory longer + +## What Fusion Can Hurt + +Fusion is not free. 
+ +It can also: + +- increase register pressure +- reduce occupancy or scheduling flexibility +- make debugging harder because intermediates disappear +- tie together stages that used to be independently measurable + +## Safe Decision Process + +- start from a validated unfused baseline +- measure whether bandwidth or submission overhead is the real bottleneck +- fuse only stages with a clear dataflow advantage +- keep one reproducible benchmark before and after the fusion + +## Review Checklist + +- does fusion remove a meaningful intermediate resource? +- will the fused kernel increase per-thread state too much? +- do the two stages share compatible dispatch geometry? +- is the performance problem actually bandwidth or CPU submission overhead? + +## Common Failure Modes + +- fusion is attempted before the per-stage baseline is validated +- bandwidth savings are assumed but register pressure becomes the new bottleneck +- stage-specific bugs become harder to isolate because intermediates vanish +- one stage's natural launch shape is a poor fit for the other stage + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Command buffers best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/CommandBuffers.html +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-library-and-pipeline-compilation/DOC.md b/content/apple/docs/metal-library-and-pipeline-compilation/DOC.md new file mode 100644 index 00000000..d63ef5e4 --- /dev/null +++ b/content/apple/docs/metal-library-and-pipeline-compilation/DOC.md @@ -0,0 +1,76 @@ +--- +name: metal-library-and-pipeline-compilation +description: "Apple Metal library and pipeline compilation: build-time metallib usage, runtime loading, and avoiding unnecessary pipeline compilation overhead." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,metallib,mtllibrary,pipeline-compilation,compute-pipeline,pso,build-time-compilation,runtime-loading,metal4compiler" +--- + +# Metal Library And Pipeline Compilation + +Use this page when the kernel exists, but you need a sane compilation and pipeline-creation strategy on the host side. + +## First Principle + +Apple's best-practices guidance is explicit: + +- compile Metal shader code at build time whenever possible +- load compiled libraries at runtime +- avoid repeated runtime compilation unless shader code is genuinely generated dynamically + +Compiling shader source at runtime is one of the most expensive stages in a Metal app lifecycle. + +## Recommended Compilation Strategy + +### Default Path + +- compile `.metal` sources during the app build +- load the resulting library once at initialization +- create compute pipeline states once and reuse them + +This is the right default for nearly all production compute wrappers. + +### Runtime Compilation + +Use runtime compilation only when: + +- the shader source is generated dynamically +- the function set cannot reasonably be compiled ahead of time +- your tooling or research workflow truly requires runtime generation + +Even then, isolate runtime compilation from the steady-state hot path. + +## Pipeline Creation Strategy + +- load or create the `MTLLibrary` +- resolve the kernel function by name +- create the `MTLComputePipelineState` +- cache it for future launches + +For modern APIs, Apple also exposes newer compilation flows such as Metal 4 compilation interfaces and binary/pipeline tooling. These are useful when your pipeline management needs become more advanced, but they do not change the main rule: avoid repeated compilation in hot paths. 
+ +## Common Failure Modes + +- source compilation is left inside per-inference or per-batch execution +- library loading is repeated across wrapper instances +- multiple pipelines for the same kernel are rebuilt because cache keys are not centralized +- runtime compilation is chosen for convenience even though the function set is static + +## Practical Review Checklist + +1. Are `.metal` files compiled at build time? +2. Is library loading done once during initialization? +3. Is pipeline creation cached by kernel/function identity? +4. Is any runtime compilation path isolated from steady-state execution? + +## Official Source Links (Fact Check) + +- Metal Best Practices Guide: Functions and Libraries: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/FunctionsandLibraries.html +- Metal shader converter: https://developer.apple.com/metal/shader-converter/ +- `MTL4Compiler`: https://developer.apple.com/documentation/metal/mtl4compiler + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-memory-and-threadgroup/DOC.md b/content/apple/docs/metal-memory-and-threadgroup/DOC.md new file mode 100644 index 00000000..16cfd467 --- /dev/null +++ b/content/apple/docs/metal-memory-and-threadgroup/DOC.md @@ -0,0 +1,95 @@ +--- +name: metal-memory-and-threadgroup +description: "Apple Metal memory and threadgroup basics: address spaces, threadgroup memory, synchronization, and local-cooperation rules." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,msl,threadgroup,threadgroup-memory,barrier,threadgroup-barrier,device-memory,constant-memory,simdgroup,synchronization" +--- + +# Metal Memory And Threadgroup Basics + +Use this page when a Metal kernel needs local cooperation: shared staging, barriers, or explicit memory-space reasoning. 
+ +## MSL Address Spaces + +Metal Shading Language distinguishes address spaces such as: + +- `device`: GPU-accessible buffer memory +- `constant`: read-only constant data +- `threadgroup`: memory shared by threads in one threadgroup +- `thread`: per-thread storage + +This separation matters for both correctness and performance. + +## Threadgroup Memory + +Use `threadgroup` memory when: + +- threads in one threadgroup reuse the same tile +- you need to reorder accesses before writing final outputs +- local reductions or stencil neighborhoods would otherwise reread device memory excessively + +Conceptually this is the closest Metal analogue to CUDA shared memory. + +## Synchronization Rule + +If one subset of threads writes `threadgroup` memory and another subset reads it later, insert an appropriate threadgroup barrier before the read phase. + +Without that barrier: + +- readers may observe incomplete writes +- results can depend on scheduling details +- bugs may disappear in small tests and reappear at scale + +## Practical Pattern + +```cpp +threadgroup float tile[256]; + +uint lid = tid_in_tg; +uint gid = tid_in_grid; + +tile[lid] = in[gid]; +threadgroup_barrier(mem_flags::mem_threadgroup); + +out[gid] = tile[lid] * 2.0f; +``` + +## What Not To Assume + +- Metal threadgroups are not CUDA thread blocks in naming only; validate every ported synchronization point. +- SIMD-group behavior is not a drop-in CUDA warp model. +- A barrier for threadgroup memory does not automatically substitute for all higher-level producer-consumer protocol design. + +## Common Failure Modes + +- Barrier omitted after threadgroup writes and before dependent reads. +- Kernel uses threadgroup-local indices but dispatch size no longer matches the threadgroup allocation. +- Threadgroup allocation is too large for the chosen dispatch geometry or resource budget. +- CUDA-style warp assumptions are reused without checking Metal SIMD-group semantics. 
+ +## When To Stay High-Level + +Stay in higher-level frameworks when: + +- you only need standard tensor operators +- the framework already fuses or schedules the operation adequately +- custom kernel ownership is not worth the debugging surface + +Drop to custom Metal kernels when: + +- memory movement dominates and local staging is necessary +- you need a custom fused operation +- framework coverage or performance is insufficient + +## Official Source Links (Fact Check) + +- Metal resources hub: https://developer.apple.com/metal/resources/ +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-memory-pressure-checklist/DOC.md b/content/apple/docs/metal-memory-pressure-checklist/DOC.md new file mode 100644 index 00000000..5e8c3aa0 --- /dev/null +++ b/content/apple/docs/metal-memory-pressure-checklist/DOC.md @@ -0,0 +1,67 @@ +--- +name: metal-memory-pressure-checklist +description: "Apple Metal memory pressure checklist: storage modes, heap strategy, transient allocation reuse, and debugger-driven memory-footprint review." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,memory-pressure,memory-footprint,storage-mode,heaps,resource-options,purgeable,allocation,compute" +--- + +# Metal Memory Pressure Checklist + +Use this page when a Metal compute path works functionally but consumes too much memory, thrashes transient allocations, or behaves poorly under tight budgets. + +## First Questions To Ask + +- are resources using the right storage mode? +- are temporary resources recreated instead of reused? +- should several transient resources live on a heap? +- is the app tracking memory footprint with Metal debugging tools, or guessing? 
+ +Memory pressure is usually a resource-lifecycle problem before it is a shader problem. + +## Storage Mode Review + +Review whether each buffer or texture should be: + +- CPU-visible and updated frequently +- GPU-private after upload +- truly temporary and a candidate for heap-backed reuse + +Using the wrong storage mode or usage flags can waste memory bandwidth and increase footprint. + +## Heap Review + +Heaps are useful when: + +- many transient resources have predictable lifetimes +- temporary resources do not all need to exist simultaneously +- allocation churn is becoming visible in profiling or debugging + +Do not introduce aliasing until the non-aliased pipeline is already correct and measurable. + +## Practical Checklist + +- reuse steady-state buffers and textures +- prefer GPU-private resources when the CPU never reads them +- group transient resources by lifetime and type +- inspect memory reports in Metal debugging tools instead of inferring from symptoms + +## Common Failure Modes + +- a hot path reallocates temporary buffers every iteration +- long-lived and short-lived resources are mixed into one unmanaged pool +- heap aliasing is introduced before synchronization boundaries are understood +- memory usage is judged only by process RSS instead of Metal resource inspection + +## Official Source Links (Fact Check) + +- Resource options best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/ResourceOptions.html +- Resource heaps: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/ResourceHeaps/ResourceHeaps.html +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git 
a/content/apple/docs/metal-multistage-tensor-pipeline-patterns/DOC.md b/content/apple/docs/metal-multistage-tensor-pipeline-patterns/DOC.md new file mode 100644 index 00000000..f4ac2b9c --- /dev/null +++ b/content/apple/docs/metal-multistage-tensor-pipeline-patterns/DOC.md @@ -0,0 +1,63 @@ +--- +name: metal-multistage-tensor-pipeline-patterns +description: "Apple Metal multi-stage tensor pipeline patterns: staging intermediate buffers, synchronization boundaries, and wrapper design for chained compute kernels." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,tensor,pipeline,multistage,intermediate-buffers,compute-pass,heaps,events,command-buffer" +--- + +# Metal Multi-Stage Tensor Pipeline Patterns + +Use this page when a workload is no longer one kernel, but a sequence such as pack, transform, reduce, post-process, or fuse-when-possible stages. + +## Typical Structure + +A multi-stage compute pipeline usually has: + +- one or more input preparation stages +- intermediate buffers or textures +- one or more main compute stages +- optional post-processing or reduction stages + +The engineering problem is deciding which stages belong in one command buffer, which intermediates can be reused, and where explicit synchronization boundaries are required. + +## Good Pipeline Discipline + +- start with a stage-by-stage pipeline that is easy to reason about +- name every intermediate resource by producer and consumer role +- keep producer/consumer boundaries explicit +- only introduce heap reuse, aliasing, or fusion after the unfused baseline is stable + +## Common Design Questions + +- should this intermediate be a texture or a linear buffer? +- can two neighboring stages share one command buffer? +- is this stage bandwidth-bound, making fusion worthwhile? +- does an intermediate resource live long enough to justify heap-backed reuse? 
+ +## Common Failure Modes + +- several stages are packed together before each stage is independently validated +- intermediate resources are reused too early +- command-buffer boundaries are added by convenience rather than by data dependency +- stage fusion is attempted before the per-stage baseline is measurable + +## Review Checklist + +- Which stage writes each intermediate? +- Which later stage consumes it? +- Is the resource type aligned with the access pattern? +- Would one reproducible benchmark expose whether the pipeline is CPU-bound or bandwidth-bound? + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Command organization and execution model: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Cmd-Submiss/Cmd-Submiss.html +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-numerical-drift-debugging-checklist/DOC.md b/content/apple/docs/metal-numerical-drift-debugging-checklist/DOC.md new file mode 100644 index 00000000..2a9fbda2 --- /dev/null +++ b/content/apple/docs/metal-numerical-drift-debugging-checklist/DOC.md @@ -0,0 +1,66 @@ +--- +name: metal-numerical-drift-debugging-checklist +description: "Apple Metal numerical drift debugging checklist: isolate precision changes, compare reference paths, inspect intermediates, and separate math drift from synchronization bugs." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,numerics,precision,half,float,debugging,drift,intermediate-values,shader-debugger,compute" +--- + +# Metal Numerical Drift Debugging Checklist + +Use this page when a Metal kernel runs and produces plausible values, but the results drift from a CPU or reference implementation. + +## First Separate Math Drift From Logic Bugs + +Do not assume every mismatch is a floating-point issue. + +Check first whether the error comes from: + +- incorrect indexing +- uninitialized threadgroup data +- missing synchronization +- different border or reduction semantics + +Only after those are ruled out should you treat the problem as pure numeric drift. + +## Practical Debugging Order + +1. compare against a small deterministic reference input +2. inspect intermediate tensors or buffers, not only final output +3. compare one precision change at a time +4. test whether accumulation order changed +5. use Xcode shader debugging tools to inspect suspicious intermediates + +## Typical Sources Of Drift + +- `half` versus `float` accumulation +- reordered reductions or fused stages +- transcendental function precision differences +- changed evaluation order in a tiled or vectorized kernel + +## Good Review Questions + +- Was accumulation precision reduced during optimization? +- Does the reference path use the same border and reduction semantics? +- Did stage fusion change evaluation order? +- Is the mismatch bounded and systematic, or does it indicate broken synchronization? 
+ +## Common Failure Modes + +- a synchronization bug is misdiagnosed as harmless floating-point noise +- an optimization changes accumulation order and the allowed tolerance is never revisited +- only final output is compared, so the first bad intermediate is never identified +- different precision modes are combined with data-layout changes, hiding the real source of drift + +## Official Source Links (Fact Check) + +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Metal developer tools: https://developer.apple.com/metal/tools/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal shader converter precision and debugging notes: https://developer.apple.com/metal/shader-converter/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-performance-tuning/DOC.md b/content/apple/docs/metal-performance-tuning/DOC.md new file mode 100644 index 00000000..0bfe1460 --- /dev/null +++ b/content/apple/docs/metal-performance-tuning/DOC.md @@ -0,0 +1,91 @@ +--- +name: metal-performance-tuning +description: "Apple Metal performance tuning: dispatch sizing, pipeline reuse, synchronization costs, and profiling-first optimization." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,performance,profiling,dispatchthreads,threadsperthreadgroup,command-buffer,pipeline-state,managed-resource,instruments" +--- + +# Metal Performance Tuning + +Use this page for practical optimization decisions on Metal compute workloads. + +## Start With The Correct Bottleneck + +For Metal kernels, performance problems usually come from one of these classes: + +- dispatch sizing is poor +- memory access is inefficient +- CPU submission overhead is too high +- synchronization is too frequent +- pipeline creation or resource setup happens in hot paths + +Measure first, then tune. 
+ +## High-Value Tuning Areas + +### Dispatch Shape + +- choose threadgroup sizes that match the kernel's locality pattern +- do not assume one fixed threadgroup size is optimal for every kernel +- keep bounds-check overhead small by dispatching close to true workload extent + +### Pipeline Reuse + +- create `MTLComputePipelineState` objects once and reuse them +- avoid recompiling libraries or pipelines inside steady-state execution loops + +### CPU/GPU Synchronization + +- do not wait on command buffers unless the CPU truly needs the result immediately +- minimize synchronization points, especially on resource-sharing paths + +### Managed Resources On macOS + +Apple documents that managed resources on macOS require explicit synchronization between CPU and GPU views. + +This matters especially on Intel Macs or external GPU workflows: + +- synchronization has real cost +- extra sync points can damage throughput +- resource mode assumptions should be reviewed when code moves across Mac hardware configurations + +## Tools That Matter + +Apple's Metal developer tools are the primary source of truth for optimization work: + +- Metal debugger in Xcode +- runtime validation +- performance counters +- Metal System Trace in Instruments + +These tools are better than guessing from output values alone. + +## Common Failure Modes + +- Dispatch size is copied from another kernel with different memory behavior. +- Command buffers are committed and waited on too frequently. +- Pipeline compilation is left in a hot code path. +- Resource synchronization is correct but far more frequent than necessary. +- Kernel tuning changes are made without inspecting counters or traces. + +## Tuning Order + +1. Make the kernel correct. +2. Cache pipeline and resource setup. +3. Profile dispatch and memory behavior. +4. Reduce synchronization and submission overhead. +5. Revisit threadgroup sizing and data layout. 
+ +## Official Source Links (Fact Check) + +- Metal developer tools: https://developer.apple.com/metal/tools/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Synchronizing a managed resource in macOS: https://developer.apple.com/documentation/metal/synchronizing-a-managed-resource-in-macos +- Metal overview: https://developer.apple.com/metal/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-persistent-objects-and-submission-overhead/DOC.md b/content/apple/docs/metal-persistent-objects-and-submission-overhead/DOC.md new file mode 100644 index 00000000..d1d39441 --- /dev/null +++ b/content/apple/docs/metal-persistent-objects-and-submission-overhead/DOC.md @@ -0,0 +1,69 @@ +--- +name: metal-persistent-objects-and-submission-overhead +description: "Apple Metal persistent object and submission-overhead guidance: reuse devices, queues, pipelines, resources, and minimize command-buffer churn." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,persistent-objects,command-buffer,command-queue,pipeline-reuse,submission-overhead,device,compute,performance" +--- + +# Metal Persistent Objects And Submission Overhead + +Use this page when a Metal compute path is functionally correct but wastes time rebuilding objects or submitting work too granularly. + +## The Main Rule + +Create persistent Metal objects early and reuse them often. + +For compute code, this usually means: + +- one long-lived `MTLDevice` per GPU +- one long-lived `MTLCommandQueue` for the main compute path +- long-lived `MTLComputePipelineState` objects +- reusable buffers and textures when sizes are stable + +Do not rebuild these objects in the hot dispatch loop unless the workload genuinely requires it. 
+ +## Where Submission Overhead Comes From + +The most common CPU-side overhead sources are: + +- too many tiny command buffers +- repeatedly rebuilding pipeline state +- creating short-lived resources every iteration +- rebinding or recomputing state that could be cached + +The fix is usually structural, not a shader micro-optimization. + +## Good Baseline Pattern + +1. create device, queue, libraries, and pipelines at initialization +2. preallocate reusable resources for steady-state shapes +3. batch related compute work into as few command buffers as practical +4. profile CPU encoding time separately from GPU execution time + +## Practical Review Questions + +- Is the wrapper creating a command buffer per tiny operation? +- Is pipeline creation happening outside initialization? +- Are resources reused, or recreated every invocation? +- Is the bottleneck CPU submission time rather than GPU execution time? + +## Common Failure Modes + +- command buffers are fragmented into many tiny submissions +- a stable pipeline is recreated every dispatch +- dynamic-shape support is implemented by reallocating everything every time +- GPU slowdown is assumed when the real issue is CPU encoding churn + +## Official Source Links (Fact Check) + +- Persistent objects best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/PersistentObjects.html +- Command buffers best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/CommandBuffers.html +- Command organization and execution model: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Cmd-Submiss/Cmd-Submiss.html +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-prefetch-and-reuse-heuristics/DOC.md b/content/apple/docs/metal-prefetch-and-reuse-heuristics/DOC.md new file 
mode 100644 index 00000000..ad886361 --- /dev/null +++ b/content/apple/docs/metal-prefetch-and-reuse-heuristics/DOC.md @@ -0,0 +1,61 @@ +--- +name: metal-prefetch-and-reuse-heuristics +description: "Apple Metal prefetch and reuse heuristics: when staged loads help, how to reason about locality, and how to avoid overcomplicating memory traffic." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,prefetch,reuse,locality,threadgroup-memory,memory-traffic,heuristics,compute" +--- + +# Metal Prefetch And Reuse Heuristics + +Use this page when deciding whether a Metal kernel should explicitly stage data into `threadgroup` memory or continue reading directly from device memory. + +## The Main Question + +Will staging actually increase reuse, or just add barriers and complexity? + +Prefetch-like staging helps only when: + +- several nearby threads reuse the same source region +- the staged footprint fits the threadgroup plan +- the extra synchronization cost is lower than the saved memory traffic + +## Good Heuristic + +Prefer explicit staging when all of the following are true: + +- the same source values are consumed repeatedly by nearby threads +- access is regular enough to load cooperatively +- the kernel already has a natural threadgroup structure + +Prefer direct loads when: + +- reuse is weak or highly irregular +- the staged region would need complex halo logic +- the kernel is small and synchronization would dominate + +## What To Verify + +- the staged tile is consumed enough times to justify the load +- barriers are placed only around the actual reuse window +- edge or halo handling does not erase the benefit +- the staged working set does not make threadgroup sizing worse + +## Common Failure Modes + +- prefetch logic is added to a kernel with little real reuse +- threadgroup staging improves one synthetic benchmark but hurts realistic edge cases +- staged regions are larger than 
needed, increasing pressure without clear gain +- reuse assumptions are copied from CUDA-style kernels without checking Metal-specific dispatch costs + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-prefix-scan-patterns/DOC.md b/content/apple/docs/metal-prefix-scan-patterns/DOC.md new file mode 100644 index 00000000..1428ef33 --- /dev/null +++ b/content/apple/docs/metal-prefix-scan-patterns/DOC.md @@ -0,0 +1,66 @@ +--- +name: metal-prefix-scan-patterns +description: "Apple Metal prefix scan patterns: inclusive or exclusive scan structure, threadgroup staging, multi-pass composition, and validation strategy." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,prefix-scan,scan,exclusive-scan,inclusive-scan,parallel-prefix,threadgroup,compute,multi-pass" +--- + +# Metal Prefix Scan Patterns + +Use this page when implementing prefix sum or scan-style kernels in Metal. + +## What Makes Scan Different + +A scan is not just a reduction. + +It preserves a prefix value for every element, which means: + +- local threadgroup work is only one stage of the algorithm +- large arrays usually require multiple passes +- correctness depends on both local scan logic and inter-block carry propagation + +## Safe Decomposition + +A practical structure is: + +1. scan data within each threadgroup +2. write one block total per threadgroup +3. scan the block totals +4. add scanned block offsets back to the per-element outputs + +This keeps local and global logic separate. 
+ +## Validation Strategy + +- test inclusive and exclusive semantics separately +- test sizes smaller than one threadgroup and much larger than one threadgroup +- test non-power-of-two sizes +- compare against a simple CPU scan on tiny arrays + +Scan bugs often hide until odd lengths or multi-block carry propagation is exercised. + +## Performance Notes + +- threadgroup memory is usually the first optimization tool +- dispatch geometry and block size must match the carry propagation logic +- a fast local scan is not enough if global offset propagation is poorly structured + +## Common Failure Modes + +- exclusive and inclusive semantics are mixed +- block totals are written correctly but not added back correctly +- logic assumes power-of-two sizes everywhere +- scan is treated as a one-pass problem on arrays that require hierarchical composition + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-producer-consumer-staging-playbook/DOC.md b/content/apple/docs/metal-producer-consumer-staging-playbook/DOC.md new file mode 100644 index 00000000..465e95d5 --- /dev/null +++ b/content/apple/docs/metal-producer-consumer-staging-playbook/DOC.md @@ -0,0 +1,57 @@ +--- +name: metal-producer-consumer-staging-playbook +description: "Apple Metal producer-consumer staging playbook: explicit ownership, handoff points, intermediate resources, and synchronization boundaries for chained kernels." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,producer-consumer,staging,handoff,intermediate-resource,synchronization,compute-pipeline,compute" +--- + +# Metal Producer-Consumer Staging Playbook + +Use this page when one Metal kernel produces an intermediate that another kernel consumes, and the handoff itself is where correctness or performance starts to break down. + +## The Main Rule + +Treat every intermediate resource as a contract between one producer and one consumer. + +That contract should answer: + +- who writes it +- who reads it next +- when it becomes valid +- when it may be reused or discarded + +If those four points are vague, the pipeline usually becomes fragile. + +## Safe Staging Pattern + +- give each intermediate a clear producer and consumer role +- keep resource type aligned with access pattern +- place synchronization only at real handoff boundaries +- validate each stage separately before fusing or aliasing resources + +## Review Checklist + +- does the consumer read the intermediate only after the producer signaled completion? +- is the intermediate buffer or texture type still appropriate for the next stage? +- can the handoff be expressed in one command buffer, or does it need an explicit event or fence? +- is the same intermediate reused too early by another stage? 
+ +## Common Failure Modes + +- the consumer is launched before the producer made the intermediate valid +- one intermediate is shared by several stages without explicit ownership +- synchronization is added globally instead of just around the true handoff +- the resource type was chosen for the producer and is poor for the consumer + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Implementing a multistage image filter using heaps and events: https://developer.apple.com/documentation/metal/implementing-a-multistage-image-filter-using-heaps-and-events +- Synchronizing passes with a fence: https://developer.apple.com/documentation/metal/synchronizing-passes-with-a-fence + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-pytorch-custom-op-host-patterns/DOC.md b/content/apple/docs/metal-pytorch-custom-op-host-patterns/DOC.md new file mode 100644 index 00000000..59a54d0e --- /dev/null +++ b/content/apple/docs/metal-pytorch-custom-op-host-patterns/DOC.md @@ -0,0 +1,84 @@ +--- +name: metal-pytorch-custom-op-host-patterns +description: "Apple Metal PyTorch custom-op host patterns: deciding when to leave MPS, structuring the host path, and validating custom op integration." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,pytorch,custom-op,host-wrapper,mps,cpp-extension,torch-library,metal-custom-op,opcheck" +--- + +# Metal PyTorch Custom-Op Host Patterns + +Use this page when PyTorch `mps` is no longer enough and you need a custom operation backed by Metal. 
+ +## When To Leave Plain `mps` + +Stay with stock PyTorch `mps` when: + +- the op graph is already expressible with built-in operators +- performance is acceptable +- your main goal is device portability, not custom kernel ownership + +Move to a custom-op path when: + +- a fused op is missing +- launch or memory overhead dominates +- kernel behavior must be controlled directly + +## Host-Side Responsibility Split + +There are two distinct layers: + +- PyTorch custom-op registration and dispatch boundary +- Metal-side wrapper that owns pipeline creation, binding, and dispatch + +Keep them separate. + +That means: + +- PyTorch-facing code validates tensor shapes, dtypes, and device placement +- Metal-facing code handles buffers, pipeline state, and launch geometry + +## Good Integration Pattern + +1. Validate tensor arguments at the PyTorch boundary. +2. Convert or view storage into the expected Metal wrapper inputs. +3. Dispatch a dedicated Metal wrapper for the op. +4. Return tensors whose lifetime and synchronization rules are clear. +5. Test the custom op independently from model-level integration. + +## Why This Matters + +Without a clean boundary: + +- debugging mixes PyTorch registration bugs with Metal launch bugs +- tensor layout assumptions leak into shader logic +- synchronization ownership becomes unclear + +## Testing Strategy + +Use both: + +- PyTorch-side correctness tests for the registered op +- lower-level Metal-side tests for binding and dispatch correctness + +PyTorch's custom operator guidance and `torch.library.opcheck` are relevant here for validating the operator boundary itself. 
+ +## Common Failure Modes + +- PyTorch boundary accepts tensors the Metal wrapper does not actually support +- wrapper assumes contiguous layout but op registration never enforces it +- custom op is "correct" on CPU fallback but incorrect on Metal because launch geometry differs +- model-level tests hide whether the failure is registration, layout, or shader execution + +## Official Source Links (Fact Check) + +- Apple: Customizing a PyTorch operation: https://developer.apple.com/documentation/metal/customizing-a-pytorch-operation +- Apple: Accelerated PyTorch training on Mac: https://developer.apple.com/metal/pytorch/ +- PyTorch custom C++/CUDA operators tutorial: https://docs.pytorch.org/tutorials/advanced/cpp_custom_ops.html +- PyTorch MPS backend notes: https://docs.pytorch.org/docs/stable/notes/mps + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-ragged-tensors-and-masked-kernels/DOC.md b/content/apple/docs/metal-ragged-tensors-and-masked-kernels/DOC.md new file mode 100644 index 00000000..ff266916 --- /dev/null +++ b/content/apple/docs/metal-ragged-tensors-and-masked-kernels/DOC.md @@ -0,0 +1,62 @@ +--- +name: metal-ragged-tensors-and-masked-kernels +description: "Apple Metal ragged tensor and masked kernel patterns: variable-length work, explicit masks, bounds-safe dispatch, and divergence-aware validation." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,ragged,masked-kernel,variable-length,bounds-check,divergence,tail-processing,compute" +--- + +# Metal Ragged Tensors And Masked Kernels + +Use this page when logical rows, sequences, or tiles have variable length and the kernel cannot assume a dense rectangular workload. + +## What Changes In Ragged Workloads + +Dense kernels often assume that every launched thread owns one valid element. 
+ +Ragged workloads break that assumption: + +- valid work per row or segment may differ +- the launched grid is often rounded up beyond the real element count +- some lanes are active only under an explicit mask + +If this is not modeled clearly, the kernel either reads out of range or silently includes invalid elements in the result. + +## Safe Baseline Pattern + +- dispatch a simple rectangular superset +- pass explicit logical lengths, masks, or prefix offsets +- guard each load and store with the real validity condition +- validate with very uneven row lengths and short tails + +## Design Questions + +- is validity expressed by lengths, offsets, or a boolean mask? +- is masked-off work supposed to write zero, skip the write, or keep the previous value? +- does divergence only affect performance, or can it change reduction semantics? + +## Common Failure Modes + +- bounds checks protect loads but not stores +- masked-out lanes still participate in reductions or normalizations +- the wrapper rounds dispatch size correctly but passes the wrong logical lengths +- only near-dense cases are tested, so pathological ragged shapes never run + +## Review Checklist + +- What defines the valid region for each row, segment, or tile? +- Do masked lanes avoid both invalid reads and invalid writes? +- Are reductions and normalizations computed only over valid elements? +- Are tail and empty-row cases part of the test set? 
+ +## Official Source Links (Fact Check) + +- Calculating threadgroup and grid sizes: https://developer.apple.com/documentation/metal/calculating-threadgroup-and-grid-sizes +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-reduction-patterns/DOC.md b/content/apple/docs/metal-reduction-patterns/DOC.md new file mode 100644 index 00000000..ebacfc0d --- /dev/null +++ b/content/apple/docs/metal-reduction-patterns/DOC.md @@ -0,0 +1,63 @@ +--- +name: metal-reduction-patterns +description: "Apple Metal reduction patterns: threadgroup accumulation, staged reductions, and synchronization rules for sum/max-like kernels." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,reduction,sum,max,min,threadgroup,accumulation,parallel-reduction,barrier,compute" +--- + +# Metal Reduction Patterns + +Use this page when implementing parallel reductions such as sum, max, min, norm accumulation, or partial aggregation in Metal. + +## Reduction Structure + +Most Metal reductions follow a staged pattern: + +1. each thread loads one or more input elements +2. partial values are stored in `threadgroup` memory +3. the threadgroup synchronizes +4. the threadgroup reduces partial values locally +5. one thread writes a block result or final result + +Large reductions usually require multiple passes or hierarchical aggregation. + +## Good Baseline Strategy + +- start with one-threadgroup local reduction +- validate correctness on non-power-of-two sizes +- then extend to hierarchical reduction across multiple threadgroups + +This prevents mixing local-synchronization bugs with global-aggregation bugs. 
+ +## Important Design Choices + +- how many input elements each thread loads +- where partial sums live (`threadgroup` memory) +- when barriers are required +- whether the kernel emits one final value or many partial block values + +## Common Failure Modes + +- reduction assumes a power-of-two threadgroup size but the wrapper dispatches another value +- barriers are missing between reduction stages +- edge elements past the logical input length are not masked correctly +- one-pass logic is used on a workload that actually requires hierarchical aggregation + +## Performance Notes + +- reduction kernels are often memory-sensitive first, not math-sensitive first +- threadgroup size and per-thread load count should be tuned together +- correctness for odd sizes is a stronger baseline check than large synthetic powers of two + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-resource-binding-checklist/DOC.md b/content/apple/docs/metal-resource-binding-checklist/DOC.md new file mode 100644 index 00000000..86a72db4 --- /dev/null +++ b/content/apple/docs/metal-resource-binding-checklist/DOC.md @@ -0,0 +1,67 @@ +--- +name: metal-resource-binding-checklist +description: "Apple Metal resource binding checklist: buffer indices, texture binding, argument consistency, residency, and launch-time validation." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,resource-binding,buffer-index,texture-binding,argument-buffer,useResource,useHeap,compute-encoder,validation" +--- + +# Metal Resource Binding Checklist + +Use this page when a Metal kernel compiles but returns wrong data because host-side resources are bound incorrectly. + +## Binding Checklist + +- `[[buffer(i)]]` indices match `setBuffer(... atIndex:i)` +- texture indices match `setTexture(... atIndex:i)` +- constant arguments are passed through the intended binding path +- offsets are aligned and within buffer bounds +- resources referenced indirectly are made resident when required + +Binding errors often look like math bugs even though the shader logic is fine. + +## Direct Binding Discipline + +For ordinary compute encoders: + +- define one authoritative table for buffer slots +- reuse the same slot definitions across wrapper and shader code +- keep optional resources explicit instead of shifting slot numbers dynamically + +## Indirect Or Advanced Binding + +When using argument-buffer-style or indirect resource binding, Apple documents that resource residency and dependency signaling still matter. + +That means APIs such as: + +- `useResource` +- `useHeap` + +may be necessary depending on the binding model. + +## Common Failure Modes + +- one buffer index shifts after a wrapper refactor and all later bindings become wrong +- an offset is valid as a byte value but invalid for the element layout +- optional resources are omitted and later bindings silently slide into the wrong slots +- indirectly referenced resources are not made resident +- host code updates struct layout but not shader-side expectations + +## Practical Review Method + +1. Write down the full binding table. +2. Compare each slot against kernel attributes. +3. Validate offsets and sizes separately from kernel math. +4. 
Only after binding is proven correct should you debug shader arithmetic.
only then optimize for throughput + +## Typical Strategies + +- if writes are unique, keep the kernel simple and document the invariant +- if writes collide, use atomics or a staged merge path +- if ordering matters, define a deterministic winner rule instead of relying on timing + +## Common Failure Modes + +- scatter is treated like gather and duplicate destinations are ignored +- atomics are added, but the operation is still not semantically correct for the workload +- a staged merge path is introduced without validating duplicate-heavy cases +- tests contain mostly unique indices, so collisions are underrepresented + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Encoding indirect command buffers on the GPU: https://developer.apple.com/documentation/metal/encoding-indirect-command-buffers-on-the-gpu + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-segmented-reduction-patterns/DOC.md b/content/apple/docs/metal-segmented-reduction-patterns/DOC.md new file mode 100644 index 00000000..7ab01340 --- /dev/null +++ b/content/apple/docs/metal-segmented-reduction-patterns/DOC.md @@ -0,0 +1,64 @@ +--- +name: metal-segmented-reduction-patterns +description: "Apple Metal segmented reduction patterns: segment boundaries, local aggregation, carry handling, and validation for irregular grouped reductions." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,segmented-reduction,reduction,grouped,irregular,scan,threadgroup,aggregation,compute" +--- + +# Metal Segmented Reduction Patterns + +Use this page when reducing values within logical groups, where segment boundaries are part of the input instead of one fixed contiguous range. 
+ +## Why Segmented Reduction Is Harder + +A segmented reduction is not just "many reductions at once." + +The kernel must preserve segment boundaries while still parallelizing work. This usually means: + +- segment starts or segment IDs are part of the input +- local threadgroup aggregation may cross segment boundaries if indexing is careless +- large segments may span multiple threadgroups and need carry logic + +## Safe Decomposition + +Start with three concerns separated: + +1. local aggregation inside one threadgroup +2. representation of segment boundaries or segment IDs +3. merge logic when one segment spans multiple blocks + +This avoids mixing local correctness bugs with cross-block carry bugs. + +## What To Verify First + +- segment boundaries are reproduced exactly from the reference path +- empty or single-element segments behave correctly +- segments spanning more than one threadgroup are handled explicitly +- output layout is unambiguous: one value per segment or one partial per block + +## Common Failure Modes + +- reduction logic silently merges adjacent segments +- local shared state is reused without rechecking segment boundaries +- cross-block carry logic is missing for long segments +- the test set contains only evenly sized segments, hiding edge cases + +## Validation Strategy + +- test highly irregular segment lengths +- test single-element and empty-edge cases if the API allows them +- compare against a CPU grouped reduction on tiny inputs +- separately test "one long segment" and "many tiny segments" + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-silent-nan-inf-debugging-checklist/DOC.md 
b/content/apple/docs/metal-silent-nan-inf-debugging-checklist/DOC.md new file mode 100644 index 00000000..d5816522 --- /dev/null +++ b/content/apple/docs/metal-silent-nan-inf-debugging-checklist/DOC.md @@ -0,0 +1,61 @@ +--- +name: metal-silent-nan-inf-debugging-checklist +description: "Apple Metal silent NaN and Inf debugging checklist: isolate the first bad intermediate, validate boundary conditions, and separate arithmetic overflow from data corruption." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,nan,inf,debugging,overflow,intermediate-values,numerics,shader-debugger,compute" +--- + +# Metal Silent NaN And Inf Debugging Checklist + +Use this page when a Metal kernel runs to completion but outputs contain `NaN` or `Inf` values without an obvious crash or validation error. + +## First Principle + +Find the first bad intermediate, not just the final bad output. + +`NaN` and `Inf` propagation often turns one earlier mistake into a later explosion, so the useful question is: + +"Which intermediate value became invalid first?" + +## Common Root Causes + +- exponentials or divisions overflow +- a denominator becomes zero or near zero +- uninitialized threadgroup data participates in arithmetic +- an out-of-range read introduces garbage that later becomes `NaN` + +## Safe Debugging Order + +1. compare against a tiny deterministic reference input +2. inspect stage outputs or intermediate buffers +3. add finite-value checks around suspicious steps +4. test whether the first invalid value appears before or after synchronization points +5. 
re-run with simplified inputs that remove extreme magnitudes + +## What To Verify + +- denominators cannot become zero unless the algorithm defines that case +- exponentials, reciprocals, and normalization steps use the intended stable form +- all staged data is initialized before use +- invalid values are not caused by layout or indexing bugs upstream + +## Common Failure Modes + +- a numerical overflow is blamed on tolerance alone while the kernel is actually unstable +- `NaN` appears only on edge tiles, but debugging focuses on the main interior path +- the first invalid intermediate is never captured because only final output is checked +- a managed or synchronized resource issue is mistaken for arithmetic instability + +## Official Source Links (Fact Check) + +- Metal developer tools: https://developer.apple.com/metal/tools/ +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Synchronizing a managed resource in macOS: https://developer.apple.com/documentation/metal/synchronizing-a-managed-resource-in-macos + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-simdgroup-patterns/DOC.md b/content/apple/docs/metal-simdgroup-patterns/DOC.md new file mode 100644 index 00000000..c3857ee3 --- /dev/null +++ b/content/apple/docs/metal-simdgroup-patterns/DOC.md @@ -0,0 +1,80 @@ +--- +name: metal-simdgroup-patterns +description: "Apple Metal SIMD-group patterns: subgroup-level cooperation, lane-sensitive logic, and when not to assume CUDA warp behavior." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,simdgroup,simd-group,subgroup,warp-like,msl,threadgroup,shuffle,reduction" +--- + +# Metal SIMD-Group Patterns + +Use this page when a Metal kernel depends on subgroup-level cooperation or lane-sensitive execution. 
+ +## What A SIMD Group Is + +Metal exposes SIMD-group concepts for threads that execute together at a finer scope than a full threadgroup. + +This is the closest Metal concept to a CUDA warp, but it should not be treated as a drop-in replacement. + +## Practical Rule + +- Use SIMD-group reasoning only when a kernel genuinely needs subgroup-local cooperation. +- Keep the default kernel design based on grid and threadgroup semantics unless subgroup behavior is the real performance or correctness bottleneck. +- If a kernel was originally designed around CUDA warp intrinsics, re-check every assumption before porting it to Metal. + +## Where SIMD-Group Logic Shows Up + +Common cases include: + +- subgroup reductions +- lane-local exchange patterns +- ballot-like control decisions +- wave/warp-sensitive prefix patterns + +These are performance-sensitive patterns, but they are also some of the easiest places to introduce incorrect CUDA-to-Metal translations. + +## What Not To Assume + +- Do not assume CUDA warp width rules or naming transfer directly to Metal. +- Do not assume a threadgroup barrier is a substitute for subgroup-local logic. +- Do not hard-code lane-sensitive algorithms unless the Metal-side subgroup contract is clear and tested. + +## Safe Porting Strategy + +1. Start with a threadgroup-correct version. +2. Validate numerics and synchronization first. +3. Introduce SIMD-group optimization only after the baseline is correct. +4. Re-profile and re-validate after each subgroup-specific change. + +## Common Failure Modes + +- Ported CUDA warp code relies on implicit warp invariants that are not documented the same way in Metal. +- A kernel mixes threadgroup-wide and subgroup-wide assumptions without clear boundaries. +- Performance tuning introduces subgroup-specific logic before the threadgroup baseline is correct. +- Lane-sensitive debugging is skipped because output "usually" looks correct on one machine. 
+ +## When To Stay At Threadgroup Scope + +Stay at threadgroup scope when: + +- the data-sharing pattern naturally spans the whole tile +- correctness depends on threadgroup memory and barriers anyway +- subgroup logic would make the kernel harder to reason about than it is worth + +Use SIMD-group techniques when: + +- the algorithm is genuinely subgroup-local +- shared-memory traffic can be reduced materially +- the optimization has a measurable benefit on target hardware + +## Official Source Links (Fact Check) + +- Metal resources hub: https://developer.apple.com/metal/resources/ +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-softmax-and-logsumexp-stability-patterns/DOC.md b/content/apple/docs/metal-softmax-and-logsumexp-stability-patterns/DOC.md new file mode 100644 index 00000000..bbfa91e4 --- /dev/null +++ b/content/apple/docs/metal-softmax-and-logsumexp-stability-patterns/DOC.md @@ -0,0 +1,63 @@ +--- +name: metal-softmax-and-logsumexp-stability-patterns +description: "Apple Metal softmax and logsumexp stability patterns: max-subtraction, staged reductions, accumulation precision, and validation for numerically sensitive kernels." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,softmax,logsumexp,numerical-stability,max-subtraction,reduction,precision,compute" +--- + +# Metal Softmax And Logsumexp Stability Patterns + +Use this page when implementing softmax, logsumexp, or related normalization kernels that are numerically sensitive even when the indexing logic is correct. + +## The Main Stability Rule + +Do not exponentiate raw values directly when they may have large magnitude. + +For softmax-style kernels, the stable baseline is: + +1. 
reduce to find the maximum value over the logical reduction axis +2. subtract that maximum before exponentiation +3. accumulate exponentials in stable precision +4. divide by the accumulated total or take the log as needed + +## Why This Matters + +Without max-subtraction: + +- large positive inputs can overflow exponentials +- large negative inputs can underflow to zero +- the final normalized output can become `NaN`, `Inf`, or silently wrong + +## What To Verify + +- the reduction axis for the maximum matches the intended softmax axis +- accumulation precision is high enough for the workload +- masked or ragged elements do not participate incorrectly in the max or sum +- the kernel handles tiny and very large logits consistently with the reference path + +## Common Failure Modes + +- max is reduced over the wrong extent or wrong stride +- one optimization keeps max-subtraction but lowers accumulation precision too aggressively +- masked elements are included in the denominator +- only moderate-value test cases are used, hiding overflow behavior + +## Validation Strategy + +- compare against a stable CPU reference +- test extremely large and extremely negative values +- test all-equal inputs and one-dominant-value inputs +- test long reduction axes where accumulation error is more visible + +## Official Source Links (Fact Check) + +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-streaming-and-online-kernel-patterns/DOC.md b/content/apple/docs/metal-streaming-and-online-kernel-patterns/DOC.md new file mode 100644 index 00000000..0ce0d42b --- /dev/null +++ b/content/apple/docs/metal-streaming-and-online-kernel-patterns/DOC.md @@ -0,0 +1,63 @@ +--- +name: metal-streaming-and-online-kernel-patterns +description: 
"Apple Metal streaming and online kernel patterns: chunked processing, persistent wrapper state, rolling buffers, and latency-aware compute design." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,streaming,online,chunked,rolling-buffer,latency,persistent-state,incremental,compute" +--- + +# Metal Streaming And Online Kernel Patterns + +Use this page when inputs arrive incrementally and the compute path must process chunks over time instead of one static batch. + +## What Makes Streaming Work Different + +Streaming kernels care about latency and state continuity, not only peak throughput. + +Typical examples include: + +- chunked signal or sequence processing +- incremental statistics +- sliding-window transforms +- real-time preprocessing pipelines + +## Good Wrapper Structure + +- keep long-lived pipeline objects and reusable buffers persistent +- separate one-time setup from per-chunk updates +- make chunk boundaries explicit in both host code and kernel parameters +- keep any rolling state or history buffer ownership unambiguous + +## Design Questions + +- what state carries from one chunk to the next? +- does the kernel need overlap with the previous chunk? +- is the critical metric latency, throughput, or both? +- can several small chunks be batched without breaking semantics? + +## Common Failure Modes + +- state buffers are reinitialized accidentally between chunks +- overlap windows are handled inconsistently at chunk boundaries +- tiny chunks cause submission overhead to dominate execution time +- testing uses only one chunk, hiding inter-chunk state bugs + +## Review Checklist + +- Which buffers persist across chunk boundaries? +- Does the first chunk use different initialization rules than later chunks? +- Are boundary overlaps or carry values explicitly validated? +- Is batching small chunks possible without changing correctness? 
+ +## Official Source Links (Fact Check) + +- Command buffers best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/CommandBuffers.html +- Persistent objects best practices: https://developer.apple.com/library/archive/documentation/3DDrawing/Conceptual/MTLBestPracticesGuide/PersistentObjects.html +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-strided-views-and-subtensor-access-patterns/DOC.md b/content/apple/docs/metal-strided-views-and-subtensor-access-patterns/DOC.md new file mode 100644 index 00000000..5d49ff59 --- /dev/null +++ b/content/apple/docs/metal-strided-views-and-subtensor-access-patterns/DOC.md @@ -0,0 +1,55 @@ +--- +name: metal-strided-views-and-subtensor-access-patterns +description: "Apple Metal strided-view and subtensor access patterns: offset math, slice validity, and when to materialize a contiguous copy instead of using irregular strides." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,strided-view,subtensor,slice,offset-math,noncontiguous,layout,compute" +--- + +# Metal Strided Views And Subtensor Access Patterns + +Use this page when a Metal kernel consumes a non-contiguous tensor view, a slice, or a subtensor defined by base offsets and strides. + +## The Main Question + +Should the kernel operate directly on the strided view, or should the wrapper materialize a contiguous copy first? 
+ +Direct strided access is attractive because it avoids a copy, but it can: + +- complicate index math +- reduce memory locality +- make vectorization harder +- hide subtle shape or offset bugs + +## Safe Decision Process + +- start by proving the strided index math on tiny cases +- check whether the access pattern becomes highly irregular +- if the view is reused many times, consider packing once into a contiguous buffer +- choose the simpler path unless the copy cost is clearly the bottleneck + +## What To Verify + +- base offset is correct for the chosen slice +- each dimension's stride matches the view contract +- the logical shape of the view is passed separately from underlying storage size +- out-of-range elements are impossible under the view definition + +## Common Failure Modes + +- one dimension's stride is interpreted in bytes on one side and elements on the other +- the view shape is correct but the base offset is wrong +- a direct strided kernel is kept even though repeated reuse made a packed copy cheaper +- a contiguous fast path and strided slow path diverge semantically + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-tensor-packing-and-unpacking-patterns/DOC.md b/content/apple/docs/metal-tensor-packing-and-unpacking-patterns/DOC.md new file mode 100644 index 00000000..ba2448fc --- /dev/null +++ b/content/apple/docs/metal-tensor-packing-and-unpacking-patterns/DOC.md @@ -0,0 +1,54 @@ +--- +name: metal-tensor-packing-and-unpacking-patterns +description: "Apple Metal tensor packing and unpacking patterns: contiguous staging, vector-friendly layouts, and explicit shape or stride contracts for data 
transform kernels." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,tensor-packing,unpacking,layout-transform,contiguous,vectorization,shape-contract,stride,compute" +--- + +# Metal Tensor Packing And Unpacking Patterns + +Use this page when a Metal workflow needs to rearrange tensor data into a layout that is easier for later kernels to consume. + +## Why Packing Exists + +Packing and unpacking kernels are usually inserted to: + +- convert between framework layout and kernel-friendly layout +- make later accesses contiguous or vector-friendly +- batch several small logical tensors into one regular representation + +These kernels do little arithmetic, so correctness and memory behavior matter more than ALU throughput. + +## Safe Baseline Pattern + +- define the source logical shape and destination logical shape explicitly +- write one direct mapping kernel first +- validate that unpacking reconstructs the original tensor +- only then tune for vector-width alignment or tiled movement + +## What To Verify + +- packed layout contract is documented in both host code and the kernel +- destination strides or offsets are derived from the intended packed format +- padding regions are defined explicitly +- later kernels actually use the new layout as intended + +## Common Failure Modes + +- packing logic is correct but the consumer still assumes the old layout +- padding bytes or elements are left uninitialized and later treated as valid +- the host wrapper allocates for logical size, not packed size +- pack and unpack kernels drift apart and stop being inverses + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Calculating threadgroup and grid sizes: https://developer.apple.com/documentation/metal/calculating-threadgroup-and-grid-sizes +- Metal Shading Language Specification: 
https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-texture-vs-buffer-path-selection/DOC.md b/content/apple/docs/metal-texture-vs-buffer-path-selection/DOC.md new file mode 100644 index 00000000..49c3aa96 --- /dev/null +++ b/content/apple/docs/metal-texture-vs-buffer-path-selection/DOC.md @@ -0,0 +1,82 @@ +--- +name: metal-texture-vs-buffer-path-selection +description: "Apple Metal texture versus buffer path selection: choosing between formatted image resources and linear buffers for compute kernels." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,texture,buffer,resource-selection,mtlbuffer,mtltexture,image-kernel,linear-memory,formatted-data" +--- + +# Metal Texture Vs Buffer Path Selection + +Use this page when deciding whether a compute kernel should operate on `MTLBuffer` or `MTLTexture` resources. + +## Resource Model Difference + +Apple's resource model separates: + +- `MTLBuffer`: unformatted linear memory +- `MTLTexture`: formatted image data with explicit texture type and pixel format + +This is not a cosmetic distinction. It affects indexing, layout, access mode, and host-side creation rules. 
+ +## Choose Buffers When + +- data is naturally linear +- the kernel uses tensor- or array-like indexing +- you need explicit struct or element layout control +- formatted texture semantics do not add value + +Typical cases: + +- linear algebra +- reductions +- custom fused tensor ops +- buffer-backed staging paths + +## Choose Textures When + +- the data is naturally image-like +- format and dimensionality matter directly +- the kernel logic is built around 2D or 3D coordinates +- texture-specific access patterns or image-processing semantics are central + +Typical cases: + +- image filters +- screen-space compute passes +- 2D texture transforms + +## Host-Side Consequences + +- buffers push more responsibility onto explicit stride and alignment calculations +- textures push more responsibility onto format, size, usage, and access-mode correctness +- buffer-backed texture paths require alignment and row-pitch validation + +## Common Failure Modes + +- choosing textures for linear tensor workloads and adding unnecessary layout complexity +- choosing buffers for image workloads and re-implementing texture semantics badly +- switching between the two without updating kernel indexing assumptions +- assuming one path can be substituted for the other without changing host-side setup + +## Decision Rule + +Ask: + +1. Is the data fundamentally formatted image data or unformatted linear data? +2. Does the kernel index by image coordinates or by element offsets? +3. Which path minimizes layout ambiguity in the host wrapper? + +Choose the path that makes both kernel logic and host binding simpler. 
+ +## Official Source Links (Fact Check) + +- Resource Objects: Buffers and Textures: https://developer.apple.com/library/archive/documentation/Miscellaneous/Conceptual/MetalProgrammingGuide/Mem-Obj/Mem-Obj.html +- Processing a texture in a compute function: https://developer.apple.com/documentation/metal/processing-a-texture-in-a-compute-function +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-threadgroup-sizing-playbook/DOC.md b/content/apple/docs/metal-threadgroup-sizing-playbook/DOC.md new file mode 100644 index 00000000..fd213ec0 --- /dev/null +++ b/content/apple/docs/metal-threadgroup-sizing-playbook/DOC.md @@ -0,0 +1,74 @@ +--- +name: metal-threadgroup-sizing-playbook +description: "Apple Metal threadgroup sizing playbook: dispatch geometry, pipeline limits, locality tradeoffs, and measurement-first tuning." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,threadgroup,threadsperthreadgroup,dispatchthreads,threadexecutionwidth,maxTotalThreadsPerThreadgroup,occupancy,tiling" +--- + +# Metal Threadgroup Sizing Playbook + +Use this page when a Metal kernel is correct but performance or scaling depends heavily on threadgroup shape. + +## The Two Inputs You Must Balance + +Threadgroup sizing is driven by both: + +- algorithm locality +- hardware and pipeline execution limits + +Apple exposes pipeline and device properties that matter here, including: + +- `threadExecutionWidth` +- `maxTotalThreadsPerThreadgroup` + +These should guide threadgroup sizing instead of hard-coded folklore. + +## Baseline Strategy + +1. Start with a simple threadgroup shape that maps naturally to the data layout. +2. Confirm correctness and bounds handling. +3. Read pipeline limits. +4. Sweep a small set of candidate threadgroup shapes. +5. 
Keep the best measured configuration for the target kernel family. + +## What Shapes Tend To Work Well + +- 1D kernels: choose widths that align naturally with the pipeline execution width +- 2D tiled kernels: make threadgroup shape match tile layout and local reuse pattern +- threadgroup-memory kernels: size the group jointly with threadgroup memory footprint + +The right shape is kernel-specific. Reuse only after measurement. + +## Constraints To Respect + +- total threads in a threadgroup must not exceed pipeline/device limits +- threadgroup memory usage scales with group shape +- larger groups can reduce scheduling flexibility or increase local-memory pressure + +## Common Failure Modes + +- one universal threadgroup size is applied to every kernel +- threadgroup size is chosen without checking `threadExecutionWidth` +- size is increased for throughput, but threadgroup memory footprint becomes the real bottleneck +- 2D kernels use a shape that is convenient for indexing but poor for locality + +## Tuning Order + +1. make indexing and bounds logic correct +2. pick a shape consistent with data layout +3. check `threadExecutionWidth` and `maxTotalThreadsPerThreadgroup` +4. benchmark a small grid of candidate shapes +5. 
keep the chosen size documented in the wrapper or kernel-selection logic + +## Official Source Links (Fact Check) + +- `threadExecutionWidth`: https://developer.apple.com/documentation/metal/mtlcomputepipelinestate/threadexecutionwidth +- `maxTotalThreadsPerThreadgroup`: https://developer.apple.com/documentation/metal/mtlcomputepipelinestate/maxtotalthreadsperthreadgroup +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-tiled-matmul-patterns/DOC.md b/content/apple/docs/metal-tiled-matmul-patterns/DOC.md new file mode 100644 index 00000000..cc900b11 --- /dev/null +++ b/content/apple/docs/metal-tiled-matmul-patterns/DOC.md @@ -0,0 +1,74 @@ +--- +name: metal-tiled-matmul-patterns +description: "Apple Metal tiled matmul patterns: threadgroup staging, tile indexing, synchronization points, and correctness-first optimization." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,matmul,gemm,tiled-matmul,threadgroup-memory,tiling,matrix-multiply,compute,kernel" +--- + +# Metal Tiled Matmul Patterns + +Use this page when implementing a matrix multiply or GEMM-like kernel in Metal with threadgroup staging. + +## Why Tiling Matters + +For matrix multiplication, the naive kernel often reloads the same source data many times from device memory. + +The standard optimization path is: + +1. partition the problem into tiles +2. stage tile data into `threadgroup` memory +3. synchronize the threadgroup +4. accumulate over the K dimension tile by tile + +This is the same broad optimization idea as tiled CUDA GEMM, but the implementation details live in Metal's threadgroup and dispatch model. 
+ +## Core Structure + +A typical tiled matmul kernel needs: + +- 2D mapping from grid coordinates to output tile coordinates +- `threadgroup` memory for A and B subtiles +- one barrier after staging +- one barrier before reusing the same threadgroup buffers for the next K tile + +## Safe Baseline Pattern + +- start with a correct 2D tiled kernel +- make tile sizes match dispatch shape +- keep bounds handling explicit for edge tiles +- only then tune threadgroup shape and staging layout + +## What To Verify First + +- global row and column indices are derived from grid position, not local threadgroup position alone +- local thread indices match the threadgroup tile layout +- all threads in the threadgroup reach both staging and reuse barriers +- threadgroup memory allocation matches tile dimensions exactly + +## Common Failure Modes + +- threadgroup tile indexing is correct for full tiles but wrong on edge tiles +- one barrier is missing, so later K-step accumulations read stale or partial tile data +- host dispatch geometry and kernel tile assumptions drift apart +- tile sizes are increased for arithmetic reuse but threadgroup memory or scheduling cost becomes the new bottleneck + +## Optimization Order + +1. correct tiled baseline +2. stable threadgroup sizing +3. better threadgroup layout and memory access pattern +4. wrapper-level reuse and batching +5. 
only then consider subgroup-sensitive refinements + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-transpose-and-layout-reorder-patterns/DOC.md b/content/apple/docs/metal-transpose-and-layout-reorder-patterns/DOC.md new file mode 100644 index 00000000..9edaeccf --- /dev/null +++ b/content/apple/docs/metal-transpose-and-layout-reorder-patterns/DOC.md @@ -0,0 +1,71 @@ +--- +name: metal-transpose-and-layout-reorder-patterns +description: "Apple Metal transpose and layout reorder patterns: tile-based remapping, threadgroup staging, edge handling, and correctness checks for data-layout kernels." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,transpose,layout,reorder,permute,tile,threadgroup,data-layout,memory-access,compute" +--- + +# Metal Transpose And Layout Reorder Patterns + +Use this page when implementing tensor transpose, matrix transpose, channel reordering, or other layout-conversion kernels in Metal. + +## Why These Kernels Are Tricky + +Transpose and reorder kernels usually do little arithmetic. 
+ +The real problems are: + +- read and write coordinates change together +- memory access patterns become strided or permuted +- edge tiles are easy to mishandle +- a kernel that is logically correct can still perform badly because writes lose locality + +## Safe Baseline Pattern + +- start with a direct coordinate remap kernel +- make source and destination indexing explicit +- verify shape and stride assumptions on tiny non-square inputs +- only then introduce tile-based threadgroup staging + +This is more reliable than optimizing the access pattern before the permutation logic is stable. + +## When Tiling Helps + +Threadgroup staging is useful when: + +- neighboring threads read a coherent input tile +- the output layout would otherwise produce highly strided access +- the tile shape can be matched cleanly to the dispatch geometry + +The common structure is: + +1. load an input tile +2. synchronize the threadgroup +3. write the transposed or reordered tile + +## What To Verify + +- source index math matches the original logical shape +- destination index math matches the new logical shape +- non-square shapes and partial edge tiles are handled explicitly +- host-side output allocation matches the reordered logical dimensions + +## Common Failure Modes + +- the kernel is tested only on square shapes where row/column mistakes cancel out +- edge tiles read or write out of range +- transpose logic is correct but the wrapper still interprets output with the old layout +- a threadgroup tile is introduced without updating bounds checks for the staged region + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Calculating threadgroup and grid sizes: https://developer.apple.com/documentation/metal/calculating-threadgroup-and-grid-sizes +- Metal Shading Language Specification: https://developer.apple.com/metal/resources/ + +Last cross-check date: 2026-03-21 diff --git 
a/content/apple/docs/metal-transpose-free-layout-choice-playbook/DOC.md b/content/apple/docs/metal-transpose-free-layout-choice-playbook/DOC.md new file mode 100644 index 00000000..40dd9de4 --- /dev/null +++ b/content/apple/docs/metal-transpose-free-layout-choice-playbook/DOC.md @@ -0,0 +1,55 @@ +--- +name: metal-transpose-free-layout-choice-playbook +description: "Apple Metal transpose-free layout choice playbook: selecting data layout early to avoid extra layout-conversion kernels and preserve downstream locality." +metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,layout-choice,transpose-free,data-layout,locality,packing,reorder,compute" +--- + +# Metal Transpose-Free Layout Choice Playbook + +Use this page when a workload repeatedly inserts transpose or reorder kernels only because earlier layout decisions made later stages awkward. + +## The Main Idea + +Sometimes the best transpose kernel is no transpose kernel. 
+ +If several downstream stages naturally prefer one layout, it is often better to: + +- choose that layout earlier +- keep data in that layout longer +- avoid repeated pack, transpose, unpack cycles + +## When To Reconsider Layout + +Revisit layout choice when: + +- the pipeline spends noticeable time in layout-conversion kernels +- one stage prefers row-major while most later stages prefer another view +- a tensor is repeatedly packed and unpacked around the same hotspot + +## Safe Decision Process + +- identify which stages dominate time or bandwidth +- choose the layout that helps the dominant downstream path +- validate that the new layout reduces, rather than relocates, conversion cost +- keep one explicit fallback path until the new layout is stable + +## Common Failure Modes + +- a transpose is optimized heavily instead of removing the need for it +- each stage chooses its own preferred layout, causing repeated conversions +- a new layout helps one kernel but makes several later kernels worse +- the wrapper and kernel disagree on which layout is now canonical + +## Official Source Links (Fact Check) + +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Performing calculations on a GPU: https://developer.apple.com/documentation/metal/performing-calculations-on-a-gpu +- Metal developer tools: https://developer.apple.com/metal/tools/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/metal-validation-and-profiling-workflow/DOC.md b/content/apple/docs/metal-validation-and-profiling-workflow/DOC.md new file mode 100644 index 00000000..5e7e9c16 --- /dev/null +++ b/content/apple/docs/metal-validation-and-profiling-workflow/DOC.md @@ -0,0 +1,81 @@ +--- +name: metal-validation-and-profiling-workflow +description: "Apple Metal validation and profiling workflow: runtime validation, debugger use, Instruments traces, and staged optimization discipline." 
+metadata: + languages: "cpp" + versions: "4.0" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,metal,validation,profiling,workflow,xcode,metal-debugger,instruments,metal-system-trace,performance-counters" +--- + +# Metal Validation And Profiling Workflow + +Use this page when moving from "the kernel runs" to "the kernel is correct, explainable, and fast." + +## The Workflow Order Matters + +Do not optimize first. + +A stable Metal workflow is: + +1. runtime validation +2. debugger-assisted correctness checks +3. dispatch and resource inspection +4. Instruments / trace-based profiling +5. targeted optimization + +Skipping the early steps usually turns performance work into guesswork. + +## Validation Stage + +Use Metal validation and Xcode debugging support to catch: + +- invalid API usage +- bad bindings +- resource misuse +- obvious synchronization mistakes + +This stage should eliminate launch-path bugs before you interpret performance counters. + +## Profiling Stage + +Once correctness is stable, use Metal profiling tools to answer: + +- is the workload CPU-submission bound or GPU-execution bound? +- is dispatch sizing reasonable? +- are synchronization points too frequent? +- is memory behavior dominating runtime? + +Apple's Metal System Trace in Instruments is the right place to answer CPU/GPU overlap questions. + +## Practical Review Questions + +- Is command encoding time too high? +- Is GPU occupancy limited by threadgroup shape or resource pressure? +- Are command buffers fragmented into too many tiny submissions? +- Are resource synchronization points serialized more than necessary? 
+ +## Common Failure Modes + +- profiling starts before validation is clean, so counters are interpreted on a broken kernel +- debugger findings and profiler findings are mixed into one undifferentiated "slow" diagnosis +- a CPU submission bottleneck is treated as a shader bottleneck +- optimization changes are made without capturing a before/after trace + +## Good Engineering Practice + +- keep one reproducible benchmark input +- capture validation-clean traces first +- make one optimization change at a time +- keep the wrapper, dispatch shape, and kernel change history explainable + +## Official Source Links (Fact Check) + +- Metal developer tools: https://developer.apple.com/metal/tools/ +- Compute passes: https://developer.apple.com/documentation/metal/compute-passes +- Metal System Trace: https://developer.apple.com/documentation/metal/analyzing-the-performance-of-your-metal-app +- Discover Metal 4 (barriers and workflow context): https://developer.apple.com/videos/play/wwdc2025/205/ + +Last cross-check date: 2026-03-21 diff --git a/content/apple/docs/pytorch-mps-vs-custom-metal/python/DOC.md b/content/apple/docs/pytorch-mps-vs-custom-metal/python/DOC.md new file mode 100644 index 00000000..c42e1c8f --- /dev/null +++ b/content/apple/docs/pytorch-mps-vs-custom-metal/python/DOC.md @@ -0,0 +1,103 @@ +--- +name: pytorch-mps-vs-custom-metal +description: "PyTorch MPS versus custom Metal kernels: backend boundaries, capability checks, and when to write a custom op." +metadata: + languages: "python" + versions: "2.10" + revision: 1 + updated-on: "2026-03-21" + source: official + tags: "apple,pytorch,mps,metal,metal-performance-shaders,mps-graph,custom-op,custom-metal,torch.mps,macos" +--- + +# PyTorch MPS Vs Custom Metal + +Use this page to avoid mixing four different layers: + +- PyTorch `mps` device +- Metal Performance Shaders (MPS) +- MPS Graph +- custom Metal kernels or custom PyTorch operations + +These are related, but they are not interchangeable. 
+ +## What `mps` Means In PyTorch + +PyTorch's `mps` device is the PyTorch backend for Mac GPU execution. + +According to the official PyTorch MPS notes and Apple Metal/PyTorch material: + +- `torch.device("mps")` moves tensors and modules to the MPS backend +- that backend maps operations onto MPS Graph and tuned kernels provided by Metal Performance Shaders +- availability depends on both the installed PyTorch build and the machine/runtime environment + +## Minimal Availability Check + +```python +import torch + +if torch.backends.mps.is_available(): + device = torch.device("mps") +else: + device = torch.device("cpu") +``` + +If `is_available()` is false, check: + +- `torch.backends.mps.is_built()` +- macOS version and device support + +## What Custom Metal Means + +Custom Metal means you are writing your own Metal kernel and host-side integration rather than relying entirely on stock PyTorch operators. + +That is a different level of ownership: + +- you own kernel code +- you own bindings and launch configuration +- you own debugging and correctness validation + +Apple's `Customizing a PyTorch operation` sample exists specifically for this case. + +## Choose The Right Layer + +Stay with plain PyTorch `mps` when: + +- standard PyTorch ops already express the computation +- performance is acceptable +- your main need is "run on Mac GPU" + +Move toward a custom Metal or custom PyTorch op path when: + +- a critical fused op is missing or too slow +- memory movement or launch overhead dominates +- you need kernel behavior that stock operators do not expose + +## Important `torch.mps` APIs + +The official `torch.mps` docs expose runtime tools such as: + +- `device_count` +- `synchronize` +- memory reporting helpers +- `compile_shader` +- profiler and Metal-capture helpers + +These are useful for debugging and runtime inspection, but they do not replace understanding the lower-level Metal execution model. 
+ +## Common Failure Modes + +- Treating `mps` as if it were a direct custom-kernel API. +- Assuming every CUDA path has an equivalent `mps` kernel or feature surface. +- Using `mps` successfully for stock ops, then assuming custom Metal integration will require no host-side work. +- Confusing MPS Graph and custom Metal kernels in design discussions. + +## Official Source Links (Fact Check) + +- Apple: Accelerated PyTorch training on Mac: https://developer.apple.com/metal/pytorch/ +- Apple: Customizing a PyTorch operation: https://developer.apple.com/documentation/metal/customizing-a-pytorch-operation +- PyTorch MPS backend notes: https://docs.pytorch.org/docs/stable/notes/mps +- PyTorch `torch.mps` package: https://docs.pytorch.org/docs/stable/mps.html +- PyTorch MPS environment variables: https://docs.pytorch.org/docs/stable/mps_environment_variables.html + +Last cross-check date: 2026-03-21 diff --git a/content/cuda/docs/async-copy/DOC.md b/content/cuda/docs/async-copy/DOC.md new file mode 100644 index 00000000..354b8cfe --- /dev/null +++ b/content/cuda/docs/async-copy/DOC.md @@ -0,0 +1,117 @@ +--- +name: async-copy +description: "CUDA async copy essentials: cooperative_groups::memcpy_async, cuda::pipeline, wait rules, and the bridge to cp.async/TMA." +metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,async-copy,memcpy_async,cuda::pipeline,cuda::barrier,cp.async,tma,shared-memory" +--- + +# CUDA Async Copy (C++) + +Use this page for the CUDA C++ view of asynchronous copies from global memory to shared memory and the synchronization rules around them. + +## What Problem It Solves + +A conventional copy into shared memory: + +```cpp +shared[idx] = global[idx]; +``` + +typically expands into: + +1. load from global memory into a register +2. 
store from register into shared memory + +Async copy can avoid that register staging path on supported hardware and can overlap data movement with computation. + +## Main CUDA C++ Entry Points + +Two common interfaces appear in NVIDIA documentation: + +- `cooperative_groups::memcpy_async(...)` +- `cuda::memcpy_async(...)` together with `cuda::pipeline` or `cuda::barrier` + +At a high level, both start an async transfer and require an explicit wait before the data in shared memory is consumed. + +## Fundamental Safety Rule + +After initiating the async copy: + +- do not read the destination shared memory until the corresponding wait completes +- do not modify the source or destination participating region while the transfer is in flight + +Until the wait completes, reading or writing the participating data can create a data race. + +## Cooperative Groups Pattern + +```cpp +namespace cg = cooperative_groups; + +auto block = cg::this_thread_block(); +extern __shared__ float smem[]; + +cg::memcpy_async(block, smem, gmem_ptr, bytes); +cg::wait(block); +block.sync(); +``` + +Use `cg::wait(group)` before consuming the copied shared-memory data. + +## Pipeline Pattern + +For newer CUDA C++ paths, `cuda::pipeline` can express staged copy/compute overlap. + +The common structure is: + +1. acquire / start pipeline stage +2. issue `cuda::memcpy_async` +3. commit or advance the stage +4. wait for the prior stage +5. compute on the completed shared-memory tile + +This is the higher-level CUDA C++ bridge to lower-level async copy hardware behavior. + +## When Hardware Acceleration Matters + +NVIDIA documents that on compute capability 8.0 and higher, async copies from global to shared memory can benefit from hardware acceleration that avoids an intermediate register path. 
+ +That does not remove the need for: + +- alignment discipline +- correct wait behavior +- sensible shared-memory layout + +## When To Escalate To PTX / TMA + +Stay in CUDA C++ docs when: + +- you are using `memcpy_async` +- you need pipeline-level copy/compute overlap +- you want a supported C++ interface + +Drop to PTX / TMA docs when: + +- you need precise `cp.async` group semantics +- you need bulk async copies or TMA +- you need `mbarrier` or cluster-scope completion behavior + +## Related Topics + +- Shared memory usage: `../shared-memory/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Cooperative Groups: `../cooperative-groups/DOC.md` +- PTX `cp.async`: `../ptx/instructions/data-movement/references/cp-async.md` +- PTX TMA: `../ptx/instructions/tma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, Asynchronous Data Copies: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/04-special-topics/async-copies.html +- CUDA Programming Guide, Cooperative Groups async copy examples: https://docs.nvidia.com/cuda/archive/11.8.0/cuda-c-programming-guide/index.html +- CUDA Programming Guide, `memcpy_async` and `cuda::pipeline`: https://docs.nvidia.com/cuda/archive/11.6.2/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/atomics-and-reductions/DOC.md b/content/cuda/docs/atomics-and-reductions/DOC.md new file mode 100644 index 00000000..aa8b6b91 --- /dev/null +++ b/content/cuda/docs/atomics-and-reductions/DOC.md @@ -0,0 +1,94 @@ +--- +name: atomics-and-reductions +description: "CUDA atomics and reduction essentials: atomicAdd, shared/global scope, warp-first reduction, and common tradeoffs." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,atomics,reduction,atomicAdd,atomicCAS,shared-memory,warp-reduction" +--- + +# CUDA Atomics And Reductions (C++) + +Use this page when deciding between direct atomics, shared-memory reductions, and warp-first reduction patterns. + +## Atomic Basics + +An atomic operation performs a read-modify-write sequence as one atomic transaction on a word in global or shared memory. + +Common examples: + +- `atomicAdd` +- `atomicCAS` +- `atomicMax` +- `atomicMin` + +Atomics are correct tools for contention-sensitive updates, but they can serialize hot spots. + +## Scope Choice + +- shared-memory atomics are useful for contention within one block +- global-memory atomics are visible across blocks but usually cost more under heavy contention + +A common pattern is: + +1. reduce within a warp +2. reduce within a block using shared memory +3. emit one global atomic per block + +## Preferred Reduction Structure + +For many reductions, do not start with one atomic per thread. + +Better default: + +- first use warp shuffle reduction +- then combine warp results in shared memory +- then write one value per block or one atomic per block + +This reduces contention and memory traffic. + +## When Direct Atomics Are Fine + +Direct global atomics are often acceptable when: + +- the output has low contention +- the kernel is not dominated by the atomic path +- simplicity matters more than peak throughput + +Examples: + +- histogram with many bins and good distribution +- sparse accumulation with low collision probability + +## When Atomics Become A Problem + +Expect trouble when: + +- many threads update the same location +- the output space is very small +- the kernel becomes serialization-bound + +In those cases, switch to hierarchical reduction or privatization. 
+ +## Minimal Strategy Guide + +- one scalar result per block: block reduction in shared memory +- one scalar result for the whole grid: block reduction plus final stage +- many bins with moderate collisions: shared-memory privatization, then flush +- warp-local aggregation: use shuffle before touching shared or global memory + +## Related Topics + +- Shared memory staging: `../shared-memory/DOC.md` +- Warp-level collectives: `../warp-primitives/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, atomic functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, reduction and shared-memory patterns: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/benchmarking-methodology/DOC.md b/content/cuda/docs/benchmarking-methodology/DOC.md new file mode 100644 index 00000000..4a41274b --- /dev/null +++ b/content/cuda/docs/benchmarking-methodology/DOC.md @@ -0,0 +1,74 @@ +--- +name: benchmarking-methodology +description: "CUDA benchmarking methodology essentials: warmup, synchronization discipline, stable inputs, percentile reporting, and fair comparisons." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,benchmark,methodology,warmup,timing,percentile,variance,fair-comparison" +--- + +# CUDA Benchmarking Methodology (C++) + +Use this page when you need benchmark numbers that are comparable and reproducible. + +## Core Rules + +1. measure steady state, not cold start. +2. use correct synchronization for the scope being measured. +3. keep input shapes and distributions fixed across variants. +4. report variability, not just one best run. 
+ +## Warmup + +Always include warmup iterations before measurement to absorb: + +- JIT or first-use overheads +- cache/allocator/transient startup effects + +## Timing Discipline + +For kernel timing: + +- use event-based timing around the measured stream segment +- avoid mixing host wall-clock timing with unsynchronized device work + +For end-to-end latency: + +- include all relevant host/device stages intentionally +- document what is excluded + +## Comparison Hygiene + +- same hardware and driver/toolkit +- same input set and batch strategy +- same precision and algorithm settings +- same determinism flags where relevant + +Any mismatch here can invalidate claimed speedups. + +## Reporting + +Report at least: + +- median +- p90/p95 (or similar tail percentile) +- run-to-run variance + +Single minimum time is not sufficient for production-facing claims. + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- NVTX profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, measurement and optimization workflow context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Runtime API, event timing APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/build-and-abi-compatibility/DOC.md b/content/cuda/docs/build-and-abi-compatibility/DOC.md new file mode 100644 index 00000000..2c69e868 --- /dev/null +++ b/content/cuda/docs/build-and-abi-compatibility/DOC.md @@ -0,0 +1,72 @@ +--- +name: build-and-abi-compatibility +description: "CUDA build and ABI compatibility essentials: arch targets, PTX/SASS forward-compat strategy, runtime/driver constraints, and packaging hygiene." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,build,abi,compatibility,sm-arch,ptx,sass,nvcc,driver-runtime" +--- + +# CUDA Build And ABI Compatibility (C++) + +Use this page when shipping CUDA binaries across different GPU architectures and deployment environments. + +## Targeting Strategy + +Build artifacts can include: + +- SASS for specific SM architectures +- PTX for forward compatibility via JIT on newer compatible drivers + +A common practical strategy is to include both: + +- native SASS for known deployment GPUs +- PTX fallback for future-compatible targets + +## Why Compatibility Breaks + +Typical mismatch classes: + +- runtime-toolkit vs driver capability mismatch +- missing arch target in build flags +- ABI or dependency mismatch in host integration + +Treat compatibility as part of release engineering, not a last-minute fix. + +## NVCC Arch Hygiene + +Use explicit arch targets and document them in build config. + +- keep `-gencode` matrix aligned with actual fleet GPUs +- avoid shipping only one narrow arch unless environment is fixed + +## Runtime/Driver Considerations + +- new toolkits can require minimum driver versions +- deployment systems may lag driver updates + +Validate on representative driver/toolkit combinations before release. 
+ +## Package-Level Practices + +- pin toolkit version in CI +- record compile flags in build metadata +- verify cold-start JIT overhead if PTX fallback is expected +- add smoke tests per target GPU class + +## Related Topics + +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` +- Runtime API overview: `../runtime/DOC.md` +- PTX ISA overview: `../ptx/DOC.md` + +## Official Source Links (Fact Check) + +- NVCC Compiler Driver documentation: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html +- CUDA Compatibility documentation: https://docs.nvidia.com/deploy/cuda-compatibility/index.html +- CUDA Installation Guide (version/driver context): https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cache-behavior-and-access-policy/DOC.md b/content/cuda/docs/cache-behavior-and-access-policy/DOC.md new file mode 100644 index 00000000..24ca3df3 --- /dev/null +++ b/content/cuda/docs/cache-behavior-and-access-policy/DOC.md @@ -0,0 +1,73 @@ +--- +name: cache-behavior-and-access-policy +description: "CUDA cache-behavior essentials: locality patterns, read-only paths, L2 persistence windows, and access-policy tradeoffs." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cache,l2,access-policy,persistence-window,read-only-cache,locality,stream-attributes" +--- + +# CUDA Cache Behavior And Access Policy (C++) + +Use this page when kernels are bandwidth-limited and cache behavior is the next bottleneck. + +## First Principle + +No cache hint compensates for fundamentally poor locality. + +Always fix: + +- coalescing +- reuse distance +- working set shape + +before tuning cache policy knobs. + +## Read-Only And Locality-Aware Access + +Read-only paths and locality-aware layouts can reduce memory traffic pressure. 
+ +- group neighboring accesses by neighboring threads +- avoid random scatter in the hottest loops +- keep reused regions compact when possible + +## L2 Access Policy Window + +CUDA exposes stream-level access-policy controls for L2 persistence behavior. + +- set stream attributes for persistence windows +- use them only for demonstrably reused regions +- tune hit ratio assumptions carefully + +Overusing persistence windows can hurt other traffic and reduce global efficiency. + +## Practical Workflow + +1. identify hotspot kernels. +2. confirm memory-bound behavior with profiling. +3. improve layout/coalescing first. +4. test cache/access-policy changes incrementally. +5. keep only changes that improve end-to-end latency. + +## Common Pitfalls + +- setting cache policy globally without per-kernel evidence +- treating cache hints as deterministic guarantees +- ignoring multi-stream interference in shared cache resources + +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, L2 persistence/access-policy window APIs: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, memory-system optimization context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/coalescing/DOC.md b/content/cuda/docs/coalescing/DOC.md new file mode 100644 index 00000000..296a5271 --- /dev/null +++ b/content/cuda/docs/coalescing/DOC.md @@ -0,0 +1,132 @@ +--- +name: coalescing +description: "CUDA global-memory coalescing essentials: contiguous access, pitch, striding, and when shared memory helps." 
+metadata: + languages: "cpp" + versions: "13.0" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,coalescing,memory-coalescing,coalesced-access,uncoalesced-access,global-memory,memory-bandwidth,stride,pitch,shared-memory,transpose" +--- + +# CUDA Memory Coalescing (C++) + +Use this page for global-memory access-pattern rules that determine whether a kernel uses bandwidth efficiently. + +## What Coalescing Means + +Coalescing is the hardware combining a warp's global-memory accesses into as few memory transactions as possible. + +At a high level: + +- adjacent threads should usually access adjacent addresses +- strided or scattered access wastes bandwidth +- good coalescing matters most in memory-bound kernels + +## Best Default Pattern + +For a 1D array, prefer: + +```cpp +int i = blockIdx.x * blockDim.x + threadIdx.x; +value = input[i]; +``` + +This maps neighboring threads to neighboring elements. + +## Common Bad Pattern + +Patterns like this often destroy coalescing: + +```cpp +int i = blockIdx.x * blockDim.x + threadIdx.x; +value = input[i * stride]; +``` + +Large stride across a warp usually turns one efficient transaction pattern into many inefficient ones. + +## 2D Arrays and Pitch + +For 2D row-major arrays, accesses are most efficient when: + +- threads move along the row dimension together +- row width is aligned well for warp-based access + +If width is not naturally aligned for the hardware, use pitched allocation: + +- `cudaMallocPitch` +- `cudaMemcpy2D` + +This is the standard fix when row width is awkward and rows need padding. 
+ +## Shared Memory As A Reordering Tool + +Shared memory is often used together with coalescing: + +- load from global memory in a coalesced pattern +- reorder in shared memory +- consume in the algorithm's preferred order + +This is a common pattern for: + +- transpose +- tiled GEMM +- stencil halos +- gather/scatter restructuring + +## Coalescing vs Bank Conflicts + +These are different problems: + +- coalescing concerns global-memory transactions +- bank conflicts concern shared-memory accesses + +A kernel can have good coalescing and bad shared-memory banking, or the reverse. + +## Practical Heuristics + +- if a warp reads a row of contiguous elements, that is usually good +- if a warp reads a column from a row-major array directly, that is usually bad +- if a transpose-like pattern is needed, use shared memory to convert the access pattern +- align vectorized loads when using `float2` / `float4` + +## Minimal Tiling Pattern + +```cpp +__shared__ float tile[32][33]; + +int x = blockIdx.x * 32 + threadIdx.x; +int y = blockIdx.y * 32 + threadIdx.y; + +tile[threadIdx.y][threadIdx.x] = input[y * width + x]; +__syncthreads(); +``` + +This style is common because: + +- the global load can be coalesced +- the padded shared tile helps avoid bank conflicts during transposed access + +## When To Suspect Coalescing Problems + +- bandwidth is far below expectation +- profiling shows many global-memory transactions per requested byte +- a transpose or gather/scatter kernel is unexpectedly slow +- changing block shape changes performance dramatically + +## Related Topics + +- Shared memory usage: `../shared-memory/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Memory-space selection: `../memory-hierarchy/DOC.md` +- Runtime API overview: `../runtime/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, optimizing memory access: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Best 
Practices Guide, coalesced access to global memory: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#coalesced-access-to-global-memory +- CUDA C++ Best Practices Guide, shared memory and matrix multiplication examples: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#shared-memory +- CUDA C++ Programming Guide, 2D arrays and pitched allocation: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/collective-communication-patterns/DOC.md b/content/cuda/docs/collective-communication-patterns/DOC.md new file mode 100644 index 00000000..f7d55132 --- /dev/null +++ b/content/cuda/docs/collective-communication-patterns/DOC.md @@ -0,0 +1,66 @@ +--- +name: collective-communication-patterns +description: "CUDA collective communication essentials: reductions, scans, histogram-like updates, and hierarchical aggregation patterns." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,collective,reduction,scan,histogram,aggregation,warp-collective,block-collective" +--- + +# CUDA Collective Communication Patterns (C++) + +Use this page for patterns where many threads combine, distribute, or summarize values. + +## Common Collective Types + +- reduction (sum/max/min/etc.) +- scan/prefix sum +- histogram and bucketized accumulation +- vote/ballot-based filtering + +## Hierarchical Strategy + +A standard high-performance pattern is hierarchical: + +1. intra-warp collective (shuffle/vote) +2. intra-block collective (shared memory) +3. cross-block aggregation (global memory or multi-stage kernel) + +This minimizes global contention. 
+ +## Reduction Pattern + +- reduce in warp first with `__shfl*_sync` +- write one value per warp to shared memory +- final block reduction +- optionally one global write/atomic per block + +## Scan Pattern + +- use block-local scan primitives +- stitch block boundaries in a second phase when global prefix is required + +Avoid forcing a single global synchronization model in one monolithic kernel. + +## Histogram-Like Pattern + +- privatize bins per warp/block when feasible +- merge privately accumulated bins later + +Direct global atomics on a small bin set are usually the worst-case path. + +## Related Topics + +- Warp primitives: `../warp-primitives/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Synchronization: `../synchronization/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, warp intrinsics and synchronization primitives: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, reduction and memory optimization context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/compute-bound-kernel-optimization-playbook/DOC.md b/content/cuda/docs/compute-bound-kernel-optimization-playbook/DOC.md new file mode 100644 index 00000000..9a6ae433 --- /dev/null +++ b/content/cuda/docs/compute-bound-kernel-optimization-playbook/DOC.md @@ -0,0 +1,65 @@ +--- +name: compute-bound-kernel-optimization-playbook +description: "Compute-bound kernel optimization playbook: instruction mix, occupancy/ILP balance, register pressure control, and path selection." 
+metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,compute-bound,optimization,instruction-mix,occupancy,ilp,register-pressure,cuda-core,tensor-core" +--- + +# Compute-Bound Kernel Optimization Playbook (C++) + +Use this page after profiling indicates arithmetic throughput is the dominant limiter. + +## Primary Objectives + +- Improve useful instruction issue rate. +- Reduce dependency and scheduling stalls. +- Select the right arithmetic path (CUDA Core vs Tensor Core). + +## High-Impact Levers + +- Improve instruction mix in hot loops. +- Balance occupancy and ILP. +- Control register usage to avoid spill-driven regressions. +- Evaluate Tensor Core migration only when workload shape supports it. + +## Triage Sequence + +1. Confirm the kernel is truly compute-bound after memory cleanup. +2. Inspect stall reasons related to dependencies and issue efficiency. +3. Tune unroll depth and block geometry together. +4. Re-evaluate path selection (`cuda-core` vs `wmma`/Tensor Core). + +## Common Failure Modes + +- Aggressive unrolling increases spills and slows kernel. +- Occupancy chasing hurts per-warp progress. +- Tensor Core migration applied to non-matrix-like workloads. + +## Verification Checklist + +- Throughput metrics improve with stable correctness. +- Register spills do not increase unexpectedly. +- End-to-end runtime improves for production-representative shapes. 
+ +## Related Topics + +- Compute throughput: `../compute-throughput/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- Tensor Cores: `../tensor-cores/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, arithmetic throughput context: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/compute-throughput/DOC.md b/content/cuda/docs/compute-throughput/DOC.md new file mode 100644 index 00000000..1c4e9d3a --- /dev/null +++ b/content/cuda/docs/compute-throughput/DOC.md @@ -0,0 +1,105 @@ +--- +name: compute-throughput +description: "CUDA compute-throughput essentials: arithmetic throughput tables, latency hiding, and when Tensor Cores beat ordinary arithmetic paths." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,throughput,compute-bound,fp32,fp16,int32,cuda-core,tensor-core,latency-hiding" +--- + +# CUDA Compute Throughput (C++) + +Use this page to reason about whether a kernel is limited by ordinary arithmetic throughput, Tensor Core throughput, or memory behavior. + +## The First Split + +Ask this first: + +- is the kernel memory-bound? +- or is it compute-bound? + +If memory traffic dominates, moving from ordinary arithmetic to Tensor Cores may not help much until memory behavior is fixed. + +## Ordinary Arithmetic Path + +The CUDA Programming Guide publishes per-SM throughput tables for native arithmetic instructions. 
+ +These tables show that: + +- throughput depends strongly on architecture +- FP32, FP16, INT32, and FP64 do not have the same peak rates +- per-SM throughput must be multiplied by SM count for whole-device peak + +So a generic "CUDA Core throughput" number is not enough by itself. The relevant question is which instruction family the kernel actually uses. + +## Tensor Core Path + +Tensor Cores can provide much higher matrix-multiply-accumulate throughput than ordinary scalar arithmetic paths when: + +- the algorithm is matrix-multiply-like +- supported data types are acceptable +- tile shapes and layouts match the API and hardware requirements +- data staging overhead does not erase the gains + +This is why GEMM, attention, and convolution-like kernels are common Tensor Core candidates, while control-heavy kernels usually are not. + +## Throughput Is Not Just Peak Math + +A kernel can miss peak throughput because of: + +- dependency chains that the scheduler cannot hide +- low occupancy +- poor instruction mix +- register pressure +- memory stalls before arithmetic units are saturated + +So "Tensor Core capable" does not imply "Tensor Core efficient". 
+ +## Practical Decision Rule + +Stay on the ordinary arithmetic path when: + +- the operation is elementwise or irregular +- there is too much branching or indexing complexity +- supported Tensor Core types or layouts do not fit the problem + +Move toward Tensor Cores when: + +- the kernel is dominated by dense matrix multiply-accumulate +- the math can be tiled at warp granularity +- data movement can be organized cleanly + +## What To Check In Practice + +- achieved memory bandwidth +- achieved occupancy +- instruction mix +- whether warp-level matrix instructions are present +- whether the kernel is actually compute-bound after memory optimization + +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Tensor Core API usage: `../tensor-cores/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Shared memory staging: `../shared-memory/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, arithmetic instruction throughput tables: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, instruction-throughput interpretation: https://docs.nvidia.com/cuda/archive/11.7.0/cuda-c-programming-guide/index.html +- Turing Tuning Guide, SM execution resources and latency hiding discussion: 
https://docs.nvidia.com/cuda/archive/12.4.0/turing-tuning-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cooperative-groups/DOC.md b/content/cuda/docs/cooperative-groups/DOC.md new file mode 100644 index 00000000..1a775076 --- /dev/null +++ b/content/cuda/docs/cooperative-groups/DOC.md @@ -0,0 +1,104 @@ +--- +name: cooperative-groups +description: "CUDA Cooperative Groups essentials: thread_block, tiled_partition, coalesced_threads, cluster groups, and collective participation rules." +metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cooperative-groups,thread_block,tiled_partition,coalesced_threads,this_grid,this_cluster,group-sync" +--- + +# CUDA Cooperative Groups (C++) + +Use this page when kernels need explicit group objects rather than hard-coding assumptions about blocks and warps. + +## Why Cooperative Groups Exists + +Cooperative Groups makes the participating set of threads explicit. + +Instead of assuming "all threads in the block" or "one warp", code can pass a group object into a helper and make the collective scope explicit. + +This improves: + +- software composition +- readability +- portability across newer GPU behaviors + +## Common Group Handles + +Frequently used accessors include: + +- `this_thread_block()` +- `this_grid()` +- `coalesced_threads()` +- `this_cluster()` + +Common types and concepts include: + +- `thread_group` +- `thread_block` +- tiled partitions +- cluster groups + +## Basic Thread Block Example + +```cpp +namespace cg = cooperative_groups; + +cg::thread_block block = cg::this_thread_block(); +block.sync(); +``` + +`block.sync()` is the Cooperative Groups form of block-wide synchronization. 
+ +## Tiled Partition + +Use `tiled_partition()` to decompose a block into smaller groups: + +```cpp +auto block = cg::this_thread_block(); +auto tile32 = cg::tiled_partition(block, 32); +``` + +This is useful for warp-sized or sub-warp collectives without manually reasoning about lane groups everywhere in the code. + +## Participation Rule + +Collective operations require correct participation. + +- all threads in the group must participate in collective operations +- the group handle should be created consistently +- it is best to obtain implicit groups early, before divergence + +Violating participation assumptions leads to undefined behavior. + +## Practical Guidance + +- pass group handles by reference into helper functions +- prefer specialized groups instead of over-generic abstractions when performance matters +- create implicit handles early in the kernel + +## Where It Connects To Other Features + +Cooperative Groups is the user-facing bridge for several advanced CUDA features: + +- tiled warp/block decomposition +- async copy collectives like `memcpy_async` +- cluster groups with `this_cluster()` + +## Related Topics + +- Synchronization rules: `../synchronization/DOC.md` +- Warp primitives: `../warp-primitives/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Thread Block Clusters: `../thread-block-clusters/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, Cooperative Groups: https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/cooperative-groups.html +- CUDA Programming Guide, classic Cooperative Groups overview: https://docs.nvidia.com/cuda/archive/9.2/cuda-c-programming-guide/ +- CUDA Programming Guide, modern cluster and implicit-group accessors: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/01-introduction/programming-model.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cublas-cudnn-integration-patterns/DOC.md 
b/content/cuda/docs/cublas-cudnn-integration-patterns/DOC.md new file mode 100644 index 00000000..9c486942 --- /dev/null +++ b/content/cuda/docs/cublas-cudnn-integration-patterns/DOC.md @@ -0,0 +1,71 @@ +--- +name: cublas-cudnn-integration-patterns +description: "CUDA library integration essentials: cuBLAS/cuDNN handle lifecycle, stream binding, workspace policy, and mixed custom-kernel pipelines." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cublas,cudnn,integration,handle,stream-binding,workspace,mixed-pipeline" +--- + +# cuBLAS/cuDNN Integration Patterns (C++) + +Use this page when combining custom CUDA kernels with cuBLAS or cuDNN calls. + +## Handle Lifecycle + +Library handles should usually be: + +- created once per host thread/context +- reused across iterations +- destroyed at controlled shutdown + +Frequent create/destroy in hot paths adds overhead. + +## Stream Binding Rule + +Bind library handles to the correct stream before issuing calls. + +- cuBLAS/cuDNN work should run in the intended stream +- stream mismatch causes accidental serialization or race-like ordering bugs + +## Workspace Strategy + +Many cuDNN and some cuBLAS paths use temporary workspace. + +- allocate and reuse workspace buffers where possible +- avoid repeated malloc/free during steady-state loops +- keep workspace sizing policy consistent with algorithm selection + +## Mixed Pipelines + +Common pattern: + +1. pre/post-processing in custom kernels +2. dense math in cuBLAS/cuDNN +3. follow-up custom kernels + +Use events/stream ordering rather than global synchronization between stages. + +## Determinism And Performance + +Algorithm choices can trade determinism and speed. 
+ +- production training/inference pipelines should explicitly document determinism expectations +- benchmark with the exact settings that production will use + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- cuBLAS documentation: https://docs.nvidia.com/cuda/cublas/index.html +- cuDNN documentation: https://docs.nvidia.com/deeplearning/cudnn/latest/ +- CUDA Runtime API (stream interoperability): https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cuda-core-optimization-checklist/DOC.md b/content/cuda/docs/cuda-core-optimization-checklist/DOC.md new file mode 100644 index 00000000..71d9ed4f --- /dev/null +++ b/content/cuda/docs/cuda-core-optimization-checklist/DOC.md @@ -0,0 +1,73 @@ +--- +name: cuda-core-optimization-checklist +description: "CUDA Core optimization checklist: coalescing, divergence control, occupancy/ILP balancing, and measurement-first tuning." +metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-core,optimization,checklist,coalescing,divergence,occupancy,ilp,register-pressure,latency-hiding" +--- + +# CUDA Core Optimization Checklist (C++) + +Use this page when a kernel is intentionally on the ordinary arithmetic path and needs systematic optimization. + +## Step 1: Confirm The Bottleneck Class + +Before changing code, classify the kernel: + +- memory-bound +- compute-bound +- launch/orchestration-bound + +Use profiling first. Do not optimize blind. + +## Step 2: Memory Access Quality + +- Ensure global-memory accesses are coalesced. +- Reduce redundant loads with reuse (register/shared memory where appropriate). +- Avoid severe shared-memory bank conflicts in staging buffers. 
+ +## Step 3: Control Flow Quality + +- Reduce divergence in hot warps. +- Make branch conditions uniform where possible. +- Move rare-path logic off hot loops when feasible. + +## Step 4: Occupancy And ILP Balance + +- Avoid maximizing occupancy as a standalone goal. +- Tune block size, unroll depth, and register footprint together. +- Improve ILP when scoreboard/dependency stalls dominate. + +## Step 5: Validate Every Optimization + +- Reprofile after each major change. +- Track throughput, stall mix, occupancy, and memory metrics together. +- Keep correctness checks and numerical checks in the loop. + +## Common Anti-Patterns + +- Chasing one metric (for example occupancy) while total throughput worsens. +- Heavy unrolling that increases register spills. +- Introducing shared memory without fixing access pattern quality. + +## Related Topics + +- CUDA Core path overview: `../cuda-core/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Occupancy: `../occupancy/DOC.md` +- Coalescing: `../coalescing/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- Bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/cuda-core-vs-tensor-core-path-selection/DOC.md b/content/cuda/docs/cuda-core-vs-tensor-core-path-selection/DOC.md new file mode 100644 index 00000000..9bf072b5 --- /dev/null +++ b/content/cuda/docs/cuda-core-vs-tensor-core-path-selection/DOC.md @@ -0,0 +1,92 @@ +--- +name: cuda-core-vs-tensor-core-path-selection +description: "Path selection guide: deciding between CUDA Core and Tensor Core execution using workload 
shape, dtype, layout, and numerical constraints." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-core,tensor-core,path-selection,wmma,wgmma,dtype,layout,precision,fallback" +--- + +# CUDA Core vs Tensor Core Path Selection (C++) + +Use this page when deciding whether to implement or keep a kernel on ordinary arithmetic pipelines or move it to Tensor Core matrix instructions. + +## Fast Decision Matrix + +Choose CUDA Core path first when: + +- operation is elementwise, reduction-heavy, sparse, or control-heavy +- matrix structure is weak or tile reuse is poor +- required dtype/layout does not map cleanly to Tensor Core-supported combinations + +Choose Tensor Core path first when: + +- workload is dominated by dense matrix-multiply-accumulate +- shape and layout can be tiled consistently at warp or warpgroup granularity +- allowed dtype/accumulation policy matches supported Tensor Core paths + +## Data-Type And Numerics Gate + +Before migration, verify: + +- multiplicand and accumulator types are supported by the target path +- error budget tolerates the chosen precision policy +- baseline parity tests pass with realistic input distributions + +If these checks fail, forcing Tensor Core instructions can create unstable numerics or hidden fallback behavior. + +## Layout And Staging Gate + +Tensor Core speedups depend on movement cost. + +Require: + +- consistent layout contracts (`row_major`/`col_major`, leading dimensions) +- efficient shared-memory staging plan +- synchronization protocol that does not serialize hot loops + +If memory behavior remains dominant after staging optimization, keep CUDA Core path and optimize arithmetic/memory overlap there. + +## Performance Validation Protocol + +1. Build a correctness baseline. +2. Profile CUDA Core implementation to identify real bottlenecks. +3. Implement Tensor Core path candidate. +4. 
Compare throughput, memory pressure, occupancy, and stall behavior. +5. Keep the faster path under expected production shapes, not just synthetic peak cases. + +## Fallback Strategy + +Production kernels should keep explicit fallback behavior: + +- capability checks for architecture/toolchain support +- shape or dtype guards for unsupported combinations +- deterministic fallback to CUDA Core implementation + +This avoids silent behavior drift across deployment environments. + +## Practical Rule Of Thumb + +- Default to CUDA Core path for generality and low complexity. +- Move to Tensor Core path for matrix-dense hotspots after profiling confirms arithmetic throughput is the limiting factor. +- Keep both paths when workload diversity is high. + +## Related Topics + +- CUDA Core path: `../cuda-core/DOC.md` +- Tensor Core overview: `../tensor-cores/DOC.md` +- WMMA practical patterns: `../wmma-kernel-patterns/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Fallback/capability detection: `../fallback-strategies-and-capability-detection/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide (execution model, WMMA, memory model): https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide (memory and throughput guidance): https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/cuda-core/DOC.md b/content/cuda/docs/cuda-core/DOC.md new file mode 100644 index 00000000..e63a79b8 --- /dev/null +++ b/content/cuda/docs/cuda-core/DOC.md @@ -0,0 +1,91 @@ +--- +name: cuda-core +description: "CUDA Core path essentials: SIMT arithmetic pipelines, warp scheduling, ILP/occupancy tradeoffs, and practical optimization workflow." 
+metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-core,simt,fp32,int32,warp,scheduler,ilp,occupancy,latency-hiding" +--- + +# CUDA Core Path (C++) + +Use this page for kernels that run on ordinary SM arithmetic pipelines (the path developers usually call "CUDA Core path"), not Tensor Core matrix instructions. + +## What This Means In Practice + +For CUDA C++ kernels, "CUDA Core path" usually means: + +- ordinary scalar or vector arithmetic instructions (FP32, INT32, FP64, and related ops) +- SIMT warp execution on standard SM arithmetic pipelines +- no explicit warp-matrix API (`wmma`) and no PTX warpgroup matrix instructions (`wgmma`) + +There is no separate CUDA C++ API named "CUDA Core". The distinction is a performance and execution-model distinction. + +## Typical Workloads + +Kernels that usually remain on this path: + +- elementwise transforms +- reductions and scans with limited matrix structure +- indexing-heavy or branch-heavy kernels +- irregular sparse kernels + +Even in ML workloads, many preprocessing, activation, normalization, and indexing phases are CUDA Core dominated. + +## Optimization Checklist + +1. Make global memory access coalesced. +2. Remove avoidable divergence in hot warps. +3. Balance occupancy and register pressure instead of maximizing occupancy blindly. +4. Increase instruction-level parallelism where dependency chains are long. +5. Validate cache and shared-memory behavior before deep unrolling. + +## Occupancy vs ILP Tradeoff + +Two common failure modes: + +- **High occupancy, low per-warp progress:** too little ILP, frequent dependency stalls. +- **High ILP, low occupancy:** register usage or shared-memory usage blocks enough resident warps. + +Tune block size, unroll factors, and register usage together. Treat occupancy as a means to hide latency, not as the final objective. 
+ +## How To Verify You Are On This Path + +In profiler output, check whether runtime is dominated by ordinary arithmetic instruction activity and not matrix instruction activity. Also check: + +- warp stall reasons (dependency, memory throttling, execution dependency) +- achieved occupancy +- memory throughput utilization +- instruction mix consistency with kernel intent + +If your intended Tensor Core kernel shows only ordinary arithmetic activity, the path selection is wrong. + +## When To Escalate To Tensor Cores + +Move to Tensor Cores when all are true: + +- workload is dominated by dense matrix-multiply-accumulate +- data types and layouts match supported matrix instruction paths +- staging and synchronization overhead can be controlled +- numerical policy is acceptable (for example FP16/BF16/TF32 with chosen accumulation) + +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Occupancy: `../occupancy/DOC.md` +- Warp primitives: `../warp-primitives/DOC.md` +- Tensor Cores: `../tensor-cores/DOC.md` +- Path selection guide: `../cuda-core-vs-tensor-core-path-selection/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, SIMT and warp execution: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Programming Guide, arithmetic instruction throughput interpretation: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- Turing Tuning Guide, latency hiding and scheduler behavior: https://docs.nvidia.com/cuda/archive/12.4.0/turing-tuning-guide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/cuda-graphs/DOC.md b/content/cuda/docs/cuda-graphs/DOC.md new file mode 100644 index 00000000..027478a1 --- /dev/null +++ b/content/cuda/docs/cuda-graphs/DOC.md @@ -0,0 +1,104 @@ +--- +name: cuda-graphs +description: "CUDA Graphs essentials: definition, instantiation, execution, stream capture, 
cross-stream event capture, and update rules." +metadata: + languages: "cpp" + versions: "12.6" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-graphs,graph,stream-capture,cudaStreamBeginCapture,cudaGraphLaunch,cudaGraphInstantiate" +--- + +# CUDA Graphs (C++) + +Use this page when the same workflow launches repeatedly and CPU launch overhead from streams becomes significant. + +## Why Graphs Exist + +CUDA Graphs separate work submission into: + +1. definition +2. instantiation +3. execution + +This amortizes setup work and can reduce CPU launch overhead compared with issuing many short kernels one by one into streams. + +## Two Creation Paths + +Graphs can be created by: + +- explicit graph APIs +- stream capture + +Stream capture is often the easiest migration path for existing stream-based code. + +## Stream Capture + +Typical pattern: + +```cpp +cudaGraph_t graph; + +cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); +kernelA<<<grid, block, 0, stream>>>(...); +kernelB<<<grid, block, 0, stream>>>(...); +cudaStreamEndCapture(stream, &graph); +``` + +During capture, work is appended to a graph instead of being immediately enqueued for execution. + +## Event-Based Cross-Stream Capture + +CUDA documents that stream capture can preserve cross-stream dependencies expressed with: + +- `cudaEventRecord()` +- `cudaStreamWaitEvent()` + +provided the waited-on event belongs to the same capture graph. + +## Execution Lifecycle + +After a graph is defined: + +- instantiate it into an executable graph +- launch the executable graph into a stream +- reuse it many times if the workflow is stable + +Graphs help most when the structure is repeated often enough to amortize instantiation. 
+ +## Common Capture Hazards + +- using unsupported APIs during capture +- mixing captured and non-captured dependencies incorrectly +- synchronizing captured streams or captured events in invalid ways +- relying on legacy default stream behavior during capture + +When a capture is invalidated, the graph becomes unusable and capture must be ended. + +## When Graphs Help + +Graphs are especially useful when: + +- kernels are short and launch overhead is material +- the workflow topology repeats +- stream orchestration logic is otherwise host-heavy + +They are less useful when: + +- the workload shape changes every iteration +- the overhead is dominated by kernel execution, not launch + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Runtime API overview: `../runtime/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, CUDA Graphs overview: https://docs.nvidia.com/cuda/archive/12.6.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, stream capture and cross-stream events: https://docs.nvidia.com/cuda/archive/11.7.0/cuda-c-programming-guide/index.html +- CUDA Programming Guide, earlier graph API examples: https://docs.nvidia.com/cuda/archive/12.2.0/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/data-layout-and-alignment/DOC.md b/content/cuda/docs/data-layout-and-alignment/DOC.md new file mode 100644 index 00000000..05281964 --- /dev/null +++ b/content/cuda/docs/data-layout-and-alignment/DOC.md @@ -0,0 +1,80 @@ +--- +name: data-layout-and-alignment +description: "CUDA data-layout and alignment essentials: struct packing, vectorized loads/stores, pitch/stride choices, and alignment-driven performance." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,data-layout,alignment,vectorized-load,vectorized-store,pitch,stride,coalescing" +--- + +# CUDA Data Layout And Alignment (C++) + +Use this page when kernel performance depends on memory layout details. + +## Why Layout Matters + +On CUDA GPUs, layout affects: + +- coalescing behavior +- transaction count +- shared-memory bank behavior +- feasibility of vectorized loads/stores + +Poor layout can dominate runtime even when arithmetic is optimized. + +## Alignment Basics + +Prefer natural alignment for data types and vectorized access. + +- align pointers and base addresses to vector width +- keep struct fields ordered to reduce padding surprises +- avoid accidental misalignment from custom allocators or byte offsets + +## AoS vs SoA + +For many throughput-oriented kernels: + +- SoA (structure of arrays) is often better for coalesced parallel access +- AoS (array of structs) can be easier semantically but may scatter accessed fields + +Choose based on the access pattern of active threads, not only code convenience. + +## Vectorized Access + +Vectorized loads/stores (`float2`, `float4`, etc.) are useful when: + +- data is aligned to the vector width +- adjacent threads follow contiguous access +- vectorization does not introduce awkward tail handling overhead + +Always verify achieved bandwidth after vectorization; assumptions are often wrong. 
+ +## 2D Layouts + +For 2D tensors/arrays: + +- row-major contiguous row access is usually easiest to coalesce +- use pitched allocation when row width alignment is problematic +- treat logical shape and physical stride as separate concepts in APIs + +## Common Pitfalls + +- hidden misalignment from packed/byte-offset structs +- mixing row-major assumptions with column-oriented access +- forcing vectorized access on unaligned data + +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Memory hierarchy: `../memory-hierarchy/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory access patterns and alignment context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, memory model and type/layout background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/driver/DOC.md b/content/cuda/docs/driver/DOC.md new file mode 100644 index 00000000..ee388665 --- /dev/null +++ b/content/cuda/docs/driver/DOC.md @@ -0,0 +1,73 @@ +--- +name: driver +description: "CUDA Driver API essentials: explicit context management, module loading, and kernel launch." +metadata: + languages: "cpp" + versions: "12.4" + revision: 1 + updated-on: "2026-03-18" + source: community + tags: "cuda,gpu,kernel,driver,api,ptx" +--- + +# CUDA Driver API (C++) + +Use the Driver API when you need explicit control over contexts, modules, and dynamic kernel loading. It is lower-level than the Runtime API. + +## Basic Flow + +1. Initialize the driver and pick a device +2. Create a context +3. Load a module (PTX or cubin) +4. Get the kernel function +5. Allocate memory and launch +6. 
Cleanup + +```cpp +#include <cuda.h> +#include <cstdio> + +int main() { + CUdevice dev; + CUcontext ctx; + cuInit(0); + cuDeviceGet(&dev, 0); + cuCtxCreate(&ctx, 0, dev); + + CUmodule module; + CUfunction func; + cuModuleLoad(&module, "kernel.ptx"); + cuModuleGetFunction(&func, module, "my_kernel"); + + CUdeviceptr d_out; + cuMemAlloc(&d_out, 1024); + + void* args[] = { &d_out }; + cuLaunchKernel(func, + 1, 1, 1, + 256, 1, 1, + 0, 0, args, 0); + + cuMemFree(d_out); + cuModuleUnload(module); + cuCtxDestroy(ctx); + return 0; +} +``` + +## Core Driver APIs + +- Context: `cuInit`, `cuDeviceGet`, `cuCtxCreate`, `cuCtxDestroy` +- Module: `cuModuleLoad`, `cuModuleLoadData`, `cuModuleGetFunction` +- Memory: `cuMemAlloc`, `cuMemFree`, `cuMemcpyHtoD`, `cuMemcpyDtoH` +- Launch: `cuLaunchKernel` + +## Common Pitfalls + +- Forgetting to create a context before module operations +- Using mismatched kernel names between PTX and host code +- Not checking return codes (Driver API returns `CUresult`) + +## Related Topics + +- Module loading details: `references/module-loading.md` diff --git a/content/cuda/docs/driver/references/module-loading.md b/content/cuda/docs/driver/references/module-loading.md new file mode 100644 index 00000000..c7634b1a --- /dev/null +++ b/content/cuda/docs/driver/references/module-loading.md @@ -0,0 +1,19 @@ +# CUDA Driver Module Loading + +You can load modules from: + +- PTX text (JIT compiled): `cuModuleLoadData` or `cuModuleLoadDataEx` +- Cubin file (precompiled): `cuModuleLoad` + +Common patterns: + +```cpp +CUmodule module = nullptr; +CUresult r = cuModuleLoad(&module, "kernel.cubin"); +// or +r = cuModuleLoadData(&module, ptx_string); +``` + +Notes: +- `cuModuleLoadDataEx` lets you pass JIT options for diagnostics or optimization. +- Always unload modules with `cuModuleUnload` when done. 
diff --git a/content/cuda/docs/dynamic-parallelism/DOC.md b/content/cuda/docs/dynamic-parallelism/DOC.md new file mode 100644 index 00000000..068410f6 --- /dev/null +++ b/content/cuda/docs/dynamic-parallelism/DOC.md @@ -0,0 +1,65 @@ +--- +name: dynamic-parallelism +description: "CUDA Dynamic Parallelism essentials: device-side kernel launch semantics, synchronization behavior, and memory-space constraints." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,dynamic-parallelism,cdp,device-side-launch,child-kernel,cudaDeviceSynchronize,memory-coherence" +--- + +# CUDA Dynamic Parallelism (C++) + +Use this page when kernels launch other kernels on the device. + +## What It Is + +Dynamic Parallelism (CDP) lets device code launch child grids. + +- parent and child execute on the device +- launch configuration is provided from device code +- useful for irregular recursion-like or adaptive decomposition patterns + +## Core Semantics + +- child launch is asynchronous with respect to the launching thread by default +- synchronization choices in parent code determine when child results are consumed +- launch overhead is non-trivial; avoid using CDP for tiny kernels in hot loops + +## Memory-Space Coherence + +Key memory-space rule from CUDA docs: + +- parent and child share global/constant memory +- local and shared memory are private to their respective thread/block contexts + +Do not assume parent shared memory is visible to child kernels. + +## Typical Use Cases + +- adaptive refinement +- irregular tree/graph traversal +- work generation discovered during device execution + +For regular dense workloads, host-side launch or CUDA Graphs is usually better. 
+ +## Common Pitfalls + +- launching too many tiny child kernels +- misunderstanding parent/child visibility boundaries +- relying on implicit ordering that is not guaranteed + +## Related Topics + +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, Dynamic Parallelism: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, memory coherence in CDP: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/error-handling-and-debug-build/DOC.md b/content/cuda/docs/error-handling-and-debug-build/DOC.md new file mode 100644 index 00000000..e3453b5a --- /dev/null +++ b/content/cuda/docs/error-handling-and-debug-build/DOC.md @@ -0,0 +1,75 @@ +--- +name: error-handling-and-debug-build +description: "CUDA error-handling and debug-build essentials: launch checks, sync checks, debug flags, and diagnosis workflow." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,error-handling,cudaGetLastError,cudaPeekAtLastError,cudaDeviceSynchronize,debug-build,nvcc,-G,lineinfo" +--- + +# CUDA Error Handling And Debug Build (C++) + +Use this page for practical correctness diagnostics in CUDA applications. + +## Two-Step Error Check Pattern + +Always separate: + +1. launch configuration/API errors +2. runtime execution errors + +Typical pattern: + +```cpp +kernel<<>>(...); +cudaError_t e1 = cudaGetLastError(); // launch/config error +cudaError_t e2 = cudaDeviceSynchronize(); // execution error +``` + +Use stream-specific synchronization when possible instead of global device sync. 
+ +## Why This Matters + +- some errors are detected at launch +- others appear only when kernel execution actually runs + +Checking only one side can hide failures. + +## Debug Build Basics + +For debugging kernels, common compile choices include: + +- device debug info (`-G`) for heavy debug sessions +- line info (`-lineinfo`) for profiling-friendly symbol mapping + +Debug builds can change optimization and performance; do not compare debug and release timings directly. + +## Runtime Diagnostics + +- use descriptive error strings with `cudaGetErrorString` +- include kernel name / input shape in logs +- fail fast in development paths to avoid cascading corruption + +## Practical Workflow + +1. reproduce with smallest failing input. +2. enable strict launch+sync checks. +3. switch to debug-oriented build flags if needed. +4. profile or inspect only after correctness is stable. + +## Related Topics + +- Runtime overview: `../runtime/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- NVTX workflow: `../nvtx-and-profiling-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Runtime API, error handling APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html +- CUDA C++ Best Practices Guide, correctness and debugging guidance: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- NVCC documentation (debug flags): https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/execution-model/DOC.md b/content/cuda/docs/execution-model/DOC.md new file mode 100644 index 00000000..bbe48a0c --- /dev/null +++ b/content/cuda/docs/execution-model/DOC.md @@ -0,0 +1,93 @@ +--- +name: execution-model +description: "CUDA execution model essentials: warps, SM scheduling, divergence, and how ordinary arithmetic paths differ from Tensor Core paths." 
+metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,execution-model,simt,warp,sm,scheduler,divergence,cuda-core,tensor-core" +--- + +# CUDA Execution Model (C++) + +Use this page to understand how CUDA threads are grouped and scheduled, and how ordinary arithmetic execution differs from Tensor Core execution. + +## SIMT Basics + +CUDA executes threads in groups of 32 called warps. + +- a warp is the main scheduling unit inside an SM +- threads in a warp conceptually execute the same kernel code in SIMT style +- divergence inside a warp reduces efficiency because different branch paths are executed separately + +This is why block sizes are usually chosen as multiples of 32. + +## SM-Level Scheduling + +An SM manages many resident warps and switches among them to hide latency. + +- when one warp stalls on memory or dependencies, the SM can issue instructions from another ready warp +- latency hiding depends on both occupancy and instruction-level parallelism +- exact scheduler and execution-unit details vary by architecture + +## What Developers Mean By "CUDA Core" + +NVIDIA documentation usually talks about instruction throughput, FP32/INT32/FP64 units, and SM execution resources rather than a CUDA C++ API called "CUDA Core". + +In practice, developers use "CUDA Core path" to mean: + +- ordinary arithmetic instructions such as FP32 / INT32 math +- standard SIMT execution on the SM's general arithmetic pipelines +- kernels that do not explicitly target Tensor Core matrix instructions + +This is an interpretation of the hardware execution model, not a separate CUDA C++ programming interface. + +## Tensor Core Path + +Tensor Cores are specialized matrix-multiply-accumulate units. 
+ +- they are exposed in CUDA C++ through warp-level matrix APIs such as `nvcuda::wmma` +- they are exposed in PTX through matrix instructions such as `wgmma` +- they are most relevant when the computation naturally maps to small matrix tiles and supported types/layouts + +If a kernel is written using ordinary scalar or vector arithmetic, it is usually on the ordinary SM arithmetic path rather than the Tensor Core path. + +## Divergence And Utilization + +Ordinary arithmetic kernels often lose efficiency because of: + +- warp divergence +- uncoalesced memory access +- bank conflicts +- low occupancy or long dependency chains + +Tensor Core kernels add extra constraints: + +- warp-wide participation +- shape / layout / alignment restrictions +- staging and synchronization overhead around fragments or async pipelines + +## Rule Of Thumb + +- generic elementwise, reduction, indexing-heavy, and control-heavy kernels usually live on the ordinary arithmetic path +- dense matrix-multiply-like kernels are the main candidates for Tensor Core acceleration + +## Related Topics + +- CUDA Core path: `../cuda-core/DOC.md` +- Compute throughput model: `../compute-throughput/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Tensor Core API usage: `../tensor-cores/DOC.md` +- WMMA kernel patterns: `../wmma-kernel-patterns/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, programming model and warps: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/01-introduction/programming-model.html +- CUDA Programming Guide, SIMT execution model: https://docs.nvidia.com/cuda/cuda-programming-guide/03-advanced/advanced-kernel-programming.html +- Turing Tuning Guide, SM scheduling and execution resources: https://docs.nvidia.com/cuda/archive/12.4.0/turing-tuning-guide/index.html + +Last cross-check 
date: 2026-03-20 diff --git a/content/cuda/docs/fallback-strategies-and-capability-detection/DOC.md b/content/cuda/docs/fallback-strategies-and-capability-detection/DOC.md new file mode 100644 index 00000000..fa0fe900 --- /dev/null +++ b/content/cuda/docs/fallback-strategies-and-capability-detection/DOC.md @@ -0,0 +1,63 @@ +--- +name: fallback-strategies-and-capability-detection +description: "CUDA capability detection and fallback essentials: feature probes, architecture guards, and safe runtime degradation paths." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,capability-detection,fallback,feature-probe,sm-version,graceful-degradation,runtime-guards" +--- + +# Fallback Strategies And Capability Detection (C++) + +Use this page when kernels depend on architecture-specific features (Tensor Cores, clusters, async paths, etc.). + +## Capability Detection + +Query device properties at runtime and gate features explicitly. + +Typical inputs: + +- compute capability (SM version) +- shared-memory limits +- cooperative/cluster support +- peer access/topology capabilities + +Do not infer support from GPU name strings. + +## Fallback Hierarchy + +Define ordered execution paths: + +1. preferred fast path (feature-rich) +2. compatible optimized fallback +3. conservative correctness fallback + +All paths should be tested; fallback code is production code. 
+ +## Guardrail Principles + +- fail fast for unsupported required features +- degrade gracefully for optional accelerations +- log selected path for observability and debugging + +## Common Mistakes + +- fallback exists but is untested +- path selection logic diverges from documented requirements +- silent fallback causes unnoticed performance regressions + +## Related Topics + +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Multi-GPU and peer access: `../multi-gpu-and-peer-access/DOC.md` +- Production readiness checklist: `../production-readiness-checklist/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Runtime API, device property query interfaces: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html +- CUDA C++ Programming Guide, architecture/capability-dependent feature context: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/fused-kernel-design-patterns/DOC.md b/content/cuda/docs/fused-kernel-design-patterns/DOC.md new file mode 100644 index 00000000..ec01fb56 --- /dev/null +++ b/content/cuda/docs/fused-kernel-design-patterns/DOC.md @@ -0,0 +1,75 @@ +--- +name: fused-kernel-design-patterns +description: "CUDA fused-kernel design essentials: when fusion helps, when it hurts, and practical patterns for memory-traffic reduction." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,fusion,fused-kernel,memory-traffic,register-pressure,launch-overhead,epilogue-fusion" +--- + +# CUDA Fused-Kernel Design Patterns (C++) + +Use this page when deciding whether to combine multiple operations into one kernel. 
+ +## Why Fusion Helps + +Fusion can improve performance by: + +- reducing global-memory round trips +- reducing kernel-launch overhead +- keeping intermediate values in registers/shared memory + +## Why Fusion Can Hurt + +Over-fusion can degrade performance due to: + +- register pressure and spills +- lower occupancy +- larger instruction footprint +- harder scheduling and poorer maintainability + +Fusion is beneficial only when memory/launch savings outweigh these costs. + +## Common Fusion Patterns + +- elementwise chain fusion (A->B->C) +- reduction + lightweight post-processing +- GEMM epilogue fusion (bias/add/activation) +- load-transform-store pipelines with shared-memory staging + +## Practical Decision Rule + +Fuse when: + +- intermediate tensors are large +- extra kernel boundaries dominate runtime +- the fused kernel remains resource-balanced + +Do not fuse when: + +- each op is already compute-heavy and well-optimized +- fusion introduces high register pressure or complex control divergence + +## Validation Workflow + +1. benchmark unfused baseline. +2. fuse one boundary at a time. +3. profile register usage, spills, occupancy, and bandwidth. +4. keep fusion only where end-to-end latency improves. 
+ +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Occupancy: `../occupancy/DOC.md` +- Launch bounds and registers: `../launch-bounds-and-registers/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory and launch optimization context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, execution and memory behavior background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/incident-response-and-rollback-playbook/DOC.md b/content/cuda/docs/incident-response-and-rollback-playbook/DOC.md new file mode 100644 index 00000000..adcfc788 --- /dev/null +++ b/content/cuda/docs/incident-response-and-rollback-playbook/DOC.md @@ -0,0 +1,68 @@ +--- +name: incident-response-and-rollback-playbook +description: "CUDA incident-response essentials: triage, rollback criteria, mitigation levers, and post-incident hardening steps." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,incident,response,rollback,mitigation,triage,oncall,postmortem" +--- + +# Incident Response And Rollback Playbook (C++) + +Use this page when a CUDA optimization regresses latency, correctness, or stability in production. + +## Fast Triage Checklist + +1. identify blast radius (which models/tasks/hardware). +2. classify failure mode (correctness, latency, crash, OOM, timeout). +3. isolate recent kernel/config/toolchain changes. +4. determine safe rollback target. + +## Rollback Criteria + +Rollback immediately when: + +- correctness deviations exceed policy +- crash rate or timeout rate breaches SLO +- latency regression is severe and sustained + +Do not wait for perfect root-cause certainty before restoring service. 
+ +## Mitigation Levers + +- disable risky fast paths via feature flags +- switch to known-safe kernel variant +- reduce batch size or concurrency temporarily +- force conservative precision/mode where necessary + +## Evidence Collection + +- capture failing inputs and minimal repro shapes +- record selected kernel path/capability info +- collect timeline + kernel profiles for before/after comparison + +## Post-Incident Hardening + +- add regression tests for the triggering pattern +- add rollout guardrails (canary, staged enablement) +- improve observability for path-selection and error counters +- document lessons and owner actions + +## Related Topics + +- Production readiness checklist: `../production-readiness-checklist/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- NVTX profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Fallback strategies: `../fallback-strategies-and-capability-detection/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide (verification + optimization workflow context): https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- Nsight Systems / Nsight Compute docs for triage instrumentation: + - https://docs.nvidia.com/nsight-systems/UserGuide/index.html + - https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/input-shape-specialization-and-autotuning/DOC.md b/content/cuda/docs/input-shape-specialization-and-autotuning/DOC.md new file mode 100644 index 00000000..575e5ce2 --- /dev/null +++ b/content/cuda/docs/input-shape-specialization-and-autotuning/DOC.md @@ -0,0 +1,60 @@ +--- +name: input-shape-specialization-and-autotuning +description: "CUDA shape specialization and autotuning essentials: variant spaces, compile/runtime dispatch, and robust tuning workflows." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,autotuning,shape-specialization,dispatch,variant-selection,tile-size,benchmarking" +--- + +# Input Shape Specialization And Autotuning (C++) + +Use this page when one kernel configuration cannot serve all input shapes efficiently. + +## Why Specialization Is Needed + +Kernel performance often depends on: + +- shape geometry +- stride/layout +- precision mode +- architecture/resource limits + +A single static launch/config choice is usually suboptimal across broad workloads. + +## Specialization Strategies + +- compile-time variants for known shape classes +- runtime dispatch by shape buckets +- autotuned parameter sets (tile sizes, block sizes, staging depth) + +Keep variant count bounded to control maintenance overhead. + +## Autotuning Workflow + +1. define search space (block/tile/stage variants). +2. benchmark representative shape corpus. +3. store winning config per shape bucket and hardware class. +4. validate correctness and stability of selected variants. 
+ +## Robustness Rules + +- never tune on one micro-benchmark only +- include tail shapes and borderline sizes +- preserve safe fallback when no tuned profile matches + +## Related Topics + +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Fused kernel patterns: `../fused-kernel-design-patterns/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, empirical optimization guidance: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, launch/resource model background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/kernel-api-design-guidelines/DOC.md b/content/cuda/docs/kernel-api-design-guidelines/DOC.md new file mode 100644 index 00000000..02e795d1 --- /dev/null +++ b/content/cuda/docs/kernel-api-design-guidelines/DOC.md @@ -0,0 +1,67 @@ +--- +name: kernel-api-design-guidelines +description: "CUDA kernel API design essentials: parameter contracts, shape/stride conventions, launch invariants, and forward-compatible interface choices." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,api-design,shape,stride,contracts,launch-invariants,interface,maintainability" +--- + +# CUDA Kernel API Design Guidelines (C++) + +Use this page when defining or refactoring kernel-facing interfaces for long-term maintainability. + +## Interface Contracts First + +Document and enforce: + +- tensor shape expectations +- stride/layout assumptions +- alignment requirements +- supported dtype/precision combinations + +Unstated assumptions become production bugs. + +## Parameter Design + +Prefer explicit parameters over hidden globals: + +- dimensions (`n`, `h`, `w`, etc.) 
+- leading dimensions/strides +- flags that affect algorithmic paths + +Keep argument ordering stable and predictable across related kernels. + +## Launch Invariants + +Define launch invariants close to API: + +- valid block size range +- shared-memory requirements +- grid coverage model + +Validate invariants early in host code where possible. + +## Versioning Mindset + +If a kernel API is reused across modules: + +- avoid breaking parameter semantics silently +- add new fields/options in backward-compatible ways +- keep deprecation path explicit + +## Related Topics + +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, kernel launch and execution model background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, software design and optimization workflow context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/kernel-bottleneck-diagnosis-workflow/DOC.md b/content/cuda/docs/kernel-bottleneck-diagnosis-workflow/DOC.md new file mode 100644 index 00000000..9417333f --- /dev/null +++ b/content/cuda/docs/kernel-bottleneck-diagnosis-workflow/DOC.md @@ -0,0 +1,83 @@ +--- +name: kernel-bottleneck-diagnosis-workflow +description: "Kernel bottleneck diagnosis workflow: classify memory-bound vs compute-bound vs launch-bound, then choose targeted optimization paths." 
+metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,bottleneck,diagnosis,workflow,memory-bound,compute-bound,launch-bound,profiling,nsight" +--- + +# Kernel Bottleneck Diagnosis Workflow (C++) + +Use this page when you need a repeatable way to decide which optimization direction is actually relevant. + +## Classification First + +Classify each hot kernel into one of three primary classes: + +- memory-bound +- compute-bound +- launch/orchestration-bound + +Do this with profiling evidence, not intuition. + +## Evidence Signals + +Memory-bound indicators: + +- high memory-pipeline utilization with low arithmetic utilization +- strong sensitivity to coalescing/layout changes + +Compute-bound indicators: + +- arithmetic pipeline pressure dominates +- throughput improves mainly with instruction-mix or scheduling improvements + +Launch-bound indicators: + +- many short kernels +- significant CPU/launch overhead and weak overlap + +## Optimization Routing + +If memory-bound: + +- prioritize coalescing, reuse, layout, and staging fixes. + +If compute-bound: + +- optimize instruction mix, occupancy/ILP balance, and path selection (CUDA Core vs Tensor Core). + +If launch-bound: + +- reduce launch count, fuse kernels where valid, and evaluate CUDA Graphs. + +## Guardrails + +- Reclassify after each major optimization; bottleneck class can change. +- Keep correctness and numerical checks active during performance iteration. +- Record profiler snapshots per step to avoid regression ambiguity. 
+ +## Related Topics + +- Performance debugging: `../performance-debugging/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Memory-bound optimization playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound optimization playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound optimization playbook: `../launch-bound-optimization-playbook/DOC.md` +- Nsight metrics interpretation cheatsheet: `../nsight-metrics-interpretation-cheatsheet/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/launch-bound-optimization-playbook/DOC.md b/content/cuda/docs/launch-bound-optimization-playbook/DOC.md new file mode 100644 index 00000000..fbc3a7ee --- /dev/null +++ b/content/cuda/docs/launch-bound-optimization-playbook/DOC.md @@ -0,0 +1,64 @@ +--- +name: launch-bound-optimization-playbook +description: "Launch-bound optimization playbook: reducing launch overhead, improving overlap, and deciding when to use fusion or CUDA Graphs." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,launch-bound,optimization,launch-overhead,cuda-graphs,fusion,stream-overlap,orchestration" +--- + +# Launch-Bound Optimization Playbook (C++) + +Use this page when many short kernels or orchestration overhead dominate runtime. + +## Primary Objectives + +- Reduce launch overhead. +- Increase useful overlap between copy and compute. +- Simplify scheduling structure for repeated execution patterns. + +## High-Impact Levers + +- Reduce kernel launch count where semantically safe. +- Apply kernel fusion when it improves end-to-end cost. +- Evaluate CUDA Graphs for repetitive execution DAGs. +- Improve stream/event structure to avoid accidental serialization. + +## Triage Sequence + +1. Confirm launch/orchestration bottleneck in timeline profiling. +2. Identify high-frequency short kernels and synchronization hotspots. +3. Test fusion and graph capture candidates. +4. Reprofile overlap and CPU-side launch cost. + +## Common Failure Modes + +- Fusion increases register pressure and hurts throughput. +- Graph capture applied to highly dynamic control flow without clear gain. +- Stream dependencies unintentionally serialize work. + +## Verification Checklist + +- CPU launch overhead decreases. +- Timeline overlap improves. +- Overall runtime drops on production traces, not just micro-tests. 
+ +## Related Topics + +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- NVTX and profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Graphs programming guidance: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/launch-bounds-and-registers/DOC.md b/content/cuda/docs/launch-bounds-and-registers/DOC.md new file mode 100644 index 00000000..cfa16467 --- /dev/null +++ b/content/cuda/docs/launch-bounds-and-registers/DOC.md @@ -0,0 +1,69 @@ +--- +name: launch-bounds-and-registers +description: "CUDA launch bounds and register-pressure essentials: __launch_bounds__, occupancy tradeoffs, and spill-aware tuning." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,launch-bounds,__launch_bounds__,register-pressure,spills,occupancy,maxrregcount" +--- + +# CUDA Launch Bounds And Registers (C++) + +Use this page when kernel performance depends on register pressure and block residency. + +## What `__launch_bounds__` Does + +`__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)` gives the compiler launch-time assumptions. + +- `maxThreadsPerBlock` constrains the intended block size +- optional `minBlocksPerMultiprocessor` asks the compiler to keep enough resources for a target block residency + +This can change register allocation decisions and instruction scheduling. + +## Why It Matters + +Register pressure directly affects occupancy. 
+ +- too many registers per thread can reduce active blocks/warps +- too few registers can cause spills to local memory + +So tuning is a balance: occupancy gain versus spill cost. + +## Practical Tuning Pattern + +1. Start from correctness and baseline performance. +2. Inspect occupancy and local-memory traffic in Nsight Compute. +3. Try `__launch_bounds__` with realistic block sizes. +4. Re-measure runtime, spills, and achieved occupancy. +5. Keep the setting only if end-to-end time improves. + +## `-maxrregcount` Caution + +Compiler flag `-maxrregcount` can cap registers globally, but it is blunt. + +- it may improve occupancy +- it can also increase spills and hurt performance + +Prefer targeted kernel-level tuning (`__launch_bounds__`) before applying global caps. + +## Common Mistakes + +- optimizing for occupancy percentage alone +- forcing low register count without checking spill metrics +- setting launch bounds that do not match actual launch configuration + +## Related Topics + +- Occupancy tuning: `../occupancy/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, launch bounds: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, occupancy and execution model discussion: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/memory-bound-kernel-optimization-playbook/DOC.md b/content/cuda/docs/memory-bound-kernel-optimization-playbook/DOC.md new file mode 100644 index 00000000..944dd57e --- /dev/null +++ b/content/cuda/docs/memory-bound-kernel-optimization-playbook/DOC.md @@ -0,0 +1,64 @@ +--- +name: memory-bound-kernel-optimization-playbook +description: "Memory-bound kernel optimization playbook: coalescing, cache locality, shared-memory staging, and 
bandwidth-focused validation." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,memory-bound,optimization,coalescing,cache,shared-memory,bandwidth,staging,latency" +--- + +# Memory-Bound Kernel Optimization Playbook (C++) + +Use this page after profiling confirms the kernel is limited by memory movement instead of arithmetic throughput. + +## Primary Objectives + +- Increase effective bandwidth. +- Reduce wasted traffic. +- Improve locality and access regularity. + +## High-Impact Levers + +- Coalesced global-memory access. +- Reuse through registers/shared memory. +- Shared-memory layouts that avoid severe bank conflicts. +- Data-layout changes that reduce strided/scattered loads. + +## Triage Sequence + +1. Validate coalescing quality for major tensors. +2. Check L1/L2 reuse opportunity and cache-policy behavior. +3. Add or improve shared-memory staging for high-reuse tiles. +4. Recheck occupancy/register pressure after staging changes. + +## Common Failure Modes + +- Correct staging logic but poor layout (bank conflicts dominate). +- More shared memory with no reuse gain (occupancy drops, throughput worsens). +- Overly complex index math adds latency and defeats memory gains. + +## Verification Checklist + +- Achieved bandwidth increases in profiler metrics. +- Memory-related warp stalls decrease in hot sections. +- Total runtime improves on representative production shapes. 
+ +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Cache behavior: `../cache-behavior-and-access-policy/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory optimizations: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ +- CUDA C++ Programming Guide, memory hierarchy and access behavior: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/memory-fences-and-ordering/DOC.md b/content/cuda/docs/memory-fences-and-ordering/DOC.md new file mode 100644 index 00000000..7897149d --- /dev/null +++ b/content/cuda/docs/memory-fences-and-ordering/DOC.md @@ -0,0 +1,86 @@ +--- +name: memory-fences-and-ordering +description: "CUDA memory-ordering essentials: weak ordering, __threadfence* scopes, visibility vs ordering, and fence-based handoff patterns." +metadata: + languages: "cpp" + versions: "12.6" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,memory-ordering,memory-fence,__threadfence,__threadfence_block,__threadfence_system,visibility,volatile" +--- + +# CUDA Memory Fences And Ordering (C++) + +Use this page when kernels communicate through memory and correctness depends on ordering rather than just synchronization. + +## Weak Ordering + +CUDA uses a weakly ordered memory model. 
+ +- two unsynchronized threads reading and writing the same location create a data race +- memory fences enforce ordering of a thread's memory operations +- fences do not automatically provide block-wide participation like `__syncthreads()` + +## Fence Scope Variants + +CUDA provides three common fence scopes: + +- `__threadfence_block()` +- `__threadfence()` +- `__threadfence_system()` + +Roughly: + +- block scope: ordering relevant to the calling block +- device scope: ordering relevant across the device +- system scope: ordering visible to host threads and peer devices as well + +## Ordering vs Visibility + +This distinction matters: + +- fences order memory operations by the calling thread +- barriers coordinate participating threads +- visibility to observers may still require the right memory access path and synchronization pattern + +In other words, a fence is not a replacement for `__syncthreads()`. + +## Typical Pattern + +Producer-consumer handoff across blocks often looks like: + +1. producer writes data +2. producer executes `__threadfence()` +3. producer updates a flag or counter atomically +4. consumer observes the flag and then reads the data + +Without the fence, the flag can become visible before the data it is meant to publish. + +## Choosing The Scope + +- same block only: usually `__threadfence_block()` or a block barrier pattern +- different blocks on the same device: typically `__threadfence()` +- host or peer-device observers: `__threadfence_system()` + +Choose the narrowest scope that matches the communication pattern. 
+ +## Common Mistakes + +- assuming atomics alone solve all ordering problems +- using `__threadfence()` when a block-local barrier is the real need +- forgetting that fences do not synchronize other threads +- using device-wide or system-wide fences more broadly than necessary + +## Related Topics + +- Synchronization rules: `../synchronization/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, memory fence functions: https://docs.nvidia.com/cuda/archive/12.6.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, historical fence examples and ordering discussion: https://docs.nvidia.com/cuda/archive/11.5.0/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/memory-hierarchy/DOC.md b/content/cuda/docs/memory-hierarchy/DOC.md new file mode 100644 index 00000000..d10803be --- /dev/null +++ b/content/cuda/docs/memory-hierarchy/DOC.md @@ -0,0 +1,115 @@ +--- +name: memory-hierarchy +description: "CUDA memory hierarchy essentials: registers, local, shared, global, constant, and texture/read-only paths." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,memory-hierarchy,registers,local-memory,shared-memory,global-memory,constant-memory,texture-memory" +--- + +# CUDA Memory Hierarchy (C++) + +Use this page to decide which CUDA memory space fits a kernel access pattern. 
+ +## The Main Spaces + +- registers: fastest per-thread storage, but limited +- local memory: per-thread memory in device memory, commonly used for spills or large automatic objects +- shared memory: on-chip storage shared by threads in a block +- global memory: large device memory visible across kernels and blocks +- constant memory: cached read-only storage, especially effective when many threads read the same location +- texture/read-only path: cached read-only access path that can help some spatial access patterns + +## Registers + +Registers are the first-choice storage for hot per-thread values. + +- lowest-latency storage for thread-private temporaries +- high register pressure can reduce occupancy +- if the compiler runs out of registers, values may spill to local memory + +## Local Memory + +Despite the name, local memory is not on-chip shared scratchpad memory. + +- it is private to one thread +- it resides in device memory +- it often appears when large automatic arrays are used or when register pressure causes spills + +If a kernel unexpectedly slows down, local-memory traffic is often a sign that register use is too high. + +## Shared Memory + +Shared memory is the standard block-level scratchpad. + +- shared by threads in one block +- useful for data reuse, tiling, transpose, and reduction +- requires explicit synchronization when threads communicate through it +- performance depends on avoiding bank conflicts + +See `../shared-memory/DOC.md` for the detailed usage rules. + +## Global Memory + +Global memory is the default large device memory space. + +- visible to all threads and across kernel launches +- highest capacity among device spaces +- much slower than on-chip storage +- performance depends heavily on coalesced access patterns + +See `../coalescing/DOC.md` for access-pattern guidance. + +## Constant Memory + +Constant memory is read-only from device code and is cached. 
+ +- best when many threads read the same address +- not a substitute for shared memory +- useful for broadcast-like parameters or small read-only tables + +## Texture / Read-Only Path + +Texture and read-only cached access paths can help when: + +- access is read-only +- locality is irregular or spatial +- the pattern is not ideal for standard coalesced global loads + +Do not default to texture memory for ordinary linear arrays; it is a pattern-specific tool. + +## Selection Heuristics + +- value reused only by one thread: registers first +- value reused by many threads in one block: shared memory +- large tensor or array visible across blocks: global memory +- small read-only broadcast table: constant memory +- read-only data with irregular spatial locality: texture/read-only path + +## Practical Warnings + +- local memory is usually a warning sign, not a target optimization space +- shared memory helps only when reuse or reordering outweighs its setup and sync cost +- high occupancy alone does not guarantee fast memory behavior +- coalescing and bank conflicts often matter more than raw memory-space choice + +## Related Topics + +- Shared memory details: `../shared-memory/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Coalesced access patterns: `../coalescing/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- PTX state spaces: `../ptx/references/state-spaces-and-types.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, programming model and memory overview: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, device memory space specifiers: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#device-memory-space-specifiers +- CUDA C++ Programming Guide, local memory discussion: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- 
CUDA C++ Programming Guide, shared memory: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#shared + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/multi-gpu-and-peer-access/DOC.md b/content/cuda/docs/multi-gpu-and-peer-access/DOC.md new file mode 100644 index 00000000..3cd3a878 --- /dev/null +++ b/content/cuda/docs/multi-gpu-and-peer-access/DOC.md @@ -0,0 +1,74 @@ +--- +name: multi-gpu-and-peer-access +description: "CUDA multi-GPU essentials: device selection, peer access (P2P), topology constraints, and cross-device synchronization basics." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,multi-gpu,peer-access,p2p,cudaDeviceEnablePeerAccess,cudaMemcpyPeerAsync,topology,nvlink" +--- + +# CUDA Multi-GPU And Peer Access (C++) + +Use this page for process-level multi-GPU programming and direct device-to-device data movement. + +## Device Selection Basics + +Multi-GPU programs typically: + +1. query device count and capabilities +2. assign work partitions per device +3. set active device with `cudaSetDevice` +4. create per-device streams/resources + +Avoid frequent device switching in tight host loops unless necessary. + +## Peer Access (P2P) + +Peer access allows one GPU to access memory on another GPU directly when topology and capability permit it. + +Core APIs: + +- `cudaDeviceCanAccessPeer` +- `cudaDeviceEnablePeerAccess` +- `cudaMemcpyPeerAsync` + +Always check capability before enabling peer access. + +## Why P2P Matters + +When supported, P2P can reduce host staging overhead for inter-GPU exchange. + +Performance depends on topology: + +- NVLink-connected peers often outperform PCIe-only paths +- some GPU pairs may not support peer access at all + +## Synchronization Notes + +Cross-device workflows still need explicit ordering and synchronization. 
+ +- use stream/event patterns per device +- avoid global sync unless required +- ensure destination-side readiness before kernel consumption + +## Common Mistakes + +- assuming all GPU pairs support P2P +- forgetting to set the correct active device before API calls +- building one global stream strategy across devices without per-device ownership + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, multi-device and peer access: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA Runtime API, peer-device memory access APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/nsight-metrics-interpretation-cheatsheet/DOC.md b/content/cuda/docs/nsight-metrics-interpretation-cheatsheet/DOC.md new file mode 100644 index 00000000..a562ba37 --- /dev/null +++ b/content/cuda/docs/nsight-metrics-interpretation-cheatsheet/DOC.md @@ -0,0 +1,53 @@ +--- +name: nsight-metrics-interpretation-cheatsheet +description: "Nsight metrics interpretation cheatsheet: practical mapping from common metric patterns to likely bottleneck classes and next actions." +metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,nsight,metrics,profiling,interpretation,warp-stalls,occupancy,bandwidth,bottleneck" +--- + +# Nsight Metrics Interpretation Cheatsheet (C++) + +Use this page for fast mapping from profiler symptoms to likely root causes and next steps. + +## Symptom To Action Map + +- High memory pressure + low arithmetic utilization: + likely memory-bound, prioritize coalescing/layout/reuse. +- Low issue efficiency + dependency-heavy stalls: + likely compute-bound scheduling/dependency bottleneck. 
+- Many short kernels + high CPU orchestration share: + likely launch-bound, evaluate fusion/graphs/overlap changes. + +## Warp Stall Reading Rules + +- Treat stall reasons as supporting evidence, not standalone truth. +- Interpret stall categories together with achieved throughput and occupancy. +- Re-check after each optimization stage because dominant stalls can shift. + +## Minimal Workflow + +1. Timeline classify (Nsight Systems). +2. Kernel-level metrics drilldown (Nsight Compute). +3. Route to memory/compute/launch playbook. +4. Reprofile and confirm bottleneck shift. + +## Related Topics + +- Performance debugging: `../performance-debugging/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Memory-bound playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound playbook: `../launch-bound-optimization-playbook/DOC.md` + +## Official Source Links (Fact Check) + +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/numerics-and-precision/DOC.md b/content/cuda/docs/numerics-and-precision/DOC.md new file mode 100644 index 00000000..671db38f --- /dev/null +++ b/content/cuda/docs/numerics-and-precision/DOC.md @@ -0,0 +1,74 @@ +--- +name: numerics-and-precision +description: "CUDA numerics and precision essentials: FP16/BF16/TF32 behavior, accumulation choices, and stability-aware kernel design." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,numerics,precision,fp16,bf16,tf32,accumulation,rounding,tensor-cores" +--- + +# CUDA Numerics And Precision (C++) + +Use this page when correctness and performance depend on precision mode choices. + +## Precision Choices Matter + +CUDA kernels often trade off: + +- throughput +- memory footprint +- numeric stability + +Common formats include FP32, FP16, BF16, and TF32 (Tensor Core-oriented math mode). + +## Storage Type vs Accumulation Type + +A robust pattern is mixed precision: + +- store inputs in lower precision (for bandwidth / throughput) +- accumulate in higher precision (for stability) + +Example direction: + +- FP16/BF16 inputs with FP32 accumulation for reductions and GEMM-like operations. + +## Tensor Core Precision Modes + +Tensor Core paths can use type-specific behavior (for example TF32/FP16/BF16 combinations depending on architecture and library mode). + +When enabling Tensor Core math modes: + +- verify expected numeric tolerance +- compare against a high-precision baseline +- record configuration to keep benchmark results reproducible + +## Common Instability Patterns + +- long reductions in low precision +- subtractive cancellation with similar-magnitude values +- iterative algorithms without periodic re-normalization + +## Practical Guardrails + +1. define accuracy targets first (absolute/relative tolerance). +2. choose accumulation precision before micro-optimizing. +3. test on representative dynamic ranges, not only random unit-scale inputs. +4. keep a reference path (often FP32 accumulation) for regression checks. 
+ +## Related Topics + +- Tensor Core usage: `../tensor-cores/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, floating-point and mixed precision behavior: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, WMMA/Tensor Core precision context: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/nvtx-and-profiling-workflow/DOC.md b/content/cuda/docs/nvtx-and-profiling-workflow/DOC.md new file mode 100644 index 00000000..ab838296 --- /dev/null +++ b/content/cuda/docs/nvtx-and-profiling-workflow/DOC.md @@ -0,0 +1,58 @@ +--- +name: nvtx-and-profiling-workflow +description: "CUDA NVTX and profiling workflow essentials: annotation strategy, Nsight Systems correlation, and handoff to Nsight Compute." +metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,nvtx,profiling,nsight-systems,nsight-compute,annotation,timeline" +--- + +# NVTX And Profiling Workflow (C++) + +Use this page for a repeatable profiling workflow across host code and CUDA kernels. + +## Why NVTX First + +NVTX markers make timeline analysis actionable. + +- they label logical phases in host code +- Nsight Systems can correlate those ranges with stream activity and kernel launches +- this reduces guesswork before deep kernel-level profiling + +## Recommended Workflow + +1. add NVTX ranges around pipeline phases. +2. run Nsight Systems to identify timeline bottlenecks. +3. select top kernels from the timeline. +4. run Nsight Compute for per-kernel microanalysis. 
+ +This avoids premature micro-optimization of non-critical kernels. + +## Annotation Guidelines + +- annotate coarse phases first (data load, preprocess, compute, postprocess) +- add finer ranges only where needed +- keep naming stable across runs for easy diffing + +## Common Mistakes + +- profiling kernels without timeline context +- over-annotating every tiny function +- changing workload shape between profiling runs + +## Related Topics + +- Performance debugging: `../performance-debugging/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- NVTX documentation: https://nvidia.github.io/NVTX/ +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/occupancy/DOC.md b/content/cuda/docs/occupancy/DOC.md new file mode 100644 index 00000000..c5f1b841 --- /dev/null +++ b/content/cuda/docs/occupancy/DOC.md @@ -0,0 +1,103 @@ +--- +name: occupancy +description: "CUDA occupancy essentials: active warps, launch configuration APIs, and the tradeoff with registers and shared memory." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,occupancy,launch-configuration,block-size,register-pressure,shared-memory,cudaOccupancyMaxPotentialBlockSize" +--- + +# CUDA Occupancy (C++) + +Use this page when tuning block size, shared memory size, or register usage and you need to reason about how many warps and blocks can stay active on an SM. + +## What Occupancy Means + +Occupancy is the ratio of active warps on an SM to the maximum supported warps on that SM. 
+ +In practice, occupancy is constrained by: + +- threads per block +- registers used per thread +- shared memory used per block +- architectural limits on blocks and warps per SM + +## Important Caveat + +Higher occupancy is not automatically better. + +- low occupancy can hurt latency hiding +- very high occupancy can be unnecessary if the kernel is already bandwidth-limited or instruction-efficient +- reducing registers just to raise occupancy can backfire if it causes spills to local memory + +Treat occupancy as a constraint and diagnostic, not a standalone optimization target. + +## Runtime APIs + +CUDA provides helper APIs for launch configuration: + +- `cudaOccupancyMaxActiveBlocksPerMultiprocessor` +- `cudaOccupancyMaxPotentialBlockSize` +- `cudaOccupancyMaxPotentialBlockSizeVariableSMem` + +Use them to estimate a reasonable starting block size based on register and shared-memory usage. + +Minimal pattern: + +```cpp +int minGridSize = 0; +int blockSize = 0; +cudaOccupancyMaxPotentialBlockSize( + &minGridSize, + &blockSize, + my_kernel, + 0, + 0); +``` + +This gives a good starting point, not a final answer. + +## What Usually Lowers Occupancy + +- large dynamic shared memory allocations +- high register pressure +- overly large block sizes +- cluster or architecture-specific launch constraints on newer GPUs + +## Practical Tuning Rules + +- start in the 128 to 256 threads-per-block range unless you have a strong reason otherwise +- prefer a multiple of warp size +- if a kernel frequently calls `__syncthreads()`, several smaller blocks can outperform one very large block +- if reducing block size barely changes runtime, the kernel may not be occupancy-limited + +## Common Misread + +If performance is poor, ask these in order: + +1. Is memory access coalesced? +2. Are there bank conflicts? +3. Is there divergence? +4. Is occupancy actually the limiting factor? + +Very often, memory behavior matters more than squeezing out a few more active warps. 
+ +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- Compute throughput model: `../compute-throughput/DOC.md` +- Shared memory constraints: `../shared-memory/DOC.md` +- Memory hierarchy overview: `../memory-hierarchy/DOC.md` +- Synchronization behavior: `../synchronization/DOC.md` +- Coalesced global memory access: `../coalescing/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, occupancy calculator APIs: https://docs.nvidia.com/cuda/archive/11.8.0/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, thread/block sizing guidance: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Driver API occupancy reference: https://docs.nvidia.com/cuda/archive/11.4.4/cuda-driver-api/group__CUDA__OCCUPANCY.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/performance-debugging/DOC.md b/content/cuda/docs/performance-debugging/DOC.md new file mode 100644 index 00000000..7c1d1624 --- /dev/null +++ b/content/cuda/docs/performance-debugging/DOC.md @@ -0,0 +1,100 @@ +--- +name: performance-debugging +description: "CUDA performance debugging essentials: when to use Nsight Systems vs Nsight Compute, key metrics, and how to read warp stalls." +metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,performance-debugging,nsight-compute,nsight-systems,warp-stalls,occupancy,bandwidth,profiling" +--- + +# CUDA Performance Debugging (C++) + +Use this page when a kernel is correct but slow and you need to decide what to profile first. 
+ +## First Tool Choice + +Use the tools for different questions: + +- Nsight Systems: timeline, host/device orchestration, overlap, streams, events, graph behavior +- Nsight Compute: per-kernel metrics, throughput, occupancy, warp stalls, memory behavior + +If you do not yet know whether the problem is on the host side or inside the kernel, start with Nsight Systems. + +## Nsight Systems + +Use Nsight Systems when you need to answer: + +- are streams actually overlapping? +- are copies blocking kernels? +- is the CPU launch path the bottleneck? +- are events or graphs introducing serialization? + +NVTX ranges are useful here for relating CPU regions to CUDA activity. + +## Nsight Compute + +Use Nsight Compute when you need to answer: + +- is the kernel memory-bound or compute-bound? +- is occupancy too low? +- are schedulers issuing efficiently? +- what are the top warp stall reasons? + +Useful report sections include: + +- SpeedOfLight +- Occupancy +- SchedulerStats +- WarpStateStats + +## Reading Stall Reasons Carefully + +NVIDIA's profiling guide explicitly warns not to over-focus on stalls unless schedulers are failing to issue well. + +Examples: + +- high short-scoreboard stalls often point to shared-memory operations or similar MIO dependencies +- high barrier-related stalls often mean uneven work before synchronization +- high not-selected can simply indicate there are enough eligible warps + +So stall interpretation should follow, not replace, a top-level throughput diagnosis. + +## Practical Triage Order + +1. check total runtime structure with Nsight Systems +2. identify the expensive kernel(s) +3. inspect throughput, occupancy, and warp states in Nsight Compute +4. 
map the dominant issue back to code: + coalescing, bank conflicts, divergence, occupancy, or launch overhead + +## Related Topics + +- Occupancy tuning: `../occupancy/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Nsight metrics interpretation cheatsheet: `../nsight-metrics-interpretation-cheatsheet/DOC.md` +- Memory-bound optimization playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound optimization playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound optimization playbook: `../launch-bound-optimization-playbook/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- NVTX workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Cache behavior and access policy: `../cache-behavior-and-access-policy/DOC.md` + +## Official Source Links (Fact Check) + +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html +- Older Nsight Compute profiling guide with stall explanations: https://docs.nvidia.com/nsight-compute/2022.4/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/persistent-kernels-and-work-queues/DOC.md b/content/cuda/docs/persistent-kernels-and-work-queues/DOC.md new file mode 
100644 index 00000000..1fbd90db --- /dev/null +++ b/content/cuda/docs/persistent-kernels-and-work-queues/DOC.md @@ -0,0 +1,62 @@ +--- +name: persistent-kernels-and-work-queues +description: "CUDA persistent-kernel essentials: resident worker model, device work queues, load balancing, and synchronization hazards." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,persistent-kernel,work-queue,load-balancing,atomics,producer-consumer,latency" +--- + +# CUDA Persistent Kernels And Work Queues (C++) + +Use this page for latency-sensitive or irregular workloads where one long-lived kernel processes dynamic work. + +## Persistent Kernel Model + +A persistent kernel keeps a fixed set of resident blocks/warps alive and repeatedly pulls tasks from a queue. + +This can reduce launch overhead and improve responsiveness for fine-grained dynamic work. + +## Typical Components + +- global/device work queue +- atomic enqueue/dequeue indices +- worker loop with termination protocol +- backoff or batching strategy for queue contention + +## Where It Helps + +- irregular task sizes +- real-time/low-latency pipelines +- workloads where kernel launch overhead is a large fraction of runtime + +## Where It Hurts + +- queue contention hotspots +- heavy atomic traffic +- poor fairness or starvation in naive dequeue policies +- over-occupying resources and blocking other kernels + +## Design Guardrails + +1. define clear producer/consumer ordering rules. +2. minimize global atomics per task (batch when possible). +3. bound queue contention with per-block or per-warp staging. +4. profile fairness and tail latency, not only average throughput. 
+ +## Related Topics + +- Sparse and irregular kernels: `../sparse-and-irregular-kernels/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` +- Streams/events and graphs: `../streams-and-events/DOC.md`, `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, synchronization/order primitives used by queue-based designs: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, launch overhead and memory/atomic considerations: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/pinned-memory-and-transfers/DOC.md b/content/cuda/docs/pinned-memory-and-transfers/DOC.md new file mode 100644 index 00000000..b08a1793 --- /dev/null +++ b/content/cuda/docs/pinned-memory-and-transfers/DOC.md @@ -0,0 +1,66 @@ +--- +name: pinned-memory-and-transfers +description: "CUDA pinned-memory and transfer essentials: page-locked host memory, async memcpy overlap, and transfer-path tuning." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,pinned-memory,page-locked,cudaHostAlloc,cudaMemcpyAsync,transfer-overlap,pcie" +--- + +# CUDA Pinned Memory And Transfers (C++) + +Use this page when host-device transfer performance or overlap is a bottleneck. + +## What Pinned Memory Is + +Pinned (page-locked) host memory is allocated with APIs such as: + +- `cudaHostAlloc` +- `cudaMallocHost` + +Because it is page-locked, the runtime can perform faster and more predictable DMA transfers. + +## Why It Matters + +`cudaMemcpyAsync` overlap with kernel execution generally requires: + +- non-default stream usage +- pinned host buffers for transfer endpoints + +Without pinned memory, many async-copy scenarios degrade to serialized behavior. 
+ +## Basic Pattern + +1. allocate pinned host buffers +2. launch `cudaMemcpyAsync(..., stream)` +3. launch kernels in suitable streams +4. synchronize with stream/event primitives, not global device sync + +## Tradeoffs + +- pinned memory improves transfer behavior +- but excessive pinning can hurt overall system memory behavior on the host + +Pin only hot buffers and reuse them. + +## Common Mistakes + +- assuming `cudaMemcpyAsync` always overlaps without checking buffer type +- mixing default-stream semantics and expecting full concurrency +- over-allocating pinned memory globally + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, host-device transfer optimization: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Runtime API, host-memory management and async memcpy: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/production-readiness-checklist/DOC.md b/content/cuda/docs/production-readiness-checklist/DOC.md new file mode 100644 index 00000000..f155304f --- /dev/null +++ b/content/cuda/docs/production-readiness-checklist/DOC.md @@ -0,0 +1,72 @@ +--- +name: production-readiness-checklist +description: "CUDA production-readiness checklist: correctness gates, performance stability, observability, compatibility, and rollout safeguards." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,production,readiness,checklist,observability,compatibility,rollback,release-gates" +--- + +# CUDA Production Readiness Checklist (C++) + +Use this page before shipping optimized CUDA kernels to production environments. 
+ +## 1) Correctness Gates + +- reference-baseline comparison on representative datasets +- tolerance policy per precision mode +- stress tests for boundary sizes and adversarial shapes +- deterministic/reproducibility expectations documented + +## 2) Performance Gates + +- benchmark methodology fixed and repeatable +- p50/p95 latency and throughput baselines recorded +- regression thresholds defined and enforced in CI/perf jobs +- cold-start versus steady-state behavior measured + +## 3) Observability + +- NVTX ranges present for major pipeline phases +- key metrics exported (latency, error rates, fallback rate) +- profiler workflows documented for oncall debugging + +## 4) Compatibility + +- target `-gencode` matrix matches deployment fleet +- driver/toolkit compatibility validated +- fallback path behavior tested when preferred kernels are unavailable + +## 5) Operational Safety + +- feature flag or staged rollout strategy +- fast rollback path +- runtime guardrails for unexpected shapes/resource exhaustion + +## 6) Documentation Hygiene + +- kernel assumptions and constraints documented +- precision and determinism modes documented +- known limitations and troubleshooting notes linked + +## Related Topics + +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- NVTX profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Fallback strategies and capability detection: `../fallback-strategies-and-capability-detection/DOC.md` +- Incident response and rollback playbook: `../incident-response-and-rollback-playbook/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Compatibility documentation: https://docs.nvidia.com/deploy/cuda-compatibility/index.html +- Nsight Systems / Compute docs 
for observability workflows: + - https://docs.nvidia.com/nsight-systems/UserGuide/index.html + - https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/ptx-atomic-and-reduction-patterns/DOC.md b/content/cuda/docs/ptx-atomic-and-reduction-patterns/DOC.md new file mode 100644 index 00000000..9bbcd56f --- /dev/null +++ b/content/cuda/docs/ptx-atomic-and-reduction-patterns/DOC.md @@ -0,0 +1,68 @@ +--- +name: ptx-atomic-and-reduction-patterns +description: "PTX atomic and reduction patterns: atom/cas/red/redux usage, scope/semantic choices, and lock-free update templates." +metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,atomics,reduction,atom,atom.cas,compare-and-swap,cas,cas-loop,red,redux,scope,acquire,release,lock-free,lockfree" +--- + +# PTX Atomic and Reduction Patterns + +Use this page when designing concurrent PTX update paths with explicit scope and memory semantics. + +## Instruction Families + +- Atomic RMW: `atom.*` +- Compare-and-swap: `atom.cas` +- Reduction-update: `red.*` +- Warp/group reduction helper: `redux.sync` + +## Scope and Semantics First + +Correctness depends on selecting: + +- target state space (shared/global/cluster forms as supported) +- scope (`cta`, `cluster`, `gpu`, `sys` as applicable) +- semantics (relaxed/acquire/release/acq_rel where available) + +A wrong scope can appear correct in tests but fail under real concurrency. + +## Canonical Patterns + +- Lock-free queue/head update: + CAS loop with explicit acquire/release semantics. +- Aggregation path: + `red.*` for one-way accumulation where return value is not required. +- Predicate-guided lane aggregation: + warp-level reduction then fewer global atomics. + +## Failure Modes + +- Missing acquire/release pairing between producer and consumer. +- Overly wide scope adds contention and latency. 
+- Excessive global atomics with no local aggregation stage. + +## Verification Checklist + +- Stress under high contention and varied scheduling. +- Validate determinism policy (if required) separately from correctness. +- Profile contention hotspots and retry-loop pressure. + +## Related Topics + +- PTX synchronization instructions: `../ptx/instructions/sync-comm/DOC.md` +- PTX memory consistency model: `../ptx/references/memory-consistency-model.md` +- PTX warp synchronization patterns: `../ptx-warp-synchronization-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- PTX atom instruction family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom +- PTX red instruction family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-red +- PTX redux.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync +- PTX Memory Consistency Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/ptx-integer-bit-manipulation-patterns/DOC.md b/content/cuda/docs/ptx-integer-bit-manipulation-patterns/DOC.md new file mode 100644 index 00000000..29a0d417 --- /dev/null +++ b/content/cuda/docs/ptx-integer-bit-manipulation-patterns/DOC.md @@ -0,0 +1,57 @@ +--- +name: ptx-integer-bit-manipulation-patterns +description: "PTX integer and bit-manipulation patterns: logic/shift/select primitives, packing/unpacking strategies, and common correctness traps." 
+metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,integer,bit-manipulation,logic,shift,selp,lop3,bfe,bfi,popc,brev,prmt" +--- + +# PTX Integer and Bit-Manipulation Patterns + +Use this page for practical composition of PTX integer/logic instructions in performance-sensitive kernels. + +## Core Primitive Groups + +- Logic: `and`, `or`, `xor`, `not`, `lop3` +- Shift and funnel-shift: `shl`, `shr`, `shf` +- Bitfield extraction/insert: `bfe`, `bfi` +- Bit counting/permutation: `clz`, `popc`, `brev`, `prmt` +- Predicate-style selection: `selp`, `setp` + +## Common Composition Patterns + +- Use `setp + selp` for branchless integer clamps and conditional assignment. +- Use `bfe/bfi` for packed-field decode/encode instead of long mask chains. +- Use `lop3` to fuse multi-step boolean logic into fewer instructions. +- Use `popc` and `clz` for bitset analytics and index derivation. + +## Correctness Traps + +- Signed vs unsigned shift semantics (`shr.s*` vs `shr.u*`) change high-bit fill behavior. +- Type width mismatches silently change mask and overflow behavior. +- Packing/unpacking code must define bit positions and endianness assumptions explicitly. + +## Performance Heuristics + +- Prefer fewer dependent bit-ops in hot loops to reduce scoreboard pressure. +- Validate whether `lop3` or `prmt` reduces instruction count on target architecture. +- Recheck register pressure after replacing arithmetic with heavy bit-manipulation sequences. 
+ +## Related Topics + +- PTX integer instruction index: `../ptx/instructions/integer/DOC.md` +- PTX control flow: `../ptx/instructions/control-flow/DOC.md` +- PTX synchronization and communication: `../ptx/instructions/sync-comm/DOC.md` + +## Official Source Links (Fact Check) + +- PTX Integer Arithmetic Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions +- PTX Logic and Shift Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions +- PTX Comparison and Selection Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/ptx-mbarrier-protocol-patterns/DOC.md b/content/cuda/docs/ptx-mbarrier-protocol-patterns/DOC.md new file mode 100644 index 00000000..9eda6556 --- /dev/null +++ b/content/cuda/docs/ptx-mbarrier-protocol-patterns/DOC.md @@ -0,0 +1,57 @@ +--- +name: ptx-mbarrier-protocol-patterns +description: "PTX mbarrier protocol patterns: arrive/test_wait/arrive_drop flows, async-copy integration, and phase-safety rules." +metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,mbarrier,arrive,test_wait,arrive_drop,cp.async,cp-async-mbarrier-arrive,cp.async.wait_group,cp.async.wait_all,async-proxy,phase,completion-protocol,producer-consumer" +--- + +# PTX mbarrier Protocol Patterns + +Use this page for robust phase-based synchronization protocols around async copy/compute pipelines. + +## Core Operations + +- Producer-side phase signal: `mbarrier.arrive` +- Participant drop from future phases: `mbarrier.arrive_drop` +- Consumer-side wait/poll: `mbarrier.test_wait` / `mbarrier.try_wait` +- Async-copy completion bridge: `cp.async.mbarrier.arrive` + +## Protocol Template + +1. Initialize barrier state and participant expectations. +2. Issue producer operations (for example async copy). +3. 
Signal completion with appropriate arrive semantics. +4. Wait on consumer side before data use. +5. Advance phases safely and apply `arrive_drop` when participation changes. + +## Phase Safety Rules + +- Keep producer and consumer on the same phase contract. +- Respect no-complete restrictions for `.noComplete` variants. +- Use sink `_` rules correctly for remote cluster-only flows. +- Avoid mixing unrelated work into the same mbarrier protocol. + +## Common Failure Modes + +- Deadlock from mismatched participant counts. +- Premature consumer reads due to missing wait checks. +- Undefined behavior by allowing `.noComplete` variant to complete a phase. + +## Related Topics + +- PTX data-movement async references: `../ptx/instructions/data-movement/references/cp-async.md` +- PTX TMA instructions: `../ptx/instructions/tma/DOC.md` +- PTX synchronization instructions: `../ptx/instructions/sync-comm/DOC.md` + +## Official Source Links (Fact Check) + +- PTX mbarrier instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- PTX cp.async.mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +- PTX Asynchronous Operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/ptx-warp-synchronization-patterns/DOC.md b/content/cuda/docs/ptx-warp-synchronization-patterns/DOC.md new file mode 100644 index 00000000..8be6eb2d --- /dev/null +++ b/content/cuda/docs/ptx-warp-synchronization-patterns/DOC.md @@ -0,0 +1,65 @@ +--- +name: ptx-warp-synchronization-patterns +description: "PTX warp synchronization patterns: vote/shfl/match/elect/bar.warp.sync composition for warp-cooperative algorithms." 
+metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,warp,synchronization,shfl.sync,vote.sync,match.sync,elect.sync,bar.warp.sync,membermask" +--- + +# PTX Warp Synchronization Patterns + +Use this page for warp-cooperative control/data exchange patterns without escalating to CTA-wide barriers. + +## Key Warp-Level Primitives + +- `bar.warp.sync` +- `vote.sync` +- `shfl.sync` +- `match.sync` +- `elect.sync` + +## Practical Compositions + +- Warp reduction: + `shfl.sync` plus lane-conditional accumulation. +- Warp agreement checks: + `vote.sync` for any/all consensus. +- Key-based grouping: + `match.sync` for same-value subgrouping. +- Single-lane leadership: + `elect.sync` for representative-thread control logic. + +## Membermask Discipline + +Correctness depends on accurate `membermask` usage: + +- mask must match actual participating lanes on that control path +- mismatched masks can cause undefined or misleading results +- keep mask derivation stable across phases of the same protocol + +## Common Failure Modes + +- Divergent lanes use different masks for the same warp primitive. +- Lane index assumptions are invalid after control-flow divergence. +- Warp-level protocol accidentally used for cross-warp coordination. 
+ +## Related Topics + +- PTX synchronization instructions: `../ptx/instructions/sync-comm/DOC.md` +- PTX control flow: `../ptx/instructions/control-flow/DOC.md` +- CUDA warp primitives (C++ view): `../warp-primitives/DOC.md` + +## Official Source Links (Fact Check) + +- PTX shfl.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-shfl-sync +- PTX vote.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync +- PTX match.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-match-sync +- PTX elect.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-elect-sync +- PTX bar.warp.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-bar-warp-sync + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/ptx/DOC.md b/content/cuda/docs/ptx/DOC.md new file mode 100644 index 00000000..a44c7533 --- /dev/null +++ b/content/cuda/docs/ptx/DOC.md @@ -0,0 +1,118 @@ +--- +name: ptx +description: "NVIDIA PTX ISA 9.2 guide: instruction model, constraints, and architecture mapping." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,isa,gpu,assembly,nvidia,wmma,tensor-core,tensorcore,matrix-multiply,matrix-multiply-accumulate,shared-memory,cp.async,mbarrier,bank-conflict,swizzling" +--- + +# PTX ISA 9.2 Navigation + +This directory follows the PTX ISA 9.2 official documentation and provides executable, constrained, and traceable instruction semantics for agents. 
+ +## Coverage + +- Program model: thread hierarchy, state spaces, data types, functions, and ABI +- Instruction format: predicates, opcodes, type suffixes, modifiers, and operands +- Memory model: scope + semantics (relaxed/acquire/release) +- Instruction families: integer, floating point, data movement, control flow, synchronization, WGMMA, and TMA +- Special registers: `%tid`, `%ctaid`, `%smid`, and related registers + +## Recommended Reading Path + +1. `references/programming-model.md` +2. `references/state-spaces-and-types.md` +3. `references/instruction-format-and-operands.md` +4. `references/memory-consistency-model.md` +5. `references/abi-and-calling-convention.md` +6. `instructions/*/DOC.md` + +## Shared Memory Related Entry Points + +- CUDA C++ shared memory base entry: `../shared-memory/DOC.md` +- CUDA C++ Tensor Core entry: `../tensor-cores/DOC.md` +- CUDA execution model entry: `../execution-model/DOC.md` +- CUDA throughput model entry: `../compute-throughput/DOC.md` +- CUDA Core path entry: `../cuda-core/DOC.md` +- CUDA path-selection entry: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- CUDA WMMA patterns entry: `../wmma-kernel-patterns/DOC.md` +- CUDA Tensor Core pipeline entry: `../tensor-core-pipeline-patterns/DOC.md` +- CUDA async copy entry: `../async-copy/DOC.md` +- CUDA Cooperative Groups entry: `../cooperative-groups/DOC.md` +- CUDA Cluster / DSM entry: `../thread-block-clusters/DOC.md` +- CUDA stream/event entry: `../streams-and-events/DOC.md` +- CUDA fence/ordering entry: `../memory-fences-and-ordering/DOC.md` +- CUDA Graphs entry: `../cuda-graphs/DOC.md` +- CUDA performance diagnostics entry: `../performance-debugging/DOC.md` +- CUDA launch bounds/registers entry: `../launch-bounds-and-registers/DOC.md` +- CUDA Unified Memory entry: `../unified-memory/DOC.md` +- CUDA pinned transfer entry: `../pinned-memory-and-transfers/DOC.md` +- CUDA multi-GPU/P2P entry: `../multi-gpu-and-peer-access/DOC.md` +- CUDA Dynamic Parallelism entry: 
`../dynamic-parallelism/DOC.md` +- CUDA debug-build/error-handling entry: `../error-handling-and-debug-build/DOC.md` +- CUDA cuBLAS/cuDNN integration entry: `../cublas-cudnn-integration-patterns/DOC.md` +- CUDA NVTX profiling entry: `../nvtx-and-profiling-workflow/DOC.md` +- CUDA numerics/precision entry: `../numerics-and-precision/DOC.md` +- CUDA reproducibility entry: `../randomness-and-reproducibility/DOC.md` +- CUDA fused-kernel design entry: `../fused-kernel-design-patterns/DOC.md` +- CUDA build/ABI compatibility entry: `../build-and-abi-compatibility/DOC.md` +- CUDA sparse/irregular kernels entry: `../sparse-and-irregular-kernels/DOC.md` +- CUDA collective communication patterns entry: `../collective-communication-patterns/DOC.md` +- CUDA benchmarking methodology entry: `../benchmarking-methodology/DOC.md` +- CUDA regression testing/CI entry: `../regression-testing-and-ci/DOC.md` +- CUDA data-layout/alignment entry: `../data-layout-and-alignment/DOC.md` +- CUDA cache behavior entry: `../cache-behavior-and-access-policy/DOC.md` +- CUDA persistent-kernel/work-queue entry: `../persistent-kernels-and-work-queues/DOC.md` +- CUDA production readiness checklist entry: `../production-readiness-checklist/DOC.md` +- CUDA kernel API design entry: `../kernel-api-design-guidelines/DOC.md` +- CUDA shape-specialization/autotuning entry: `../input-shape-specialization-and-autotuning/DOC.md` +- CUDA capability-detection/fallback entry: `../fallback-strategies-and-capability-detection/DOC.md` +- CUDA incident-response/rollback entry: `../incident-response-and-rollback-playbook/DOC.md` +- `.shared` state-space reference: `references/state-spaces-and-types.md` +- `cp.async` reference: `instructions/data-movement/references/cp-async.md` +- `mbarrier` reference: `instructions/sync-comm/DOC.md` +- TMA/shared-memory layout reference: `instructions/tma/DOC.md` + +## PTX Pattern Playbooks + +- Integer and bit-manipulation patterns: `../ptx-integer-bit-manipulation-patterns/DOC.md` +- 
Atomic and reduction patterns: `../ptx-atomic-and-reduction-patterns/DOC.md` +- mbarrier protocol patterns: `../ptx-mbarrier-protocol-patterns/DOC.md` +- Warp synchronization patterns: `../ptx-warp-synchronization-patterns/DOC.md` + +## Instruction Category Entry Points + +- Integer Arithmetic: `instructions/integer/DOC.md` +- Floating-Point: `instructions/floating-point/DOC.md` +- Data Movement: `instructions/data-movement/DOC.md` +- Control Flow: `instructions/control-flow/DOC.md` +- Synchronization and Communication: `instructions/sync-comm/DOC.md` +- Warpgroup MMA: `instructions/wgmma/DOC.md` +- Tensor Memory Accelerator: `instructions/tma/DOC.md` +- Special Registers: `instructions/special-registers/DOC.md` + +## Documentation Reliability Notes + +- Syntax and semantic claims in this directory map to NVIDIA PTX ISA sections. +- Each document includes section-level anchors for direct verification. +- If newer PTX versions are released, prioritize release-notes deltas. + +## Official Source Links (fact check) + +- PTX main documentation: https://docs.nvidia.com/cuda/parallel-thread-execution/ +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- Release Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes + +Last verified date: 2026-03-19 + +## B-series Special Entry Points + +- H-series special instruction summary: `references/h-series-special-instructions.md` +- Architecture capability matrix: `references/b-series-arch-matrix.md` +- Delta vs Hopper: `references/b-series-delta-from-hopper.md` +- tcgen05 special topic: `instructions/tcgen05/DOC.md` diff --git a/content/cuda/docs/ptx/instructions/control-flow/DOC.md b/content/cuda/docs/ptx/instructions/control-flow/DOC.md new file mode 100644 index 00000000..217217ea --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/DOC.md @@ -0,0 +1,55 @@ 
+--- +name: ptx-control-flow-instructions +description: "PTX control-flow instructions and divergence-related behaviors in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,control-flow,branch,call" +--- + +# PTX Control Flow + +Control-flow instructions determine branching, calling, and exit behavior, while also affecting warp divergence and execution efficiency. + +## Common Instructions + +- `bra` conditional/unconditional branch +- `call` device function call +- `ret` function return +- `exit` thread exit +- `brx.idx` indirect branch + +## Syntax Example (PTX style) + +```ptx +@p bra L_done; +call.uni (_), my_func, (arg0); +ret; +``` + +## Constraints and Pitfalls + +- Predicate-controlled branches can introduce warp divergence. +- `call` paths must satisfy parameter and ABI rules. +- Some branch variants have version or target ISA requirements. + +## Official Source Links (fact check) + +- Control Flow Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions +- bra: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-bra +- call: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-call +- ret: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-ret + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/bra.md` +- `references/call.md` +- `references/ret.md` +- `references/brx-idx.md` +- `references/exit.md` +- `references/trap.md` diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/bra.md b/content/cuda/docs/ptx/instructions/control-flow/references/bra.md new file mode 100644 index 00000000..c9076202 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/bra.md @@ -0,0 +1,34 @@ +# PTX Instruction Topic: bra + +`bra` is a fundamental PTX branch instruction that supports 
predicate-controlled conditional branching. + +## Official Description +- Documentation section: Control Flow Instructions: `bra` +- Commonly used together with predicates generated by `setp` + +## Key Constraints +- Conditional branching depends on the result of a predicate register. +- Branch divergence can affect warp execution efficiency. +- The target label must be within a valid control-flow range. + +## Usage Notes +- Prefer `setp + bra` patterns that keep divergent regions short. +- Keep branch targets structurally simple so join behavior is easy to audit. + +## Common Failure Modes +- Predicate values are stale because producer instructions were reordered or conditionally skipped. +- Divergent branch regions grow too large and create avoidable warp-serial execution. + +## Example (PTX Style) + +```ptx +@p bra L_true; +bra L_end; +``` + +## Official Source Links (Fact Check) + +- bra: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-bra +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/brx-idx.md b/content/cuda/docs/ptx/instructions/control-flow/references/brx-idx.md new file mode 100644 index 00000000..7266efe1 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/brx-idx.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: brx.idx + +`brx.idx` is an index-based branch control flow instruction, commonly used for jump-table-style dispatch. + +## Official Description + +- Documentation section: Control Flow Instructions: `brx.idx` +- Used to select the branch target based on an index value + +## Key Constraints + +- The index range must match the number of valid table entries. +- The strategy for handling invalid indices should be clearly defined in higher-level logic. 
+- Conditional paths must keep warp-level control-flow consistency manageable. + +## Usage Notes + +- Use `brx.idx` for dense dispatch tables where branch targets are static and well-audited. +- Include a default-safe path for out-of-range indices before deployment on variable inputs. + +## Example (PTX Style, Illustrative) + +```ptx +brx.idx idx, table; +``` + +## Official Source Links (Fact Check) + +- brx.idx: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-brx-idx +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/call.md b/content/cuda/docs/ptx/instructions/control-flow/references/call.md new file mode 100644 index 00000000..2a6e9dbf --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/call.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: call + +`call` is used for device function calls, involving parameter passing and calling conventions. + +## Official Description + +- Documentation section: Control Flow Instructions: `call` +- Related to `.func` declarations, the `.param` parameter space, and ABI constraints + +## Key Constraints + +- The parameter list must match the callee function signature. +- Register/return value semantics along the calling path must be consistent. +- Under conditional execution, avoid control-flow inconsistencies that could lead to undefined behavior. + +## Usage Notes + +- Prefer `call.uni` only when uniform control-flow assumptions are guaranteed. +- Re-evaluate register pressure and inlining tradeoffs when introducing helper calls in hot kernels. 
+ +## Example (PTX Style) + +```ptx +call.uni (retval), my_func, (arg0, arg1); +``` + +## Official Source Links (Fact Check) + +- call: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-call +- Function declarations and definitions: https://docs.nvidia.com/cuda/parallel-thread-execution/#function-declarations-and-definitions +- Abstracting the ABI: https://docs.nvidia.com/cuda/parallel-thread-execution/#abstracting-the-abi + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/exit.md b/content/cuda/docs/ptx/instructions/control-flow/references/exit.md new file mode 100644 index 00000000..78ec56ef --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/exit.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: exit + +`exit` terminates the current thread’s execution and is a thread-level exit primitive inside a kernel. + +## Official Description + +- Documentation section: Control Flow Instructions: `exit` +- Commonly used for early-exit paths and boundary-condition handling + +## Key Constraints + +- Before exiting, ensure that shared-state updates and synchronization requirements are satisfied. +- Avoid issuing `exit` early at points that require all participants in a synchronization, otherwise the protocol may be mismatched. + +## Usage Notes + +- Use early `exit` only on paths that do not participate in later collective synchronization. +- Prefer predicate-guarded compute skip when protocol consistency is more important than early termination. 
+ +## Example (PTX Style) + +```ptx +@p exit; +``` + +## Official Source Links (Fact Check) + +- exit: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-exit +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/ret.md b/content/cuda/docs/ptx/instructions/control-flow/references/ret.md new file mode 100644 index 00000000..d1af344e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/ret.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: ret + +`ret` is used for function return, ending the current function call path. + +## Official Description + +- Documentation section: Control Flow Instructions: `ret` +- Matches the call boundary of `call` + +## Key Constraints + +- The return path must be consistent with the function definition and calling convention. +- In complex control flow, ensure that all paths can reach a valid return point. + +## Usage Notes + +- Keep return conventions explicit when mixing `.func` helpers and inlined call sites. +- Validate that predicate-driven paths still preserve a legal return sequence. 
+ +## Example (PTX Style) + +```ptx +ret; +``` + +## Official Source Links (Fact Check) + +- ret: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-ret +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/trap.md b/content/cuda/docs/ptx/instructions/control-flow/references/trap.md new file mode 100644 index 00000000..ca6916a1 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/trap.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: trap + +`trap` is used to trigger exceptions/debug traps and is commonly used on error paths or as a debugging breakpoint. + +## Official Description + +- Documentation section: Control Flow Instructions: `trap` +- Mainly used for diagnostics and fail-fast scenarios + +## Usage Notes + +- Trigger it only under clearly defined error conditions. +- Use it cautiously on production paths to avoid impacting throughput. +- Pair `trap` with a clear diagnostics policy so failures are reproducible. +- Avoid embedding `trap` in speculative fast paths that may execute under benign edge conditions. + +## Common Failure Modes + +- Leaving debug-only `trap` paths enabled in production builds. +- Emitting `trap` without enough context to diagnose the triggering condition. 
+ +## Example (PTX Style) + +```ptx +@p trap; +``` + +## Official Source Links (Fact Check) + +- trap: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-trap +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/DOC.md b/content/cuda/docs/ptx/instructions/data-movement/DOC.md new file mode 100644 index 00000000..3ca7242c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/DOC.md @@ -0,0 +1,65 @@ +--- +name: ptx-data-movement-instructions +description: "PTX data movement instructions in ISA 9.2, including ld/st/ldu, cvt/cvt.pack/cvta, cp.async paths, and prefetch hints." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,load,store,memory,cp.async,cp.async.bulk,ld,ldu,ld.global.nc,st,st.async,st.bulk,cvt,cvt.pack,cvta,mov,prefetch,prefetchu,data-movement" +--- + +# PTX Data Movement + +This page covers PTX load/store, conversion, and async movement patterns that dominate memory-side kernel behavior. + +## Representative Syntax + +```ptx +cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size} [dst], [src], cp-size{, src-size}{, cache-policy}; +cp.async.commit_group; +cp.async.wait_group N; +cp.async.wait_all; +``` + +## Minimal Async Copy Pattern + +```ptx +cp.async.ca.shared.global [smem_ptr], [gmem_ptr], 16; +cp.async.commit_group; +cp.async.wait_group 0; +``` + +## Constraints and Pitfalls + +- Source/destination state spaces must match the instruction form. +- Async copy completion must be explicitly synchronized before consumer access. +- Conversion/load/store variants have operand width and alignment constraints. 
+ +## Official Source Links (fact check) + +- Data Movement and Conversion Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- cp.async.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-commit-group +- cp.async.wait_group/wait_all: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/cp-async.md` +- `references/cp-async-bulk.md` +- `references/ld.md` +- `references/st.md` +- `references/cp-async-wait-group.md` +- `references/prefetch.md` +- `references/cvta.md` +- `references/mov.md` +- `references/cvt.md` +- `references/ld-global-nc.md` +- `references/st-async.md` +- `references/st-bulk.md` +- `references/cvt-pack.md` +- `references/ldu.md` diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-bulk.md b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-bulk.md new file mode 100644 index 00000000..dee0888a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-bulk.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: cp.async.bulk + +`cp.async.bulk` is a bulk async copy instruction with mbarrier-based completion, suitable for larger transfers. 
+ +## Official Syntax (Representative Form) + +```ptx +cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar]; +cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar]; +``` + +## Key Semantics + +- The instruction executes on the async proxy and is a weak memory operation. +- Completion can be configured via `.mbarrier::complete_tx::bytes`. +- complete-tx carries `completeCount=bytes` on the mbarrier. +- The documentation states completion is followed by an implicit generic-async proxy fence. +- You still need async-group or mbarrier waits before consuming the data. + +## Key Constraints + +- Source/destination state spaces must match the selected bulk variant form. +- `size` and operand alignment must satisfy ISA requirements for the target architecture. +- Completion tracking must be explicit before downstream consumers read results. + +## Official Source Links (fact check) + +- cp.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk +- Asynchronous data movement: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-data-movement-and-conversion-instructions +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-wait-group.md b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-wait-group.md new file mode 100644 index 00000000..205f33aa --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-wait-group.md @@ -0,0 +1,35 @@ +# PTX Instruction Note: cp.async.wait_group / cp.async.wait_all + +`cp.async.wait_group` / `cp.async.wait_all` is used to wait for `cp.async` groups to complete. 
+ +## Official Syntax + +```ptx +cp.async.wait_group N; +cp.async.wait_all; +``` + +## Key Semantics + +- `wait_group N`: waits until at most N recent pending groups remain, and all earlier groups complete. +- When `N=0`, waits for all prior `cp.async` groups to complete. +- This wait only applies to `cp.async` completion; it does not provide ordering/visibility for other memory operations. + +## Usage Recommendations + +- Execute the wait before consuming destination shared-memory data. +- Do not treat this as a general fence; it only applies to `cp.async` completion semantics. + +## Common Failure Modes + +- Waiting on the wrong stage depth (`N`) and reading tiles that are not yet complete. +- Mixing unrelated async pipelines into one wait protocol and causing phase confusion. +- Assuming `wait_group` replaces other synchronization steps needed by the overall algorithm. + +## Official Source Links (fact check) + +- cp.async.wait_group / wait_all: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cp-async.md b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async.md new file mode 100644 index 00000000..f9dec9eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async.md @@ -0,0 +1,41 @@ +# PTX Instruction Note: cp.async + +`cp.async` is a non-blocking async copy instruction from `.global` to `.shared`, and requires explicit waiting via group or mbarrier mechanisms. 
+ +## Official Syntax (Excerpt) + +```ptx +cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size} [dst], [src], cp-size{, src-size}{, cache-policy}; +cp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size} [dst], [src], cp-size{, src-size}{, cache-policy}; +``` + +## Key Semantics + +- The instruction is non-blocking; the issuing thread continues execution. +- `src` must be in global memory and `dst` must be in shared memory. +- If optional `src-size` is smaller than `cp-size`, remaining `dst` bytes are zero-filled. +- `src-size > cp-size` is undefined behavior. + +## Completion and Visibility + +- Without explicit synchronization, ordering between `cp.async` operations is not guaranteed. +- Completion can be tracked through: + - `cp.async.commit_group` + `cp.async.wait_group` / `cp.async.wait_all` + - `cp.async.mbarrier.arrive` + `mbarrier.test_wait/try_wait` + +## Minimal Pattern + +```ptx +cp.async.ca.shared.global [smem_ptr], [gmem_ptr], 16; +cp.async.commit_group; +cp.async.wait_group 0; +``` + +## Official Source Links (fact check) + +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- cp.async.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-commit-group +- cp.async.wait_group / wait_all: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cvt-pack.md b/content/cuda/docs/ptx/instructions/data-movement/references/cvt-pack.md new file mode 100644 index 00000000..e856dd3a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cvt-pack.md @@ -0,0 +1,31 @@ +# PTX 
Instruction Note: cvt.pack + +`cvt.pack` converts and packs multiple source elements into a compact destination representation. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `cvt.pack` + +## Key Constraints + +- Source element types, destination packed type, and rounding/saturation modifiers must form a legal variant. +- Packing order and lane composition follow ISA-defined operand ordering. +- Use saturation/rounding modifiers explicitly when narrowing precision. + +## Usage Notes + +- Use `cvt.pack` to reduce instruction count when packing quantized outputs. +- Validate lane ordering assumptions before integrating with vectorized unpack paths. + +## Example (PTX style) + +```ptx +cvt.pack.sat.u8.s32.b32 d, a, b, c; +``` + +## Official Source Links (fact check) + +- cvt.pack: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt-pack +- cvt: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cvt.md b/content/cuda/docs/ptx/instructions/data-movement/references/cvt.md new file mode 100644 index 00000000..c1c226d5 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cvt.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: cvt + +`cvt` is used for numeric type conversion (integer/float/bit-width changes), a key instruction for mixed precision and interface adaptation. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `cvt` +- Related extension section: `cvt.pack` + +## Key Constraints + +- Target type suffix determines rounding/truncation behavior. +- Float-to-integer conversion requires overflow and rounding handling. +- Packed variants must satisfy element-type and packing-format requirements. 
+ +## Usage Notes + +- Use explicit rounding modes (`rn`, `rz`, `rm`, `rp`) to make conversion policy reviewable. +- Validate saturation and overflow handling before deploying quantization paths. + +## Example (PTX style) + +```ptx +cvt.rn.f32.f16 f1, h1; +cvt.rzi.s32.f32 r1, f1; +``` + +## Official Source Links (fact check) + +- cvt: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt +- cvt.pack: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt-pack + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cvta.md b/content/cuda/docs/ptx/instructions/data-movement/references/cvta.md new file mode 100644 index 00000000..1ab09d29 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cvta.md @@ -0,0 +1,35 @@ +# PTX Instruction Note: cvta + +`cvta` is used for address conversion/normalization (`convert address`) and is critical for cross-address-space pointer handling. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `cvta` + +## Key Constraints + +- Target state space and input address must match an allowed conversion direction. +- Result register bit width must accommodate the target address representation. + +## Usage Notes + +- Use `cvta` at ABI boundaries where generic pointers must be normalized to explicit state spaces. +- Keep pointer width explicit (`u32` vs `u64`) to avoid truncation on mixed-address workflows. + +## Common Failure Modes + +- Converting to an incorrect target state space and then reusing the pointer in unrelated load/store paths. +- Address-width truncation when 64-bit addresses are forced into 32-bit intermediates. 
+ +## Example (PTX style) + +```ptx +cvta.to.global.u64 rd, ra; +``` + +## Official Source Links (fact check) + +- cvta: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvta +- State spaces: https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/ld-global-nc.md b/content/cuda/docs/ptx/instructions/data-movement/references/ld-global-nc.md new file mode 100644 index 00000000..1e0cd4eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/ld-global-nc.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: ld.global.nc + +`ld.global.nc` performs non-coherent global-memory loads with cache-policy controls defined by ISA variants. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `ld.global.nc` + +## Key Constraints + +- Applicable only to legal global-memory address forms and supported type variants. +- Cache/modifier combinations must match the documented variant constraints. +- Ordering/visibility guarantees differ from coherent paths; combine with appropriate synchronization when required. + +## Usage Notes + +- Use `ld.global.nc` for read-mostly streams where non-coherent cache behavior is intentional. +- Validate cache-policy choices with profiler counters instead of assuming lower latency. 
+ +## Example (PTX style) + +```ptx +ld.global.nc.u32 r1, [addr]; +``` + +## Official Source Links (fact check) + +- ld.global.nc: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/ld.md b/content/cuda/docs/ptx/instructions/data-movement/references/ld.md new file mode 100644 index 00000000..2ae24796 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/ld.md @@ -0,0 +1,34 @@ +# PTX Instruction Note: ld + +`ld` is the base PTX load instruction family across global/shared/local/constant state spaces. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `ld` + +## Key Constraints + +- Address state space and instruction variant must match. +- Destination register type/width must match the loaded element format. +- Variant modifiers (cache, scope, vector width) must satisfy ISA-specific constraints. + +## Usage Notes + +- Use coherent `ld` forms by default; switch to specialized variants only with measured justification. +- Align load width and vectorization with producer layout to preserve coalescing efficiency. +- Keep cache modifiers consistent across hot paths to reduce unpredictable locality behavior. 
+ +## Example (PTX style) + +```ptx +ld.global.u32 r1, [addr]; +ld.shared.f32 f1, [saddr]; +``` + +## Official Source Links (fact check) + +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld +- ld.global.nc: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc +- ldu: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ldu + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/ldu.md b/content/cuda/docs/ptx/instructions/data-movement/references/ldu.md new file mode 100644 index 00000000..cd7f9fe9 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/ldu.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: ldu + +`ldu` provides a uniform load path for addresses that are expected to be uniform across threads. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `ldu` + +## Key Constraints + +- Use only with legal `ldu` state-space/type combinations documented by ISA. +- Intended uniform-access assumptions should match actual access behavior for best results. +- Do not treat `ldu` as a generic replacement for all `ld` forms. + +## Usage Recommendations + +- Prefer `ldu` when operand addresses are naturally uniform within the execution group. +- Validate performance impact with profiling because benefit is pattern-dependent. 
+ +## Example (PTX style) + +```ptx +ldu.global.u32 r1, [addr]; +``` + +## Official Source Links (fact check) + +- ldu: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ldu +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld +- ld.global.nc: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/mov.md b/content/cuda/docs/ptx/instructions/data-movement/references/mov.md new file mode 100644 index 00000000..5f2876ac --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/mov.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: mov + +`mov` transfers values between registers and selected special-register/constant forms. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `mov` + +## Key Constraints + +- Source and destination operand classes must match a legal `mov` variant. +- Width/type suffixes must preserve valid bit-width semantics. +- Special-register movement forms require supported register names and target ISA. + +## Usage Notes + +- Use `mov` for explicit register/value handoff when clarity is more important than implicit compiler rewrites. +- Keep special-register reads localized to reduce accidental architectural coupling. 
+ +## Example (PTX style) + +```ptx +mov.u32 r1, r2; +mov.u32 r_tid, %tid.x; +``` + +## Official Source Links (fact check) + +- mov: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-mov +- Special registers: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/prefetch.md b/content/cuda/docs/ptx/instructions/data-movement/references/prefetch.md new file mode 100644 index 00000000..945e8526 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/prefetch.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: prefetch / prefetchu + +`prefetch` and `prefetchu` provide advisory cache prefetch behavior for eligible memory access patterns. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `prefetch, prefetchu` + +## Key Constraints + +- Prefetch instructions are hints; they do not guarantee residency or strict ordering semantics. +- Address form and state-space usage must match legal variants. +- Overuse can add overhead without gain when locality is weak. + +## Usage Recommendations + +- Use for predictable forward-access streams where cache warmup is beneficial. +- Confirm benefit with profiler metrics rather than assuming speedup. +- Combine with coalesced access patterns; prefetch does not fix poor memory layout. 
+ +## Example (PTX style, Illustrative) + +```ptx +prefetch.global.L2 [addr]; +``` + +## Official Source Links (fact check) + +- prefetch, prefetchu: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu +- Data movement instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/st-async.md b/content/cuda/docs/ptx/instructions/data-movement/references/st-async.md new file mode 100644 index 00000000..f6964d37 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/st-async.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: st.async + +`st.async` issues asynchronous store operations with completion signaling in supported variants. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `st.async` + +## Key Semantics + +- Operation is asynchronous; consumer visibility must follow explicit completion/synchronization rules. +- mbarrier-based completion variants publish transfer completion through documented mechanisms. +- Ordering and visibility follow PTX async-operation and memory-consistency semantics. + +## Usage Notes + +- Keep each asynchronous store pipeline tied to a clear barrier/phase protocol. +- Avoid mixing unrelated producer paths into the same completion channel. 
+
+## Example (PTX style)
+
+```ptx
+st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [addr], b, [mbar_addr];
+```
+
+## Official Source Links (fact check)
+
+- st.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-async
+- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier
+- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model
+
+Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/st-bulk.md b/content/cuda/docs/ptx/instructions/data-movement/references/st-bulk.md new file mode 100644 index 00000000..167c8a69 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/st-bulk.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: st.bulk
+
+`st.bulk` initializes a region of shared memory, writing a fill value (which the ISA currently requires to be zero) across a byte range given by a size operand.
+
+## Official Positioning
+
+- Documentation section: Data Movement and Conversion Instructions: `st.bulk`
+
+## Key Constraints
+
+- Destination state space, address alignment, and the size operand must match the legal forms; the fill value is currently required to be zero.
+- Bulk-initialization usage should respect architecture-specific restrictions and completion semantics.
+- Use explicit synchronization where subsequent consumers depend on completion.
+
+## Usage Recommendations
+
+- Prefer `st.bulk` for bulk zero-initialization of shared-memory regions where the ISA form is supported.
+- Validate that the initialized size and alignment match your buffer layout. 
+
+## Example (PTX style, Illustrative)
+
+```ptx
+// size is in bytes; the ISA currently requires the fill value to be 0
+st.bulk.weak.shared::cta [addr], size, 0;
+```
+
+## Official Source Links (fact check)
+
+- st.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk
+- st: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st
+
+Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/st.md b/content/cuda/docs/ptx/instructions/data-movement/references/st.md new file mode 100644 index 00000000..dd66e624 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/st.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: st
+
+`st` stores register values to memory in the specified state space and type form.
+
+## Official Positioning
+
+- Documentation section: Data Movement and Conversion Instructions: `st`
+
+## Key Constraints
+
+- Destination address state space must match the selected `st` variant.
+- Source register type must match stored element type.
+- For concurrent shared-data read/write, establish ordering with fence/atom/barrier.
+
+## Usage Notes
+
+- Keep alignment and element-size choices consistent with consumer load patterns.
+- Use the narrowest valid state-space form and pair with explicit synchronization when required. 
+ +## Example (PTX style) + +```ptx +st.global.u32 [addr], r1; +st.shared.f32 [saddr], f1; +``` + +## Official Source Links (fact check) + +- st: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st +- st.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-async +- st.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/DOC.md b/content/cuda/docs/ptx/instructions/floating-point/DOC.md new file mode 100644 index 00000000..92899cda --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/DOC.md @@ -0,0 +1,67 @@ +--- +name: ptx-floating-point-instructions +description: "PTX floating-point instructions, rounding behavior, and type constraints in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,floating-point,math" +--- + +# PTX Floating-Point + +This page focuses on PTX floating-point paths, rounding semantics, and common pitfalls. + +## Common Instructions + +- `add` / `sub` / `mul` +- `fma` +- `div` +- `sqrt` + +## Syntax Example (PTX style) + +```ptx +fma.rn.f32 d, a, b, c; +sqrt.rn.f32 d, a; +``` + +## Constraints and Pitfalls + +- Rounding suffixes and type suffixes must match legal ISA forms. +- Approximate transcendental forms can differ from high-precision library references. +- NaN/Inf and exceptional-value behavior should be treated according to ISA semantics. + +## Usage Recommendations + +- Validate precision-sensitive kernels against a reference implementation. +- Distinguish approximate and exact variants when setting numerical tolerances. +- Keep mixed-precision policies explicit (input type, compute type, accumulation type). 
+ +## Official Source Links (fact check) + +- Floating Point Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions +- fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-fma +- sqrt: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sqrt +- Half Precision instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/add.md` +- `references/sub.md` +- `references/mul.md` +- `references/fma.md` +- `references/sqrt.md` +- `references/rcp.md` +- `references/rsqrt.md` +- `references/sin.md` +- `references/cos.md` +- `references/lg2.md` +- `references/ex2.md` +- `references/tanh.md` +- `references/copysign.md` +- `references/testp.md` diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/add.md b/content/cuda/docs/ptx/instructions/floating-point/references/add.md new file mode 100644 index 00000000..dca1f2c9 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/add.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: add (floating-point) + +`add` performs floating-point addition with PTX-defined type and rounding variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `add` +- Related sections: Half precision and mixed precision `add` variants + +## Key Constraints + +- Use a type/rounding suffix combination that is valid for the selected variant. +- Source and destination operand types must match the instruction form. +- NaN/Inf and exceptional cases follow ISA-defined floating-point semantics. + +## Usage Notes + +- Use explicit rounding suffixes in numerically audited kernels to avoid implicit behavior drift. +- Validate mixed-precision accumulation paths when `add` consumes converted inputs. 
+
+## Example (PTX style)
+
+```ptx
+add.rn.f32 d, a, b;
+```
+
+## Official Source Links (fact check)
+
+- add: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-add
+- Half precision add: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add
+- Mixed precision add: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-add
+
+Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/copysign.md b/content/cuda/docs/ptx/instructions/floating-point/references/copysign.md new file mode 100644 index 00000000..28ed41b8 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/copysign.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: copysign
+
+`copysign` copies the sign bit of the first source operand onto the magnitude of the second source operand (in `copysign.type d, a, b;`, `d` takes `b`'s magnitude and `a`'s sign).
+
+## Official Positioning
+
+- Documentation section: Floating Point Instructions: `copysign`
+
+## Key Constraints
+
+- Operand and destination types must match the selected variant.
+- This is a sign-bit transform, not a fused arithmetic operation.
+- Special-value behavior follows ISA-defined floating-point semantics.
+
+## Usage Notes
+
+- Use `copysign` for branchless sign injection while preserving magnitude.
+- Keep NaN and signed-zero behavior aligned with your numerical policy. 
+ +## Example (PTX style) + +```ptx +copysign.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- copysign: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-copysign +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/cos.md b/content/cuda/docs/ptx/instructions/floating-point/references/cos.md new file mode 100644 index 00000000..4858c88a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/cos.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: cos + +`cos` computes cosine using PTX-defined floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `cos` + +## Key Constraints + +- Common forms are approximate variants; check precision requirements before use. +- Input domain handling and internal range behavior are ISA-defined. +- Use reference checks for numerically sensitive kernels. + +## Usage Notes + +- Use transcendental intrinsics selectively in hot loops because throughput is typically lower than basic arithmetic. +- Pre-normalize input range where possible to improve numerical stability of approximate forms. 
+ +## Example (PTX style) + +```ptx +cos.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- cos: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-cos +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/ex2.md b/content/cuda/docs/ptx/instructions/floating-point/references/ex2.md new file mode 100644 index 00000000..74c47466 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/ex2.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: ex2 + +`ex2` computes `2^x` for PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `ex2` +- Related extension: Half precision `ex2` + +## Key Constraints + +- Common forms are approximate and may differ from high-precision library output. +- Select type suffixes that match downstream numeric requirements. +- Validate error behavior on representative production ranges. + +## Usage Notes + +- Use `ex2` for base-2 exponentiation paths to avoid extra base conversion overhead. +- Recheck stability when `ex2` output is immediately fed into normalization or softmax-like pipelines. 
+ +## Example (PTX style) + +```ptx +ex2.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- ex2: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-ex2 +- Half precision ex2: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-ex2 + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/fma.md b/content/cuda/docs/ptx/instructions/floating-point/references/fma.md new file mode 100644 index 00000000..47de47da --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/fma.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: fma (floating-point) + +`fma` performs fused multiply-add with single-rounding semantics for the selected variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `fma` +- Related extensions: Half precision and mixed precision `fma` + +## Key Constraints + +- `fma` is not equivalent to separate `mul` then `add` in rounding behavior. +- Type and rounding suffixes must match variant requirements. +- Validate precision-sensitive kernels when switching between fused and split forms. + +## Usage Notes + +- Prefer `fma` in compute-bound loops to reduce instruction count and intermediate rounding error. +- Compare against non-fused baselines when strict bitwise reproducibility is required. 
+ +## Example (PTX style) + +```ptx +fma.rn.f32 d, a, b, c; +``` + +## Official Source Links (fact check) + +- fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-fma +- Half precision fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-fma +- Mixed precision fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-fma + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/lg2.md b/content/cuda/docs/ptx/instructions/floating-point/references/lg2.md new file mode 100644 index 00000000..9570e088 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/lg2.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: lg2 + +`lg2` computes logarithm base 2 for PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `lg2` + +## Key Constraints + +- Approximate forms are common; accuracy depends on the selected variant. +- Domain handling for zero, negative, and exceptional inputs follows ISA rules. +- Use reference validation when numerical stability is critical. + +## Usage Notes + +- Use `lg2` when your algorithm is naturally base-2 (for example entropy-like or bit-scale transforms). +- Check behavior near zero and denormal ranges when downstream code assumes finite outputs. 
+ +## Example (PTX style) + +```ptx +lg2.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- lg2: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-lg2 +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/mul.md b/content/cuda/docs/ptx/instructions/floating-point/references/mul.md new file mode 100644 index 00000000..88fbf572 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/mul.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: mul (floating-point) + +`mul` performs floating-point multiplication with PTX-defined type and rounding variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `mul` +- Related extension: Half precision `mul` + +## Key Constraints + +- Use valid type/rounding suffix combinations for the target variant. +- Operand types must match the chosen instruction form. +- Verify precision behavior when combining with mixed-precision accumulation. + +## Usage Notes + +- Prefer fused forms (`fma`) when multiply-add is immediately chained and numerical policy allows it. +- Track denormal and FTZ behavior when reproducing CPU reference results. 
+ +## Example (PTX style) + +```ptx +mul.rn.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- mul: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-mul +- Half precision mul: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-mul + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/rcp.md b/content/cuda/docs/ptx/instructions/floating-point/references/rcp.md new file mode 100644 index 00000000..2d0b1737 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/rcp.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: rcp + +`rcp` computes reciprocal for the selected floating-point variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `rcp` +- Related extension: `rcp.approx.ftz.f64` + +## Key Constraints + +- Distinguish exact/rounded vs approximate variants based on requirements. +- Zero and exceptional input behavior follows ISA-defined floating-point semantics. +- Validate error tolerance before using approximate forms in iterative kernels. + +## Usage Notes + +- Use `rcp` to replace scalar division hot paths when reciprocal error is acceptable. +- Reassess convergence/stability if approximate reciprocals feed iterative updates. 
+ +## Example (PTX style) + +```ptx +rcp.rn.f32 d, a; +``` + +## Official Source Links (fact check) + +- rcp: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp +- rcp.approx.ftz.f64: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/rsqrt.md b/content/cuda/docs/ptx/instructions/floating-point/references/rsqrt.md new file mode 100644 index 00000000..38982c58 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/rsqrt.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: rsqrt + +`rsqrt` computes reciprocal square root for the selected floating-point variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `rsqrt` +- Related extension: `rsqrt.approx.ftz.f64` + +## Key Constraints + +- Approximate forms are common and should be validated against error budgets. +- Negative and exceptional inputs follow ISA-defined semantics. +- Choose variant precision to match normalization or solver stability needs. + +## Usage Notes + +- Use `rsqrt` in normalization-heavy kernels to reduce divide and square-root pressure. +- Pair approximate forms with one refinement step when tighter relative error is required. 
+ +## Example (PTX style) + +```ptx +rsqrt.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- rsqrt: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rsqrt +- rsqrt.approx.ftz.f64: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rsqrt-approx-ftz-f64 + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/sin.md b/content/cuda/docs/ptx/instructions/floating-point/references/sin.md new file mode 100644 index 00000000..1eba36b0 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/sin.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: sin + +`sin` computes sine using PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `sin` + +## Key Constraints + +- Common forms are approximate; accuracy varies by variant and architecture. +- Exceptional-value handling follows ISA-defined semantics. +- Validate on production ranges for numerically sensitive workloads. + +## Usage Notes + +- Favor `sin` for moderate-accuracy signal paths; validate if gradients or phase error are sensitive. +- Benchmark with realistic input distributions, not only uniform synthetic ranges. 
+ +## Example (PTX style) + +```ptx +sin.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- sin: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sin +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/sqrt.md b/content/cuda/docs/ptx/instructions/floating-point/references/sqrt.md new file mode 100644 index 00000000..ccd850ea --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/sqrt.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: sqrt + +`sqrt` computes square root for the selected floating-point variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `sqrt` + +## Key Constraints + +- Use variant-specific rounding and type suffixes where required. +- Negative and exceptional input behavior follows ISA-defined semantics. +- Evaluate precision/performance tradeoffs between exact and approximate forms. + +## Usage Notes + +- Prefer `rsqrt` plus refinement when reciprocal-root throughput is the primary goal. +- Validate corner cases (very small, very large, and subnormal inputs) when switching variants. 
+ +## Example (PTX style) + +```ptx +sqrt.rn.f32 d, a; +``` + +## Official Source Links (fact check) + +- sqrt: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sqrt +- Floating point instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/sub.md b/content/cuda/docs/ptx/instructions/floating-point/references/sub.md new file mode 100644 index 00000000..98aa0d87 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/sub.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: sub (floating-point) + +`sub` performs floating-point subtraction with PTX-defined type and rounding variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `sub` +- Related sections: Half precision and mixed precision `sub` variants + +## Key Constraints + +- Use valid type/rounding suffix combinations for the selected variant. +- Operand types must match the instruction form. +- Special-value behavior follows ISA-defined floating-point semantics. + +## Usage Notes + +- Keep subtract order explicit in refactors because `a - b` vs `b - a` can alter cancellation behavior. +- Re-evaluate tolerance thresholds when replacing `sub` with fused alternatives. 
+ +## Example (PTX style) + +```ptx +sub.rn.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- sub: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sub +- Half precision sub: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub +- Mixed precision sub: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-sub + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/tanh.md b/content/cuda/docs/ptx/instructions/floating-point/references/tanh.md new file mode 100644 index 00000000..fbcac272 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/tanh.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: tanh + +`tanh` computes hyperbolic tangent using PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `tanh` +- Related extension: Half precision `tanh` + +## Key Constraints + +- Typical forms are approximate and should be validated for model-specific tolerances. +- Saturation and exceptional input behavior follow ISA-defined semantics. +- Use reference comparisons for numerically sensitive paths. + +## Usage Notes + +- Use `tanh` where bounded output is required and approximation error is acceptable. +- Check gradient-sensitive training/inference paths separately from forward-only tolerance checks. 
+ +## Example (PTX style) + +```ptx +tanh.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- tanh: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-tanh +- Half precision tanh: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-tanh + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/testp.md b/content/cuda/docs/ptx/instructions/floating-point/references/testp.md new file mode 100644 index 00000000..713d01c2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/testp.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: testp + +`testp` evaluates floating-point class/property predicates and writes a predicate result. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `testp` + +## Key Constraints + +- Predicate selector (`nan`, `finite`, and related forms) controls the test semantics. +- Destination is a predicate register and is typically consumed by branch/selection instructions. +- Type suffix and selector must match a legal ISA form. + +## Usage Notes + +- Use `testp` to isolate exceptional-value handling into explicit predicate paths. +- Pair with `selp` for branchless fallback selection when divergence is undesirable. 
+ +## Example (PTX style) + +```ptx +testp.nan.f32 p, a; +``` + +## Official Source Links (fact check) + +- testp: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-testp +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/DOC.md b/content/cuda/docs/ptx/instructions/integer/DOC.md new file mode 100644 index 00000000..d74f05e0 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/DOC.md @@ -0,0 +1,74 @@ +--- +name: ptx-integer-instructions +description: "PTX integer arithmetic instructions and constraints for ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,integer,arithmetic" +--- + +# PTX Integer Arithmetic + +This page covers the core semantics and practical constraints of PTX integer arithmetic instruction families. + +## Common Instructions + +- `add` / `sub` / `mul` +- `mad` (multiply-add) +- `div` / `rem` +- `abs` / `neg` + +## Syntax Example (PTX Style) + +```ptx +add.s32 d, a, b; +mad.lo.s32 d, a, b, c; +``` + +## Constraints and Pitfalls + +- `.s*` / `.u*` must match both the register types and the operation semantics. +- Variants such as `mad` should be checked for high/low-part selection and rounding behavior. +- Different bit-widths and variants may be restricted by PTX ISA / Target ISA requirements. + +## Usage Recommendations + +- Prefer keeping clearly defined signed/unsigned semantics within the same code region. +- When dealing with overflow semantics, do not rely on the compiler to automatically infer behavior. 
+ +## Official Source Links (Fact Check) + +- Integer Arithmetic Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions +- add: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-add +- mad: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mad +- mul: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mul + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics +- `references/setp.md` +- `references/selp.md` +- `references/brev.md` +- `references/bfind.md` +- `references/bfe.md` +- `references/bfi.md` +- `references/prmt.md` +- `references/lop3.md` +- `references/popc.md` +- `references/sad.md` +- `references/mul24.md` +- `references/mad24.md` +- `references/clz.md` +- `references/and.md` +- `references/xor.md` +- `references/shf.md` +- `references/or.md` +- `references/not.md` +- `references/shl.md` +- `references/shr.md` +- `references/min.md` +- `references/max.md` +- `references/div.md` diff --git a/content/cuda/docs/ptx/instructions/integer/references/and.md b/content/cuda/docs/ptx/instructions/integer/references/and.md new file mode 100644 index 00000000..66a9c280 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/and.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: and + +`and` performs a bitwise AND and is a fundamental operation in the Logic and Shift instruction family. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `and` + +## Key Constraints + +- Operand width/type suffixes must match (`.b16/.b32/.b64` forms as applicable). +- Inputs must already be normalized to the intended bit-width domain. +- Mask constants should use explicit width to avoid unintended sign/width propagation. + +## Usage Notes + +- Use `and` for deterministic mask extraction before shifts or comparisons. 
+- In packed-field code, pair with `shl/shr/bfe` to keep bit positions explicit. + +## Example (PTX Style) + +```ptx +and.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- and: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-and +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/bfe.md b/content/cuda/docs/ptx/instructions/integer/references/bfe.md new file mode 100644 index 00000000..68d0f2c4 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/bfe.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bfe + +`bfe` (bit-field extract) extracts a specified bit-field from a source value. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `bfe` +- Commonly used for packed data decoding and field extraction + +## Key Constraints + +- The start bit and length parameters must satisfy the bit-width range. +- The signed/unsigned extraction semantics are determined by the variant suffix. + +## Usage Notes + +- Prefer `bfe` over ad hoc mask/shift chains when decoding packed metadata fields. +- Keep `pos/len` constants explicit and centralized to avoid layout drift bugs. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bfe.u32 d, a, pos, len; +``` + +## Official Source Links (Fact Check) + +- bfe: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-bfe +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/bfi.md b/content/cuda/docs/ptx/instructions/integer/references/bfi.md new file mode 100644 index 00000000..52f4b919 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/bfi.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bfi + +`bfi` (bit-field insert) writes a field into a target bit range. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `bfi` +- Often used together with `bfe` for packed data encoding + +## Key Constraints + +- The insert-range parameters must be within the target bit-width range. +- The combination of source field width and position must satisfy the variant definition. + +## Usage Notes + +- Use `bfi` to update packed headers/flags without disturbing unaffected bit fields. +- Pair with `bfe` in encode/decode pipelines to keep bit-layout contracts symmetric. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bfi.b32 d, a, b, pos, len; +``` + +## Official Source Links (Fact Check) + +- bfi: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-bfi +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/bfind.md b/content/cuda/docs/ptx/instructions/integer/references/bfind.md new file mode 100644 index 00000000..ed58dcdf --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/bfind.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bfind + +`bfind` finds the bit position of the most significant non-sign bit in the source operand (the `.shiftamt` variant instead returns the shift amount needed to left-justify that bit). + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `bfind` +- Suitable for bit scanning, normalization, and encoding optimization paths + +## Key Constraints + +- If no non-sign bit is found (for example, an all-zero input), the result is all ones (0xFFFFFFFF). +- The type/bit-width must match the suffix and the destination register. + +## Usage Notes + +- Use `bfind` for fast position lookup in sparse-bit masks and encoding routines. +- Guard zero-input handling explicitly when downstream logic assumes a valid bit index. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bfind.u32 d, a; +``` + +## Official Source Links (Fact Check) + +- bfind: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-bfind +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/brev.md b/content/cuda/docs/ptx/instructions/integer/references/brev.md new file mode 100644 index 00000000..c0626d47 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/brev.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: brev + +`brev` performs a bit reverse and is commonly used for bit-manipulation rearrangement and index transformations. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `brev` +- Commonly used in scenarios that require bit-level reversed mappings + +## Key Constraints + +- The input/output bit widths must match the instruction variant. +- It only changes bit ordering; it does not extend arithmetic semantics. + +## Usage Notes + +- Useful for bit-reversed indexing patterns and bitstream transformations. +- Keep post-transform masking explicit when only subsets of bits are semantically valid. 
+ +## Example (PTX Style) + +```ptx +brev.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- brev: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-brev +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/clz.md b/content/cuda/docs/ptx/instructions/integer/references/clz.md new file mode 100644 index 00000000..465c03eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/clz.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: clz + +`clz` (count leading zeros) counts the number of consecutive zero bits starting from the most significant bit. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `clz` + +## Key Constraints + +- When the input is 0, result is the operand bit width for the corresponding variant. +- The bit-width suffix must match the register type. + +## Usage Notes + +- Use `clz` as a primitive for normalization, integer `log2` approximations, and bit-scan helpers. +- Keep input width explicit (`.b32` vs `.b64`) when results are consumed by index arithmetic. + +## Common Failure Modes + +- Assuming zero input returns a sentinel other than operand bit width. +- Mixing 32-bit and 64-bit `clz` outputs in shared index math without conversion. 
+ +## Example (PTX Style) + +```ptx +clz.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- clz: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-clz +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/div.md b/content/cuda/docs/ptx/instructions/integer/references/div.md new file mode 100644 index 00000000..9b451848 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/div.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: div + +`div` performs division and supports different types and variant semantics. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `div` + +## Key Constraints + +- The behavior when the divisor is 0 is defined by the official specification; protect against it before use. +- Signed/unsigned division semantics differ. +- On performance-critical paths, evaluate `div` latency and consider alternative strategies. + +## Usage Notes + +- In tight loops, replace division by compile-time constants with multiply/shift transforms when valid. +- Keep explicit preconditions for divisor domain to avoid hidden exceptional-path costs. 
+ +## Example (PTX Style) + +```ptx +div.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- div: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-div +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/lop3.md b/content/cuda/docs/ptx/instructions/integer/references/lop3.md new file mode 100644 index 00000000..a5a5f8ea --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/lop3.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: lop3 + +`lop3` is a three-input lookup-table logical operation that can express any three-input boolean function. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `lop3` +- Commonly used to fuse multiple boolean instructions into a single logical operation + +## Key Constraints + +- The 8-bit immediate truth table defines the boolean function. +- The type suffix must match the input bit widths. + +## Usage Notes + +- Use `lop3` to compress multi-stage boolean logic into one instruction where possible. +- Keep LUT constants named and documented, because readability drops quickly with raw immediates. 
+ +## Example (PTX Style, Illustrative) + +```ptx +lop3.b32 d, a, b, c, immLut; +``` + +## Official Source Links (Fact Check) + +- lop3: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-lop3 +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/mad24.md b/content/cuda/docs/ptx/instructions/integer/references/mad24.md new file mode 100644 index 00000000..9af5e531 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/mad24.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: mad24 + +`mad24` adds a third operand on top of the `mul24` result and supports variants such as `.hi`/`.lo` and saturated modes. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `mad24` + +## Official Syntax (Excerpt) + +```ptx +mad24.mode.type d, a, b, c; +mad24.hi.sat.s32 d, a, b, c; +``` + +## Key Semantics + +- `.lo`: adds `c` to the low 32 bits of a 24x24 product. +- `.hi`: adds `c` to the high 32 bits of a 24x24 product. +- `.hi` may be slower when there is no dedicated 24-bit multiplication hardware. 
+ +## Example (PTX Style) + +```ptx +mad24.lo.s32 d, a, b, c; +``` + +## Official Source Links (Fact Check) + +- mad24: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mad24 +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/max.md b/content/cuda/docs/ptx/instructions/integer/references/max.md new file mode 100644 index 00000000..dbffde0b --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/max.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: max + +`max` returns the larger of two operands and is commonly used for threshold clamping and range constraints. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `max` + +## Key Constraints + +- Semantics depend on the data type and the variant suffix. +- For floating-point variants, refer to the official NaN semantics. + +## Usage Notes + +- Use signed variants for signed ranges and unsigned variants for raw bit-pattern ranges. +- For clamp logic, combine `max` and `min` in a fixed order to keep behavior predictable. + +## Common Failure Modes + +- Signed/unsigned variant mismatch causes incorrect ordering around high-bit values. +- Floating-point `max` behavior is assumed identical to host-language helper semantics without NaN checks. 
+ +## Example (PTX Style) + +```ptx +max.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- max: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-max +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/min.md b/content/cuda/docs/ptx/instructions/integer/references/min.md new file mode 100644 index 00000000..b4e82664 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/min.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: min + +`min` returns the smaller of two operands and supports integer/float variants (as defined in the official section). + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `min` + +## Key Constraints + +- Result semantics are determined by the type and suffix. +- For floating-point comparison paths, pay attention to NaN handling (see the corresponding section notes). + +## Usage Notes + +- Pick variant suffixes to match the intended numeric ordering (`.s*` vs `.u*`). +- Use together with `max` to build branchless bound enforcement. + +## Common Failure Modes + +- Bound-check logic reverses `min`/`max` order and silently changes clamp behavior. +- Integer and floating-point minima are mixed in shared helper paths without variant-specific handling. 
+ +## Example (PTX Style) + +```ptx +min.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- min: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-min +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/mul24.md b/content/cuda/docs/ptx/instructions/integer/references/mul24.md new file mode 100644 index 00000000..d3da559c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/mul24.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: mul24 + +`mul24` returns either the high 32 bits or the low 32 bits of a 48-bit result from a 24x24-bit multiplication (depending on the `.hi`/`.lo` mode). + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `mul24` + +## Official Syntax (Excerpt) + +```ptx +mul24.mode.type d, a, b; +``` + +## Key Semantics + +- `.lo`: returns the low 32 bits of the 48-bit product. +- `.hi`: returns the high 32 bits of the 48-bit product. +- The documentation notes that on some hardware, `.hi` may be less efficient. 
+ +## Example (PTX Style) + +```ptx +mul24.lo.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- mul24: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mul24 +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/not.md b/content/cuda/docs/ptx/instructions/integer/references/not.md new file mode 100644 index 00000000..6e5b5723 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/not.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: not + +`not` performs a bitwise inversion and is a fundamental instruction for mask construction and logical complement operations. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `not` + +## Key Constraints + +- Destination width must match the intended inversion domain. +- Inversion of packed fields should be followed by masking when only partial bits are valid. +- Do not treat `not` as arithmetic negation; semantics are bitwise inversion. + +## Usage Notes + +- Use `not` for complement masks and branchless bit-condition rewrites. +- Pair with `and` to isolate relevant inverted ranges in packed representations. 
+ +## Example (PTX Style) + +```ptx +not.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- not: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-not +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/or.md b/content/cuda/docs/ptx/instructions/integer/references/or.md new file mode 100644 index 00000000..19b1d02f --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/or.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: or + +`or` is a bitwise OR instruction and belongs to the Logic and Shift instruction family. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `or` + +## Key Constraints + +- Operand widths and type suffixes must match the selected variant. +- Bit-layout assumptions should be documented before combining packed fields. +- Use explicit constants with matching width to avoid implicit truncation confusion. + +## Usage Notes + +- Use `or` to compose flags and packed-bit fields after proper masking/shift steps. +- Prefer readable staged composition over opaque one-line bit merges in critical code. 
+ +## Example (PTX Style) + +```ptx +or.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- or: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-or +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/popc.md b/content/cuda/docs/ptx/instructions/integer/references/popc.md new file mode 100644 index 00000000..cc54d3f3 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/popc.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: popc + +`popc` (population count) counts the number of set bits in a binary value. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `popc` +- Common uses include mask counting, bitset operations, and compact encoding + +## Key Constraints + +- The input bit width determines the counting range. +- The result type must be able to hold the maximum count. + +## Usage Notes + +- Common for bitset density metrics, mask compaction prepasses, and voting summaries. +- Validate accumulator width if multiple `popc` results are aggregated. 
+
+## Example (PTX Style)
+
+```ptx
+popc.b32 d, a;
+```
+
+## Official Source Links (Fact Check)
+
+- popc: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-popc
+- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions
+
+Last cross-check date: 2026-03-19
diff --git a/content/cuda/docs/ptx/instructions/integer/references/prmt.md b/content/cuda/docs/ptx/instructions/integer/references/prmt.md
new file mode 100644
index 00000000..5e974431
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/integer/references/prmt.md
@@ -0,0 +1,31 @@
+# PTX Instruction Topic: prmt
+
+`prmt` (permute) reorders bytes/nibbles under selection control and is suitable for bit-level data rearrangement.
+
+## Official Description
+
+- Documentation section: Data Movement and Conversion Instructions: `prmt`
+- Common in encoding/decoding and data-layout adjustments
+
+## Key Constraints
+
+- The control mask determines the reorder sources and order.
+- Ensure the permute mode matches the input data layout.
+
+## Usage Notes
+
+- Prefer `prmt` for byte-lane rearrangement when scalar mask/shift sequences become instruction-heavy.
+- Keep test vectors for endianness-sensitive paths to catch layout mistakes early. 
+
+## Example (PTX Style, Illustrative)
+
+```ptx
+prmt.b32 d, a, b, c;
+```
+
+## Official Source Links (Fact Check)
+
+- prmt: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prmt
+- Data Movement and Conversion instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions
+
+Last cross-check date: 2026-03-19
diff --git a/content/cuda/docs/ptx/instructions/integer/references/sad.md b/content/cuda/docs/ptx/instructions/integer/references/sad.md
new file mode 100644
index 00000000..327d561b
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/integer/references/sad.md
@@ -0,0 +1,35 @@
+# PTX Instruction Topic: sad
+
+`sad` (sum of absolute differences) computes the sum of absolute differences and is commonly used in image processing and distance-related operators.
+
+## Official Description
+
+- Documentation section: Integer Arithmetic Instructions: `sad`
+
+## Key Constraints
+
+- Operand types and bit widths must match the variant suffix.
+- The accumulation width must be able to hold the sum result across multiple elements.
+
+## Usage Notes
+
+- Use `sad` for low-overhead distance accumulation in matching and scoring loops.
+- Validate accumulation range early when chaining multiple `sad` stages.
+
+## Common Failure Modes
+
+- Accumulation width is too narrow for multi-stage reductions and overflows silently.
+- Input packing assumptions differ between producer and `sad` consumer paths. 
+
+## Example (PTX Style, Illustrative)
+
+```ptx
+sad.u32 d, a, b, c;
+```
+
+## Official Source Links (Fact Check)
+
+- sad: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-sad
+- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions
+
+Last cross-check date: 2026-03-19
diff --git a/content/cuda/docs/ptx/instructions/integer/references/selp.md b/content/cuda/docs/ptx/instructions/integer/references/selp.md
new file mode 100644
index 00000000..e3fd8db0
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/integer/references/selp.md
@@ -0,0 +1,32 @@
+# PTX Instruction Topic: selp
+
+`selp` selects between two operands based on a predicate and is commonly used for branchless conditional assignment.
+
+## Official Description
+
+- Documentation section: Comparison and Selection Instructions: `selp`
+- Commonly used as an alternative to simple if/else to reduce branch divergence
+
+## Key Constraints
+
+- The predicate operand must be a valid predicate.
+- The source/destination types must match the `selp` suffix.
+- When strict numeric semantics are required, ensure that the value types are fully consistent.
+
+## Usage Notes
+
+- Use `selp` to remove short divergent branches when both candidate values are already available.
+- Keep expensive side-effecting work outside `selp` paths because values are produced before selection. 
+ +## Example (PTX Style) + +```ptx +selp.s32 d, a, b, p; +``` + +## Official Source Links (Fact Check) + +- selp: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions-selp +- Comparison and Selection instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/setp.md b/content/cuda/docs/ptx/instructions/integer/references/setp.md new file mode 100644 index 00000000..a7cb1d21 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/setp.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: setp + +`setp` is a core instruction that compares and writes a predicate register, used to build conditional branching and predicated (masked) execution. + +## Official Description + +- Documentation section: Comparison and Selection Instructions: `setp` +- Generates a predicate result based on the comparison relation; commonly used with `@p bra` and `selp` + +## Key Constraints + +- The comparison operand types must match the variant suffix. +- The result is written to a predicate register and can be used later as a predication condition. +- For floating-point comparisons, pay attention to NaN-related comparison semantics. + +## Usage Notes + +- Use `setp + selp` for branchless value selection in divergence-sensitive paths. +- Keep predicate lifetimes short and explicit to avoid accidental predicate reuse bugs. 
+ +## Example (PTX Style) + +```ptx +setp.lt.s32 p, a, b; +@p bra L_true; +``` + +## Official Source Links (Fact Check) + +- setp: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions-setp +- Comparison and Selection instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/shf.md b/content/cuda/docs/ptx/instructions/integer/references/shf.md new file mode 100644 index 00000000..ef268de5 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/shf.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: shf + +`shf` provides shift/concatenation semantics that combine left and right operands (see the specific variants in the official section). + +## Official Description + +- Documentation section: Logic and Shift Instructions: `shf` + +## Key Constraints + +- The shift amount and mode must follow the variant definition. +- Commonly used for wide-data rearrangement and efficient shift sequences. + +## Usage Notes + +- Use `wrap` forms for rotate-like behavior and `clamp` forms for bounded lane extraction behavior. +- Prefer `shf` over manual shift/or sequences when modeling cross-word shifts. + +## Common Failure Modes + +- `wrap` and `clamp` semantics are confused, causing incorrect bit propagation. +- Shift-count origin is not normalized and produces architecture-dependent behavior in edge cases. 
+ +## Example (PTX Style, Illustrative) + +```ptx +shf.l.wrap.b32 d, a, b, c; +``` + +## Official Source Links (Fact Check) + +- shf: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shf +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/shl.md b/content/cuda/docs/ptx/instructions/integer/references/shl.md new file mode 100644 index 00000000..8ee34659 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/shl.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: shl + +`shl` is a left-shift instruction, used for bit extension and constructing high-bit alignment. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `shl` + +## Key Constraints + +- The shift amount should be within the legal range for the bit width. +- For computations related to signed semantics, carefully verify overflow behavior. + +## Usage Notes + +- Treat `shl` as a bit operation, not a safe arithmetic multiply substitute under overflow-sensitive logic. +- Keep shift-count provenance explicit when inputs may exceed legal ranges. + +## Common Failure Modes + +- Shift counts exceed legal bit width assumptions and produce unexpected masked behavior. +- Arithmetic intent is encoded with `shl` where overflow handling is actually required. 
+ +## Example (PTX Style) + +```ptx +shl.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- shl: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shl +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/shr.md b/content/cuda/docs/ptx/instructions/integer/references/shr.md new file mode 100644 index 00000000..c754681e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/shr.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: shr + +`shr` is a right-shift instruction that supports logical/arithmetic right shifts (depending on the variant suffix). + +## Official Description + +- Documentation section: Logic and Shift Instructions: `shr` + +## Key Constraints + +- The signed/unsigned suffix affects the high-bit fill semantics. +- The shift amount must be within the allowed bit-width range. + +## Usage Notes + +- Use signed variants for arithmetic right shift and unsigned variants for logical right shift. +- Audit downstream mask/extract logic when switching between `.s*` and `.u*` variants. + +## Common Failure Modes + +- Logical right shift is expected but arithmetic variant is used under signed types. +- Post-shift masking is omitted when consumers assume zero-filled high bits. 
+ +## Example (PTX Style) + +```ptx +shr.u32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- shr: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shr +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/xor.md b/content/cuda/docs/ptx/instructions/integer/references/xor.md new file mode 100644 index 00000000..89408c33 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/xor.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: xor + +`xor` performs a bitwise XOR and is commonly used for mask toggling and simple encryption/checksum paths. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `xor` + +## Key Constraints + +- Operand width/type suffixes must match legal ISA variants. +- For parity/checksum style paths, define whether truncation at each stage is acceptable. +- Avoid mixing signed arithmetic assumptions with pure bitwise transformations. + +## Usage Notes + +- Use `xor` for parity checks, mask toggles, and cheap difference markers. +- In lock-free protocols, avoid overloading `xor` logic with unclear state encoding. 
+ +## Example (PTX Style) + +```ptx +xor.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- xor: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-xor +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/special-registers/DOC.md b/content/cuda/docs/ptx/instructions/special-registers/DOC.md new file mode 100644 index 00000000..c0829c7f --- /dev/null +++ b/content/cuda/docs/ptx/instructions/special-registers/DOC.md @@ -0,0 +1,49 @@ +--- +name: ptx-special-registers +description: "PTX special registers reference for ISA 9.2 with common usage patterns." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,special-registers,tid,ctaid" +--- + +# PTX Special Registers + +Special registers provide execution context such as thread indices, grid information, and SM-related details. + +## Common Registers + +- `%tid`: thread index within the CTA +- `%ntid`: CTA dimensions +- `%ctaid`: CTA index within the thread grid +- `%nctaid`: total number of CTAs in the grid (per dimension) +- `%smid`: SM ID (target related) + +## Usage Notes + +- Rely on special registers directly only when low-level control is truly needed. +- When inferring scheduling/topology, first verify that the target ISA is supported and the semantics are stable. 
+
+## Example
+
+```ptx
+mov.u32 r0, %tid.x;
+mov.u32 r1, %ctaid.x;
+```
+
+## Official Source Links (Fact Check)
+
+- Special Registers: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers
+- %tid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-tid
+- %ctaid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-ctaid
+- %smid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-smid
+
+Last cross-check date: 2026-03-19
+
+## Single-instruction Topics
+
+- `references/tid-ctaid.md`
+- `references/activemask.md`
diff --git a/content/cuda/docs/ptx/instructions/special-registers/references/activemask.md b/content/cuda/docs/ptx/instructions/special-registers/references/activemask.md
new file mode 100644
index 00000000..bef3a82a
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/special-registers/references/activemask.md
@@ -0,0 +1,32 @@
+# PTX Instruction Topic: activemask
+
+`activemask` is used to retrieve the current active thread mask and is commonly used in warp-level cooperative algorithms.
+
+## Official Description
+
+- Documentation section: Parallel Synchronization and Communication Instructions: `activemask`
+- Commonly used together with warp primitives such as `shfl.sync` and `vote.sync`
+
+## Key Constraints
+
+- The mask value reflects the set of active threads at the current execution point.
+- If used after branch divergence, ensure the mask semantics are well understood.
+
+## Usage Notes
+
+- Read `activemask` as late as possible on the path that consumes it.
+- Keep `membermask` derivation stable when chaining `shfl.sync` and `vote.sync`.
+- Avoid reusing masks captured before divergence points. 
+
+## Example (PTX Style, Illustrative)
+
+```ptx
+activemask.b32 r_mask;
+```
+
+## Official Source Links (Fact Check)
+
+- activemask: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-activemask
+- Parallel synchronization and communication instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions
+
+Last cross-check date: 2026-03-19
diff --git a/content/cuda/docs/ptx/instructions/special-registers/references/tid-ctaid.md b/content/cuda/docs/ptx/instructions/special-registers/references/tid-ctaid.md
new file mode 100644
index 00000000..9c7b5948
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/special-registers/references/tid-ctaid.md
@@ -0,0 +1,33 @@
+# PTX Instruction Topic: %tid and %ctaid
+
+`%tid` and `%ctaid` are the most commonly used index registers. They represent a thread's position within a CTA and the CTA's position within the grid, respectively.
+
+## Typical Usage
+
+```ptx
+mov.u32 r_tid, %tid.x;
+mov.u32 r_cta, %ctaid.x;
+```
+
+## Usage Notes
+
+- `%tid` / `%ctaid` are read-only special registers.
+- The dimension components (`.x/.y/.z`) must match how the kernel is organized.
+
+## Common Failure Modes
+
+- Assuming 1D launch indexing while kernels are configured as 2D/3D.
+- Mixing CTA-level and global index formulas across helper functions.
+- Recomputing indices with mismatched integer width when problem size exceeds 32-bit ranges.
+
+## Indexing Reminder
+
+- Build global index formulas with explicit dimension strides (`blockDim` and `gridDim`) to avoid shape-dependent bugs. 
+
+## Official Source Links (Fact Check)
+
+- Special Registers: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers
+- %tid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-tid
+- %ctaid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-ctaid
+
+Last cross-check date: 2026-03-19
diff --git a/content/cuda/docs/ptx/instructions/sync-comm/DOC.md b/content/cuda/docs/ptx/instructions/sync-comm/DOC.md
new file mode 100644
index 00000000..3bb0a928
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/sync-comm/DOC.md
@@ -0,0 +1,66 @@
+---
+name: ptx-sync-comm-instructions
+description: "PTX synchronization and communication instructions with scope-aware usage in ISA 9.2."
+metadata:
+  languages: "cpp"
+  versions: "9.2"
+  revision: 2
+  updated-on: "2026-03-19"
+  source: official
+  tags: "cuda,ptx,synchronization,mbarrier,barrier"
+---
+
+# PTX Synchronization and Communication
+
+This page covers core synchronization and communication primitives such as `barrier`, `mbarrier`, `atom`, `red`, and `fence`.
+
+## Official Semantics Excerpts (Key Points)
+
+- PTX documentation notes: asynchronous copy completion can be tracked via async-group or mbarrier mechanisms.
+- For `cp.async`, if you do not use `wait_group/wait_all` or an mbarrier, the synchronization relationship does not hold.
+- `cp.async.bulk`-related `complete-tx` operations on mbarrier provide `.release` and `.cluster` semantics (see the section definitions).
+
+## Common Patterns
+
+```ptx
+// Initiate the async transfer first, then observe completion via mbarrier.
+cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dst], [src], size, [mbar];
+// ...
+mbarrier.test_wait.acquire.cta.shared::cta.b64 p, [mbar], state;
+```
+
+## Usage Notes
+
+- First determine the scope, then apply semantic modifiers (acquire/release/relaxed).
+- Explicitly connect the producer completion point to the consumer-visible point. 
+- When using `atom` together with async copies, carefully review ordering relationships. + +## Official Source Links (Fact Check) + +- Parallel Synchronization and Communication Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- barrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-barrier +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics + +- `references/mbarrier-test-wait.md` +- `references/barrier.md` +- `references/atom.md` +- `references/membar-fence.md` +- `references/red.md` +- `references/elect-sync.md` +- `references/bar-sync.md` +- `references/atom-cas.md` +- `references/vote-sync.md` +- `references/match-sync.md` +- `references/shfl-sync.md` +- `references/mbarrier-arrive.md` +- `references/redux-sync.md` +- `references/mbarrier-arrive-drop.md` +- `references/cp-async-mbarrier-arrive.md` +- `references/bar-warp-sync.md` +- `references/fence-proxy.md` +- `references/membar-proxy.md` diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/atom-cas.md b/content/cuda/docs/ptx/instructions/sync-comm/references/atom-cas.md new file mode 100644 index 00000000..ae5ee78b --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/atom-cas.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: atom.cas + +`atom.cas` provides compare-and-swap atomic semantics and is a commonly used foundation instruction for lock-free data structures. + +## Official Notes + +- As part of the `atom` family, it has variants distinguished by address space and type. 
+- The documentation lists version and architecture requirements for some low-bit-width variants (e.g., `atom.cas.b16`).
+
+## Usage Notes
+
+- Build a lock-free update path by combining CAS with retry loops.
+- Clearly specify scope and semantic modifiers to avoid cross-thread visibility issues.
+- Ensure the target address is naturally aligned for the selected data width.
+- Keep producer/consumer memory-order assumptions consistent with the selected atom semantics.
+
+## Common Failure Modes
+
+- CAS retry loops omit backoff under heavy contention and stall forward progress.
+- `expected` value reuse is incorrect after failed CAS attempts.
+- Scope/semantic modifiers do not match producer-consumer visibility requirements.
+
+## Example (PTX style)
+
+```ptx
+atom.gpu.global.cas.b32 old, [addr], expected, desired;
+```
+
+## Official Source Links (Fact Check)
+
+- atom: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom
+- atom.cas notes in atom section: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom
+- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model
+
+Last cross-check date: 2026-03-19
diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/atom.md b/content/cuda/docs/ptx/instructions/sync-comm/references/atom.md
new file mode 100644
index 00000000..84d3368a
--- /dev/null
+++ b/content/cuda/docs/ptx/instructions/sync-comm/references/atom.md
@@ -0,0 +1,33 @@
+# PTX Instruction Topic: atom
+
+`atom` provides atomic read-modify-write operations for concurrently updating shared/global state.
+
+## Official Description
+
+- Documentation section: Parallel Synchronization and Communication Instructions: `atom`
+- Common operations include add/min/max/cas/exch, etc. 
(depending on the type and state space) + +## Key Constraints + +- The combination of operand type and state space must match the specified variant. +- The memory semantics (e.g., acquire/release/relaxed) and the scope must satisfy synchronization requirements. +- Choosing the wrong scope can lead to results that look correct but are concurrency-unstable. + +## Usage Notes + +- Use the narrowest valid scope (`cta` before `gpu`/`sys`) to reduce coherence traffic. +- Prefer warp/block local aggregation before global atomics under high contention. + +## Example (PTX Style) + +```ptx +atom.global.add.u32 r_old, [addr], r_val; +``` + +## Official Source Links (Fact Check) + +- atom: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model +- Scope and applicability: https://docs.nvidia.com/cuda/parallel-thread-execution/#scope-and-applicability-of-the-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/bar-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-sync.md new file mode 100644 index 00000000..92a3bc57 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-sync.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: bar.sync + +`bar.sync` is a commonly used barrier synchronization form that waits for the participating threads to rendezvous before continuing. + +## Official Notes + +- Supports variants with no thread count as well as variants with a thread count (see the section examples). +- Commonly used for phase transitions within a CTA and as boundaries for shared-memory reads/writes. + +## Example (PTX Style) + +```ptx +bar.sync 0; +bar.sync 1, 64; +``` + +## Usage Notes + +- Synchronize only the set of threads that participate in the same barrier protocol. 
+- Cannot replace specialized completion-wait mechanisms for `cp.async` / `wgmma`. +- Keep barrier identifier usage deterministic across all participating threads. +- Prefer full-CTA barriers unless a subset barrier protocol is explicitly designed and verified. + +## Common Failure Modes + +- Barrier id is reused by overlapping protocols in the same kernel phase. +- Some participant threads bypass the barrier due to conditional control flow. +- Barrier placement is correct for compute but misses shared-memory producer-consumer boundaries. + +## Official Source Links (Fact Check) + +- barrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-barrier +- bar.sync examples context: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/bar-warp-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-warp-sync.md new file mode 100644 index 00000000..d78ac5eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-warp-sync.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bar.warp.sync + +`bar.warp.sync` provides a warp-level synchronization barrier and is used for phase synchronization within a warp. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `bar.warp.sync` +- Finer-grained than CTA-level barriers + +## Key Constraints + +- The participation mask must match the threads that actually participate. +- Should not be used as a substitute for synchronization primitives across warps/CTAs. + +## Usage Notes + +- Prefer `bar.warp.sync` for intra-warp phase boundaries with explicit member masks. +- Recompute/propagate `membermask` carefully after divergent control flow. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bar.warp.sync membermask; +``` + +## Official Source Links (Fact Check) + +- bar.warp.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-bar-warp-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/barrier.md b/content/cuda/docs/ptx/instructions/sync-comm/references/barrier.md new file mode 100644 index 00000000..9968b501 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/barrier.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: barrier + +The `barrier` family is used for thread-cooperative synchronization and is commonly used for phase transitions at the CTA/cluster level. + +## Official Description + +- Use `barrier` when you need threads to rendezvous before continuing. +- When you need to track completion of an asynchronous transfer, prefer the async-group / mbarrier mechanism specified in the documentation; do not use `barrier` as a substitute. + +## Key Constraints + +- All intended participants must reach the same barrier protocol point. +- Do not mix barrier identifiers/protocols across incompatible control paths. +- Use warp-level primitives instead when only warp-scope coordination is required. + +## Usage Notes + +- Reserve one barrier id per protocol stage to keep code auditing straightforward. +- Keep barrier placement symmetric across control-flow paths for all participating threads. 
+ +## Example (PTX style) + +```ptx +barrier.sync 0; +``` + +## Official Source Links (Fact Check) + +- barrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-barrier +- Parallel synchronization instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/cp-async-mbarrier-arrive.md b/content/cuda/docs/ptx/instructions/sync-comm/references/cp-async-mbarrier-arrive.md new file mode 100644 index 00000000..ced7db53 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/cp-async-mbarrier-arrive.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: cp.async.mbarrier.arrive + +`cp.async.mbarrier.arrive` maps “completion of a prior `cp.async` operation” to an mbarrier arrive-on event. + +## Official Syntax + +```ptx +cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr]; +``` + +## Key Semantics + +- The system triggers the arrive-on operation on the mbarrier upon completion of all `cp.async` operations initiated earlier by the current thread. +- The arrive-on operation is asynchronous with respect to the execution of `cp.async.mbarrier.arrive` itself. +- The documentation describes the ordering relationship with the prior `cp.async` and it is commonly used with `mbarrier.test_wait`. + +## Usage Notes + +- Use it to incorporate `cp.async` completion events into a unified mbarrier protocol. +- Keep it consistent with the participation count used by `mbarrier.init` to avoid count mismatches. +- Pair with explicit wait/check points before consumer loads from the staged region. +- Keep each async pipeline stage on a clear phase contract to avoid cross-stage completion confusion. 
+ +## Common Failure Modes + +- Completion events are wired to the wrong barrier instance in multi-stage pipelines. +- Stage counters are updated without matching `arrive` expectations. +- Consumer paths assume arrival implies full protocol completion without wait checks. + +## Official Source Links (Fact Check) + +- cp.async.mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- mbarrier.test_wait / try_wait: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/elect-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/elect-sync.md new file mode 100644 index 00000000..aedc7d24 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/elect-sync.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: elect.sync + +`elect.sync` elects a representative thread within a synchronization mask scope and is commonly used for role assignment within a warp. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `elect.sync` +- Produces a consistent “elected thread” result across the participating thread set + +## Usage Notes + +- Use when you need a single thread to execute management logic (e.g., writing shared metadata). +- Combine with synchronization primitives such as `bar` / `mbarrier` to ensure phase consistency. +- Ensure all participating threads execute with a consistent `membermask`. +- Pair leader-election paths with explicit broadcast or shared-memory publication when followers consume leader results. 
+ +## Common Failure Modes + +- Leader path writes metadata without synchronization before follower reads. +- Different `membermask` values are used across divergent paths in the same warp. +- Elected-lane assumptions are hard-coded and break under changed active-lane patterns. + +## Example (PTX Style, Illustrative) + +```ptx +elect.sync _|%p, membermask; +@%p bra leader_path; // elected-thread path +``` + +## Official Source Links (Fact Check) + +- elect.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-elect-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/fence-proxy.md b/content/cuda/docs/ptx/instructions/sync-comm/references/fence-proxy.md new file mode 100644 index 00000000..905e9352 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/fence-proxy.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: fence.proxy + +`fence.proxy` establishes ordering relationships across proxies, especially for synchronization between proxies such as generic/async/tensormap. + +## Official Syntax (Excerpt) + +```ptx +fence.proxy.proxykind; +fence.proxy.to_proxykind::from_proxykind.release.scope; +fence.proxy.to_proxykind::from_proxykind.acquire.scope [addr], size; +``` + +## Key Semantics + +- Addresses ordering issues when the same memory location is accessed through different proxies. +- `fence.proxy.async` is used to synchronize between generic proxy and async proxy. +- The documentation provides the version and target-architecture requirements for `fence.proxy.async`. + +## Usage Notes + +- Apply `fence.proxy` only where cross-proxy visibility is a real requirement. +- Keep proxy-domain assumptions explicit in comments/protocol docs to avoid misuse. 
+ +## Common Failure Modes + +- Generic-proxy ordering is assumed to cover async/tensormap proxy access without explicit fence rules. +- Acquire/release direction is reversed for producer-consumer handoff. +- Fence scope is too narrow for the actual sharing domain. + +## Official Source Links (Fact Check) + +- membar / fence (including fence.proxy): https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- Asynchronous operations notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/match-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/match-sync.md new file mode 100644 index 00000000..d8bb154e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/match-sync.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: match.sync + +`match.sync` performs value matching within a synchronization mask scope and is used for warp-level grouping and consistency checks. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `match.sync` +- Can be used to build warp-level cooperative logic grouped by key + +## Key Constraints + +- The comparison value types must match the requirements of the specific variant. +- The participation mask must match the execution path to avoid distorted results. + +## Usage Notes + +- Use for warp-level key grouping before subgroup-local reductions or dispatch. +- Validate mask consistency in debug builds for paths with complex divergence. 
+ +## Example (PTX Style, Illustrative) + +```ptx +match.any.sync.b32 mask_out, value, membermask; +``` + +## Official Source Links (Fact Check) + +- match.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-match-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive-drop.md b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive-drop.md new file mode 100644 index 00000000..23b32dd1 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive-drop.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: mbarrier.arrive_drop + +`mbarrier.arrive_drop` removes the current thread from the set of subsequent participants while also performing the arrive-on action. + +## Official Syntax (Excerpt) + +```ptx +mbarrier.arrive_drop{.sem.scope}{.shared{::cta}}.b64 state, [addr]{, count}; +mbarrier.arrive_drop{.sem.scope}{.shared::cluster}.b64 _, [addr]{, count}; +mbarrier.arrive_drop.noComplete{.release.cta}{.shared{::cta}}.b64 state, [addr], count; +``` + +## Key Semantics + +- Used by threads that “exit/not participate anymore” in the mbarrier protocol. +- The `.release` variant forms a release pattern and can synchronize with an acquire side. +- If the `.noComplete` variant leads to the phase completing, the behavior is undefined. +- In scenarios that use only `.shared::cluster` (not the current CTA), the destination operand must be `_`. + +## Usage Notes + +- Use `arrive_drop` when thread participation shrinks across pipeline phases. +- Reconcile participant counts with `mbarrier.init` contract to avoid deadlocks. 
+ +## Official Source Links (Fact Check) + +- mbarrier.arrive_drop: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop +- mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-arrive +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive.md b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive.md new file mode 100644 index 00000000..cdff6f21 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: mbarrier.arrive + +`mbarrier.arrive` performs an arrive-on operation on a specified mbarrier and is a commonly used producer-side primitive for asynchronous workflows. + +## Official Syntax (Excerpt) + +```ptx +mbarrier.arrive{.sem.scope}{.shared{::cta}}.b64 state, [addr]{, count}; +mbarrier.arrive.expect_tx{.sem.scope}{.shared{::cta}}.b64 state, [addr], txCount; +mbarrier.arrive.noComplete{.release.cta}{.shared{::cta}}.b64 state, [addr], count; +``` + +## Key Semantics + +- For a `.shared::cta` mbarrier, an opaque `state` value can be returned to represent the phase. +- For scenarios that use only `.shared::cluster` (not the current CTA), the target operand must be the sink `_`. +- The `.noComplete` variant must not cause the current phase to complete; otherwise, behavior is undefined. +- The `.release` semantics can synchronize with a consumer-side acquire mode. + +## Usage Notes + +- Use `state` together with `mbarrier.test_wait/try_wait` to avoid phase-mixing confusion. +- For remote cluster barrier scenarios, strictly follow the sink rules. 
+ +## Official Source Links (Fact Check) + +- mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-arrive +- mbarrier.test_wait / try_wait: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-test-wait.md b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-test-wait.md new file mode 100644 index 00000000..279c53d7 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-test-wait.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: mbarrier.test_wait / mbarrier.try_wait + +`mbarrier.test_wait` / `mbarrier.try_wait` are used to test whether an mbarrier phase has completed and are commonly used wait primitives on the consumer side for asynchronous transfers. + +## Official Syntax (Excerpt) + +```ptx +mbarrier.test_wait{.sem.scope}{.shared{::cta}}.b64 waitComplete, [addr], state; +mbarrier.test_wait.parity{.sem.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity; +``` + +## Key Semantics + +- `test_wait` is a non-blocking test. +- When used with `.acquire` and returning `True`, it forms an acquire mode (see the memory model section). +- `.scope` defaults to `.cta` when not explicitly specified. + +## Version and Target + +- Documentation indicates `mbarrier.test_wait` was introduced in PTX ISA 7.0. +- Documentation indicates it requires `sm_80` or higher. 
+ +## Minimal Mode + +```ptx +mbarrier.test_wait.shared::cta.b64 p, [mbar_addr], state; +@!p bra retry; +``` + +## Official Source Links (Fact Check) + +- mbarrier.test_wait / try_wait: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +- mbarrier family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/membar-fence.md b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-fence.md new file mode 100644 index 00000000..0c66feaf --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-fence.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: membar / fence + +`membar`/`fence` establish ordering for memory accesses and are fundamental primitives for correctness in concurrent execution. + +## Official Syntax (Excerpt) + +```ptx +membar.gl; +membar.cta; +membar.sys; +fence.sc.cta; +fence.sc.cluster; +``` + +## Key Semantics + +- `membar` ensures that prior memory accesses in the current thread are observed before subsequent accesses at the specified level. +- The documentation explains that `fence.sc` can restore sequential consistency at sufficient locations, but with a higher cost. +- On `sm_70+`, the semantic relationship between `membar` and `fence.sc` is clearly documented as being compatible (see the section notes). 
+ +## Version and Target + +- `membar.{cta,gl}`: introduced in PTX ISA 1.4 +- `membar.sys`: introduced in PTX ISA 2.0, requires `sm_20+` +- `membar.proxy` / `fence.proxy`: introduced in PTX ISA 7.5 + +## Official Source Links (Fact Check) + +- membar / fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model +- Scope and applicability: https://docs.nvidia.com/cuda/parallel-thread-execution/#scope-and-applicability-of-the-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/membar-proxy.md b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-proxy.md new file mode 100644 index 00000000..cd5b0cf2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-proxy.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: membar.proxy + +`membar.proxy` is a cross-proxy ordering primitive, historically tied to `fence.proxy` via semantic mapping. + +## Official Description + +- Defined in the `membar/fence` section as the relationship between `membar.proxy` and `fence.proxy`. +- The documentation notes that on `sm_70+`, `membar.proxy` and `fence.proxy` are synonymous. + +## Version and Target + +- `membar.proxy` / `fence.proxy`: introduced in PTX ISA 7.5 +- `membar.proxy`: requires `sm_60+` +- `fence.proxy`: requires `sm_70+` + +## Usage Notes + +- Use proxy fences only when data crosses proxy domains (for example, async-proxy to generic-proxy handoff). +- Do not substitute proxy fences for full protocol synchronization (`mbarrier`/barrier) when completion must also be tracked. 
+ +## Example (PTX style) + +```ptx +membar.proxy.alias; +``` + +## Official Source Links (Fact Check) + +- membar / fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- PTX ISA notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/red.md b/content/cuda/docs/ptx/instructions/sync-comm/references/red.md new file mode 100644 index 00000000..973ec0ef --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/red.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: red + +`red` is a parallel reduction-update instruction family: it performs an atomic reduction on a specified memory location and writes the result back to the same location (overwriting the original value). + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `red` +- Unlike `atom`, which returns the original value in a destination register, `red` performs only the reduction write-back and has no destination operand (see the corresponding ISA subsection). + +## Key Constraints + +- The operation type and the target address space must match the specific `red` variant. +- Concurrency semantics depend on the specified memory semantics and scope. +- It must be used together with consumer-side synchronization primitives to ensure visibility. + +## Usage Notes + +- Use `red` when you only need in-place accumulation and do not require the previous value. +- Combine with hierarchical reduction (warp/block first) to reduce global contention. 
+ +## Example (PTX Style) + +```ptx +red.global.add.u32 [addr], r1; +``` + +## Official Source Links (Fact Check) + +- red: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-red +- red.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-red-async +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/redux-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/redux-sync.md new file mode 100644 index 00000000..492a8b09 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/redux-sync.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: redux.sync + +`redux.sync` provides a synchronized reduction operation used for mask-based reduction computations within a thread group. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `redux.sync` +- Applicable to reduction scenarios that require a synchronized participation set + +## Key Constraints + +- `membermask` must correctly cover participating threads. +- The data type and reduction operator must match the instruction variant. +- The overall synchronization protocol must still be satisfied with subsequent consumer paths. + +## Usage Notes + +- Use `redux.sync` for compact warp-scope reductions when shared-memory staging is unnecessary. +- Keep mask construction stable across control-flow paths to avoid partial participation bugs. 
+ +## Example (PTX Style, Illustrative) + +```ptx +redux.sync.add.s32 r_out, r_in, membermask; +``` + +## Official Source Links (Fact Check) + +- redux.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/shfl-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/shfl-sync.md new file mode 100644 index 00000000..2c5c79a9 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/shfl-sync.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: shfl.sync + +`shfl.sync` exchanges register data within a warp and is commonly used for warp-level communication and reductions. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `shfl.sync` +- Commonly used for warp-level broadcast, down-scan, up-scan, and cross-lane exchange + +## Key Constraints + +- `membermask` must correctly describe the participating threads. +- lane indices and width parameters must follow the variant definition. +- Confirm that the target architecture supports this synchronized shuffle semantic before use. + +## Usage Notes + +- Use `shfl.sync` for warp-local broadcast and tree reductions to reduce shared-memory traffic. +- Keep lane mapping logic explicit when mixing `bfly`, `up`, `down`, and indexed shuffle forms. 
+ +## Example (PTX Style, Illustrative) + +```ptx +shfl.sync.bfly.b32 r_out, r_in, laneMask, clamp, membermask; +``` + +## Official Source Links (Fact Check) + +- shfl.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-shfl-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/vote-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/vote-sync.md new file mode 100644 index 00000000..828fb66e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/vote-sync.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: vote.sync + +`vote.sync` performs a boolean reduction vote within the participating threads mask and is commonly used for warp-level conditional aggregation. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `vote.sync` +- Common uses: determine whether any thread/all threads satisfy a condition + +## Key Constraints + +- The participating mask must match the actual set of active threads. +- When the vote result drives control flow, avoid mask mismatches that can cause semantic deviations. + +## Usage Notes + +- Use `vote.sync.any/all` for fast warp agreement checks before expensive work. +- Keep mask derivation adjacent to the vote site for easier correctness auditing. 
+ +## Example (PTX Style, Illustrative) + +```ptx +vote.sync.any.pred p_out, p_in, membermask; +``` + +## Official Source Links (Fact Check) + +- vote.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/DOC.md b/content/cuda/docs/ptx/instructions/tcgen05/DOC.md new file mode 100644 index 00000000..e5ccb682 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/DOC.md @@ -0,0 +1,46 @@ +--- +name: ptx-tcgen05-instructions +description: "PTX TensorCore 5th Generation (tcgen05) entry and B-series related constraints." +metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,tcgen05,tensorcore,b-series,sm100,sm120" +--- + +# PTX tcgen05 (TensorCore 5th Generation) + +This directory focuses on tcgen05 entry points and B-series architectural constraints, without duplicating the `wgmma` details already covered elsewhere. + +## Core Positioning + +- tcgen05 is the TensorCore 5th Generation entry point in the PTX documentation. +- It is tightly related to WGMMA, mixed/alternate precision, and new type qualifier constraints. +- Multiple capabilities in the documentation are bound to `sm_100`/`sm_120` family feature thresholds. 
+ +## Recommended Reading + +- `references/overview.md` +- `references/arch-gating.md` +- `references/wgmma-tcgen05-relationship.md` +- `references/b-series-checklist.md` + +## Further Reading + +- `references/tcgen05-mma-kinds.md` +- `references/tcgen05-block-scale.md` +- `references/tcgen05-sm120a-restrictions.md` +- `references/tcgen05-sm100-sm120-mapping.md` +- `references/tcgen05-alt-fp-types.md` +- `references/tcgen05-sparse-path.md` +- `references/tcgen05-migration-playbook.md` + +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- WGMMA: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/arch-gating.md b/content/cuda/docs/ptx/instructions/tcgen05/references/arch-gating.md new file mode 100644 index 00000000..c009c65f --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/arch-gating.md @@ -0,0 +1,34 @@ +# tcgen05 Architecture Gating (B-Series) + +tcgen05-related capabilities are highly coupled in the official documentation with `sm_100`/`sm_120` and the `a/f` family conditions. + +## Architecture Gating Recommendations + +- Abstract “availability” as capabilities (e.g., `has_tcgen05`, `has_alt_fp`, `has_cp_mask`). +- Filter instruction templates by capabilities before generating kernels. +- Explicitly avoid or degrade restricted types on `sm_120a` (especially sub-byte / alternate fp). + +## Usage Notes + +- Gate at kernel-generation time, not only at runtime dispatch, to avoid generating illegal templates. +- Keep capability probing and fallback policy versioned with PTX/CUDA upgrade milestones. + +## Common Failure Modes + +- Capability flags are defined but not enforced during code emission. 
+- `sm_120a` restrictions are checked for compute path but missed for data-movement path. +- Fallback kernels compile but violate numerical contract due to dtype drift. + +## Minimal Test Matrix + +- `sm_100*` dense path with baseline type combinations. +- `sm_120a` restricted-type path with fallback validation. +- `sm_120f` extended-support path with regression parity checks. + +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- cp.async.bulk.tensor restrictions context: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/b-series-checklist.md b/content/cuda/docs/ptx/instructions/tcgen05/references/b-series-checklist.md new file mode 100644 index 00000000..c37ba630 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/b-series-checklist.md @@ -0,0 +1,32 @@ +# B-Series Implementation Checklist (tcgen05-related) + +For quick verification before engineering rollout. + +## Checklist + +- [ ] `.target` matches the actual deployment architecture (`sm_100`/`sm_120`). +- [ ] All tcgen05/WGMMA variants have passed capability gating. +- [ ] Relevant async protocols (fence/commit/wait) are complete. +- [ ] `sm_120a` restricted types have been checked and have fallbacks. +- [ ] Linked scenarios with TMA paths have completed correctness regression testing. + +## Release Notes for Reviewers + +- Record the capability matrix used during generation and testing. +- Include sparse and alternate-FP coverage status explicitly in release notes. +- Document fallback behavior when tcgen05 constraints fail on target hardware. 
+ +## Minimum Evidence Package + +- One correctness report per architecture family (`sm_100*`, `sm_120*`) with capability-gated variants. +- One protocol trace confirming async fence/commit/wait ordering on representative kernels. +- One numerical report covering dense, sparse, and alternate-FP routes. + +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/overview.md b/content/cuda/docs/ptx/instructions/tcgen05/references/overview.md new file mode 100644 index 00000000..e465621c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/overview.md @@ -0,0 +1,33 @@ +# tcgen05 Overview + +tcgen05 is the entry point chapter for TensorCore 5th Generation in PTX 9.2, covering capabilities and constraints related to new-generation matrix computations. + +## Core Capability Axes + +- Which data types, MMA kinds, and qualifiers are legal on the target architecture. +- Which capability subsets are gated by `sm_100*` and `sm_120*` families. +- Which async protocols are mandatory when composed with WGMMA/TMA paths. + +## Usage Notes + +- Treat tcgen05 as a capability map, then bind concrete instruction templates after gating. +- Keep architecture, type, and protocol checks in one validation layer to avoid drift. + +## Common Failure Modes + +- Selecting a legal MMA shape with an illegal type/scale combination. 
+- Assuming support transfers across architecture variants without checking target notes. +- Reusing async synchronization recipes that are valid for WGMMA but incomplete for tcgen05 composition. + +## Quick Start Checklist + +- Confirm architecture capability before selecting instruction templates. +- Validate `kind`/type/scale combinations before code generation. +- Verify async protocol completion before accumulator consumption. + +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- WGMMA MMA Async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-alt-fp-types.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-alt-fp-types.md new file mode 100644 index 00000000..b1635146 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-alt-fp-types.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: Alternate Floating-Point Types + +This page focuses on usage constraints for common alternate FP types on the tcgen05 paths (e.g., `.e2m1/.e3m2/.e2m3`). + +## Official Notes + +- The documentation provides legal combinations of these types with `.kind`, shape, and `scale_vec_size`. +- Multiple entries explicitly tie support conditions to `sm_120a` / `sm_120f`. + +## Engineering Guidance + +- Build a separate numerical regression baseline for alternate FP paths. +- Bind the type-support matrix and architecture thresholds to the same configuration source. + +## Usage Notes + +- Keep alternate-FP enablement behind explicit feature flags in kernel selection logic. +- Store tolerance thresholds per type family instead of sharing one global tolerance. + +## Common Failure Modes + +- Alternate-FP kernels pass shape checks but fail hidden type-combination rules. 
+- Tolerances copied from FP16/BF16 baselines under-report alternate-FP drift. +- Architecture gating is checked for compute ops but missed for related async transfer paths. + +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- Warp-level MMA instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-block-scale.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-block-scale.md new file mode 100644 index 00000000..5abe2698 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-block-scale.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: block_scale and scale_vec_size + +This page covers `scale_vec_size` constraints in tcgen05-related `block_scale` paths. + +## Official Notes + +- `.block_scale` indicates that A/B matrices are scaled by `scale_A/scale_B` before multiply-add. +- `scale_vec_size` determines the shape of the scale matrix and how the selector is interpreted. +- Different `.kind` entries allow different values of `scale_vec_size` (the document tables define legal combinations). + +## B-Series Guidance + +- Do static validation using the triplet “kind + stype + scale_vec_size”. +- Check legal combinations before compilation to avoid runtime undefined behavior. + +## Usage Notes + +- Keep scale tensor layout and selector interpretation documented next to kernel templates. +- Validate block-scale metadata generation on host side before launching compute kernels. + +## Common Failure Modes + +- Correct compute opcode with invalid scale metadata layout. +- `scale_vec_size` chosen from template defaults without checking `.kind` constraints. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- MMA instructions (block scale context): https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-migration-playbook.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-migration-playbook.md new file mode 100644 index 00000000..af48cb45 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-migration-playbook.md @@ -0,0 +1,35 @@ +# tcgen05 Migration Playbook (From “Works” to “Stable”) + +This page provides a minimal process for taking tcgen05 from “compiles” to “ready for stable production.” + +## Four-Step Process + +1. Architecture gate check: first determine whether `sm_100*`/`sm_120*` are available. +2. Combination validity check: verify `kind + stype + scale_vec_size`. +3. Protocol correctness check: ensure fences/commit/wait on the async path and full mbarrier participation. +4. Numerical and performance regression: establish baselines separately for alternate FP and sparse paths. + +## Exit Criteria + +- All generated kernels pass architecture-gated validation without manual overrides. +- Async protocol traces show correct fence/commit/wait ordering under stress inputs. +- Numerical tolerance and performance deltas are recorded for dense and sparse variants. + +## Common Failure Modes + +- Migration stops at "compiles" without protocol or numerical regression coverage. +- Sparse and alternate-FP paths share the same baseline, hiding path-specific drift. +- Fallback policy is undocumented, leading to deployment-time behavior changes. + +## Rollback Readiness + +- Keep a tested fallback path for unsupported architecture/type combinations. +- Version migration decisions with reproducible benchmark and correctness artifacts. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-mma-kinds.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-mma-kinds.md new file mode 100644 index 00000000..8c591676 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-mma-kinds.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: MMA kind Family (f8f6f4 / mxf4 / mxf4nvf4 / mxf8f6f4) + +This page focuses on the `.kind` families for tcgen05-related MMA and their engineering meaning. + +## Official Notes + +- The documentation lists families such as `.kind::f8f6f4`, `.kind::mxf4`, `.kind::mxf4nvf4`, and `.kind::mxf8f6f4`. +- Different `.kind` entries impose different restrictions on data packing, optional `scale_vec_size`, and available type combinations. + +## B-Series Guidance + +- Treat `.kind` as a first-class capability parameter at the code-generation level. +- Enforce an explicit `scale_vec_size` for `mxf4nvf4` (per the official rules). + +## Usage Notes + +- Carry `.kind` through scheduling, metadata generation, and validation stages as one parameter. +- Keep fallback templates keyed by `.kind` to avoid silent conversion to unsupported combinations. + +## Common Failure Modes + +- Selecting `.kind` by benchmark speed only, without validating legal type combinations. +- Forgetting to propagate `.kind` choice into sparse/scale metadata generation. +- Using shared fallback code paths that silently change `.kind`-dependent numerics. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- MMA block-scale/kind context: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm100-sm120-mapping.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm100-sm120-mapping.md new file mode 100644 index 00000000..5fe365ff --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm100-sm120-mapping.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: sm_100 to sm_120 Mapping + +This page describes a mapping approach for tcgen05-related capabilities across the `sm_100*` and `sm_120*` families. + +## Mapping Approach + +- `sm_100a/sm_120a`: typically indicates earlier availability or more strict gating paths. +- `sm_100f/sm_120f`: the documentation frequently notes “higher targets within the same family provide support.” +- Specific functionality should follow the Target ISA notes in the corresponding section; do not infer across sections. + +## Implementation Guidance + +- Encode architecture checks as a `supports(feature, sm)` function. +- Let the generator degrade along the feature dimension instead of scattering many `#if` in kernel source. + +## Usage Notes + +- Centralize mapping rules in one table consumed by codegen, runtime dispatch, and tests. +- Keep mapping updates synchronized with CUDA/PTX version bumps. + +## Common Failure Modes + +- Hard-coding architecture assumptions per kernel instead of using shared mapping logic. +- Conflating "supported on family" with "supported on every family variant." 
+ +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm120a-restrictions.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm120a-restrictions.md new file mode 100644 index 00000000..0413a1b2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm120a-restrictions.md @@ -0,0 +1,34 @@ +# tcgen05 Topic: sm_120a Restrictions and Notes + +This page distills the restriction entries in the PTX documentation that are directly related to `sm_120a`. + +## Official Signals + +- Multiple sections explicitly state that `sm_120a` is the initial support point, while `sm_120f` provides later support within the same family. +- Some sub-byte / alternate floating-point types have restriction notes for `sm_120a`. +- Asynchronous tensor paths such as `cp.async.bulk.tensor` have dedicated restrictions entries for `sm_120a`. + +## Engineering Guidance + +- Maintain a separate “disabled types list” for `sm_120a`. +- First perform dedicated testing on `sm_120a` for a new kernel, then expand to `sm_120f`. + +## Common Failure Modes + +- Assuming `sm_120f` support implies `sm_120a` parity for all type paths. +- Missing fallback coverage for restricted alternate-FP and sub-byte routes. +- Validating only throughput and skipping correctness checks on restricted configurations. + +## Verification Checklist + +- Confirm restricted-type disables are active on `sm_120a`. +- Confirm fallback kernels preserve numerical contract and output layout. +- Re-run protocol validation for async tensor paths under restricted modes. 
+ +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- cp.async.bulk.tensor restrictions: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sparse-path.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sparse-path.md new file mode 100644 index 00000000..b110f7db --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sparse-path.md @@ -0,0 +1,33 @@ +# tcgen05 Topic: Sparse MMA Paths + +This page covers key points for tcgen05-related sparse MMA (`mma.sp`) across the `kind` / `block-scale` dimensions. + +## Official Notes + +- The sparse path introduces additional metadata/selectors operands. +- When combined with block scale, you still must satisfy valid combinations of `kind`/`stype`/`scale_vec_size`. +- The documentation specifies architectural requirements for alternate FP sparse paths. + +## Engineering Guidance + +- Build separate templates for sparse and dense paths to avoid cross-contaminating parameters. +- Perform boundary and consistency checks for metadata and selector parameters on the host side. + +## Common Failure Modes + +- Reusing dense-path metadata assumptions in sparse kernels. +- Sparse selector tensors generated with mismatched shape/stride conventions. +- Shared regression suites miss sparse-only corner cases. + +## Verification Checklist + +- Validate sparse metadata shape/range before kernel launch. +- Compare sparse and dense numerical baselines under identical problem shapes. +- Confirm architecture gates for sparse alternate-FP combinations. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- Warp-level MMA sparse context: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma-sp + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/wgmma-tcgen05-relationship.md b/content/cuda/docs/ptx/instructions/tcgen05/references/wgmma-tcgen05-relationship.md new file mode 100644 index 00000000..ee158bfc --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/wgmma-tcgen05-relationship.md @@ -0,0 +1,35 @@ +# Relationship Between WGMMA and tcgen05 + +In the current PTX structure, WGMMA is a high-frequency entry point at the implementation level, while tcgen05 provides the generational capability and constraint framework. + +## Practical Relationship + +- First check the capability boundaries of tcgen05, then choose the specific WGMMA variant. +- WGMMA depends on the `wgmma.fence` + `commit_group` + `wait_group` protocol. +- Async paths involve async proxies and require matching fence/wait semantics. + +## Usage Notes + +- Use tcgen05 gating to decide whether WGMMA templates are eligible before launch configuration tuning. +- Keep one protocol contract per pipeline stage to avoid mixing WGMMA and non-WGMMA completion logic. + +## Common Failure Modes + +- Choosing a WGMMA template first and discovering tcgen05 incompatibility late in the pipeline. +- Reusing wait-group thresholds across kernels with different stage depth and tile size. +- Assuming fence semantics are interchangeable across all async producer-consumer chains. + +## Integration Checklist + +- Gate tcgen05 capability before WGMMA template selection. +- Validate fence/commit/wait sequencing under representative stage depth. +- Confirm accumulator-read boundaries are protected by matching wait semantics. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- wgmma.fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.wait_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tma/DOC.md b/content/cuda/docs/ptx/instructions/tma/DOC.md new file mode 100644 index 00000000..beda7b1e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/DOC.md @@ -0,0 +1,56 @@ +--- +name: ptx-tma-instructions +description: "PTX Tensor Memory Accelerator related instructions and usage constraints in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,tma,async,memory" +--- + +# PTX TMA + +Tensor Memory Accelerator (TMA) instructions move tensor tiles asynchronously with explicit completion protocols. + +## Representative Syntax + +```ptx +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [dstMem], [tensorMap, {tc}], [mbar]; +``` + +This is a representative form. Actual variants add dimension, source/destination state-space, completion mechanism, and multicast/reduction modifiers. + +## Key Semantics + +- TMA operations are asynchronous and require explicit completion handling before consumer use. +- Completion may use mbarrier-based `complete_tx` or bulk-group wait mechanisms depending on variant. +- Memory visibility and ordering follow PTX asynchronous-operation rules and proxy semantics. 
+ +## Common Constraints + +- `tensorMap` descriptors and coordinate operands must be valid for the selected dimension/layout form. +- Variant-specific modifiers (for example multicast/reduce forms) require matching operand lists. +- Alignment, shape, and state-space combinations must match ISA restrictions for the target architecture. + +## Usage Recommendations + +- First validate correctness with a single-stage movement/compute loop. +- Add staged pipelining only after synchronization boundaries are explicit and correct. +- Keep a fallback path for architectures or types that do not support your chosen TMA variant. + +## Official Source Links (fact check) + +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- Tensor Map: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/cp-async-bulk-tensor.md` +- `references/cp-reduce-async-bulk.md` +- `references/multimem-cp-reduce-async-bulk.md` diff --git a/content/cuda/docs/ptx/instructions/tma/references/cp-async-bulk-tensor.md b/content/cuda/docs/ptx/instructions/tma/references/cp-async-bulk-tensor.md new file mode 100644 index 00000000..0849efbe --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/references/cp-async-bulk-tensor.md @@ -0,0 +1,48 @@ +# PTX Instruction Note: cp.async.bulk.tensor (TMA) + +`cp.async.bulk.tensor` is the core PTX TMA instruction family for asynchronous tensor movement between selected state spaces. 
+ +## Official Syntax (Excerpt) + +```ptx +cp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.cta_group}{.level::cache_hint} + [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colInfo}{, cache-policy} +``` + +```ptx +cp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.cta_group}{.level::cache_hint} + [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colInfo}{, ctaMask}{, cache-policy} +``` + +```ptx +cp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.level::cache_hint} + [tensorMap, tensorCoords], [srcMem]{, cache-policy} +``` + +## Key Semantics + +- The instruction is asynchronous and requires an explicit completion protocol before consumer use. +- Completion mechanism is variant-dependent (`.mbarrier::complete_tx::bytes` or `.bulk_group` in eligible forms). +- Source/destination state-space and modifier choices determine valid operand templates. +- Memory ordering and visibility follow PTX asynchronous-operation and proxy rules. + +## Common Constraints + +- `tensorMap` and coordinate operands must match dimension, load mode, and layout requirements. +- Multicast and CTA-group modifiers require correct target-mask or grouping operands. +- Architecture/type restrictions apply to specific variants; verify against the ISA restrictions section. 
+ +## Example (PTX style) + +```ptx +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [sMem0], [tensorMap0, {tc0}], [mbar0]; +``` + +## Official Source Links (fact check) + +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- Tensor Map: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tma/references/cp-reduce-async-bulk.md b/content/cuda/docs/ptx/instructions/tma/references/cp-reduce-async-bulk.md new file mode 100644 index 00000000..3f1724b2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/references/cp-reduce-async-bulk.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: cp.reduce.async.bulk + +`cp.reduce.async.bulk` is an async bulk reduction-copy instruction that performs element-wise reduction during transfer. + +## Official Syntax (Excerpt) + +```ptx +cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type [dstMem], [srcMem], size, [mbar]; +cp.reduce.async.bulk.dst.src.completion_mechanism.add.noftz.type [dstMem], [srcMem], size, [mbar]; +``` + +## Key Semantics + +- The instruction is non-blocking and issues asynchronous reduction work. +- `.mbarrier::complete_tx::bytes`: executes complete-tx on mbarrier at completion. +- `.bulk_group`: uses bulk async-group completion. +- The docs classify this path as a weak memory operation; reduction has `.relaxed.gpu` semantics. + +## Detailed Constraints (Official Highlights) + +- `size` specifies equal source/destination array length. +- `add.f16/add.bf16` requires `.noftz`. +- Some sub-byte types are unsupported (see restrictions section). 
+ +## Official Source Links (fact check) + +- cp.reduce.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk +- Async data movement instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-data-movement-and-conversion-instructions +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tma/references/multimem-cp-reduce-async-bulk.md b/content/cuda/docs/ptx/instructions/tma/references/multimem-cp-reduce-async-bulk.md new file mode 100644 index 00000000..cb99dc9e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/references/multimem-cp-reduce-async-bulk.md @@ -0,0 +1,39 @@ +# PTX Instruction Note: multimem.cp.reduce.async.bulk + +`multimem.cp.reduce.async.bulk` performs asynchronous bulk copy-reduction to multi-memory targets. + +## Official Syntax (Excerpt) + +```ptx +multimem.cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type [dstMem], [srcMem], size; +``` + +## Key Semantics + +- The operation is asynchronous and reduction-enabled across multi-memory destinations. +- Completion semantics follow the selected completion mechanism for this variant family. +- Memory ordering and visibility behavior follow PTX memory-consistency and async-operation rules. + +## Common Constraints + +- Reduction operator and data type must be a legal ISA combination. +- `size` and address ranges must match source/destination requirements. +- Architecture restrictions apply; verify the target ISA and restrictions sections. + +## Usage Notes + +- Use this path when multi-memory reduction transport is required by system-level sharding design. +- Validate completion mechanism selection against downstream consumer synchronization points. 
+ +## Common Failure Modes + +- Reduction operator is valid in isolation but illegal for the selected multimem variant. +- Completion mechanism is correct for copy but insufficient for consumer visibility requirements. + +## Official Source Links (fact check) + +- multimem.cp.reduce.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-multimem-cp-reduce-async-bulk +- cp.reduce.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/DOC.md b/content/cuda/docs/ptx/instructions/wgmma/DOC.md new file mode 100644 index 00000000..2e4ed82c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/DOC.md @@ -0,0 +1,48 @@ +--- +name: ptx-wgmma-instructions +description: "PTX warpgroup-level matrix multiply-accumulate instructions and constraints for ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,wgmma,mma,tensorcore,wmma,tensor-core,matrix-multiply,matrix-multiply-accumulate" +--- + +# PTX WGMMA + +WGMMA is used for warpgroup-level matrix multiply-accumulate and targets high-throughput Tensor Core paths. + +## Feature Positioning + +- Compared with traditional `mma`, WGMMA is designed for higher-level cooperative execution. +- It is commonly combined with asynchronous movement (e.g., TMA) to reduce data waiting. + +## Key Constraints + +- The combination of tile shape, layout, and dtype must fully match the specification. +- Instruction availability depends on the target architecture (see the Target ISA notes). +- Asynchronous compute paths require corresponding wait/synchronization mechanisms. 
+ +## Example (Structural Illustrative) + +```ptx +// Specific operand formats should follow the official section. +wgmma.mma_async.sync.aligned ...; +``` + +## Official Source Links (Fact Check) + +- Asynchronous Warpgroup Level Matrix Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics + +- `references/wgmma-mma-async.md` +- `references/wgmma-commit-group.md` +- `references/wgmma-wait-group.md` +- `references/wgmma-fence.md` diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-commit-group.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-commit-group.md new file mode 100644 index 00000000..d73eee26 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-commit-group.md @@ -0,0 +1,34 @@ +# PTX Instruction Topic: wgmma.commit_group + +`wgmma.commit_group` commits the currently uncommitted batch of `wgmma.mma_async` as one wgmma-group. + +## Official Syntax + +```ptx +wgmma.commit_group.sync.aligned; +``` + +## Key Semantics + +- Each warpgroup creates a new wgmma-group and collects previously uncommitted `wgmma.mma_async`. +- If there are no uncommitted operations, it creates an empty group. +- `.sync` requires threads within the warp to rendezvous at the same instruction point. +- `.aligned` requires all threads in the warpgroup to execute the same `commit_group`; inconsistencies under conditional branches lead to undefined behavior. + +## Usage Notes + +- Commit once per pipeline stage after all stage-local `wgmma.mma_async` instructions are issued. +- Keep commit boundaries consistent across all participating warps in the warpgroup. 
+ +## Common Failure Modes + +- Multiple commits are issued for one logical stage due to divergent control paths. +- Commit is skipped on one warp path, causing wait-group protocol mismatch later. + +## Official Source Links (Fact Check) + +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- Async warpgroup matrix instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-fence.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-fence.md new file mode 100644 index 00000000..611fa1a3 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-fence.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: wgmma.fence + +`wgmma.fence` is used to constrain the ordering boundary of register accesses related to `wgmma.mma_async`. + +## Official Key Semantics + +- The documentation explicitly states that you must use `wgmma.fence` before `wgmma.mma_async` to isolate the related register accesses; otherwise behavior is undefined. +- It is typically combined with `wgmma.commit_group` / `wgmma.wait_group` to form a complete execution protocol. + +## Usage Notes + +- Insert `wgmma.fence` at stage boundaries where operand register ownership changes. +- Keep fence placement identical across participating warpgroup threads. + +## Common Failure Modes + +- Omitting fence on one path in a conditionally structured pipeline. +- Assuming `commit_group` alone is sufficient for register-handoff correctness. 
+ +## Usage Patterns (Illustrative) + +```ptx +wgmma.fence.sync.aligned; +wgmma.mma_async.sync.aligned ...; +wgmma.commit_group.sync.aligned; +wgmma.wait_group.sync.aligned 0; +``` + +## Official Source Links (Fact Check) + +- wgmma.fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.wait_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-mma-async.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-mma-async.md new file mode 100644 index 00000000..7ddcb003 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-mma-async.md @@ -0,0 +1,41 @@ +# PTX Instruction Topic: wgmma.mma_async + +`wgmma.mma_async` is a warpgroup-level asynchronous matrix multiply-accumulate instruction that runs on the async proxy. + +## Official Syntax (Excerpt) + +```ptx +wgmma.mma_async.sync.aligned.shape.dtype.f16.f16 d, a-desc, b-desc, scale-d, imm-scale-a, imm-scale-b, imm-trans-a, imm-trans-b; +wgmma.mma_async.sync.aligned.shape.dtype.tf32.tf32 d, a-desc, b-desc, scale-d, imm-scale-a, imm-scale-b; +``` + +## Key Semantics + +- The instruction executes on the async proxy, and an implicit generic-async proxy fence occurs upon completion. +- You must use mechanisms such as `wgmma.commit_group` + `wgmma.wait_group` to wait for completion. +- The documentation emphasizes: `wgmma.fence` must be used to isolate the related register accesses; otherwise behavior is undefined. 
+ +## Parameter Constraints (High-Risk) + +- `imm-trans-a` / `imm-trans-b` only allow 0 or 1. +- For floating-point variants, `imm-scale-a` / `imm-scale-b` only allow -1 or 1. +- The `shape` / `dtype` / descriptor layout must match the official matrix fragment definitions. + +## Usage Notes + +- Keep descriptor generation and shape selection in one helper to avoid operand mismatch. +- Pair each issued async MMA stage with explicit commit and wait boundaries before accumulator reads. + +## Common Failure Modes + +- Descriptor layout matches shape but not selected dtype variant. +- `imm-scale-*` values are propagated from host config without variant validation. +- Register consumption starts before wait-group completion for the corresponding stage. + +## Official Source Links (Fact Check) + +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- Asynchronous warpgroup matrix instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- Async proxy notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-wait-group.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-wait-group.md new file mode 100644 index 00000000..dc6e83f3 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-wait-group.md @@ -0,0 +1,34 @@ +# PTX Instruction Topic: wgmma.wait_group + +`wgmma.wait_group` waits for the wgmma-group to complete and is a necessary step before reading the results of `wgmma.mma_async`. + +## Official Syntax + +```ptx +wgmma.wait_group.sync.aligned N; +``` + +## Key Semantics + +- Wait until the number of the most recent pending groups does not exceed `N`, and earlier groups have completed. 
+- `N=0` means waiting for all previously submitted groups to complete. +- The documentation states that if you access the accumulator / related input registers without waiting for the group that contains the target `wgmma.mma_async`, the behavior is undefined. +- `.sync` and `.aligned` have the same execution-consistency requirements as `commit_group`. + +## Usage Notes + +- Tune `N` according to pipeline depth and register-pressure limits. +- Place `wait_group` immediately before accumulator consumption boundaries. + +## Common Failure Modes + +- Using `N` from a different kernel stage depth and reading incomplete accumulators. +- Waiting too early and collapsing overlap between async compute stages. + +## Official Source Links (Fact Check) + +- wgmma.wait_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/abi-and-calling-convention.md b/content/cuda/docs/ptx/references/abi-and-calling-convention.md new file mode 100644 index 00000000..8066e9ed --- /dev/null +++ b/content/cuda/docs/ptx/references/abi-and-calling-convention.md @@ -0,0 +1,35 @@ +# PTX ABI and Calling Convention (9.2) + +PTX abstracts the ABI and calling convention at the `.entry` and `.func` levels; the parameter space and symbol declarations affect call correctness. + +## Key Points + +- `.entry`: kernel entry point, typically launched from the host side. +- `.func`: device function callable within PTX. +- Parameters are typically passed through the `.param` space. +- Function declarations and definitions must be consistent in symbols and parameters. 
+ +## Common Mistakes + +- Mixing `.entry` and `.func` parameter rules. +- Inline PTX that ignores calling conventions can violate register constraints. +- Inconsistent symbol definitions across multiple files during linking. + +## Usage Notes + +- Keep `.entry` and `.func` signatures in shared templates to prevent declaration drift. +- Validate parameter layout assumptions when integrating inline PTX with compiler-generated code. + +## Verification Checklist + +- Check symbol names and `.param` ordering across declarations and definitions. +- Confirm call sites use operand types consistent with callee parameter types. + +## Official Source Links (Fact Check) + +- Abstracting the ABI: https://docs.nvidia.com/cuda/parallel-thread-execution/#abstracting-the-abi +- Function Declarations and Definitions: https://docs.nvidia.com/cuda/parallel-thread-execution/#function-declarations-and-definitions +- Parameter State Space: https://docs.nvidia.com/cuda/parallel-thread-execution/#parameter-state-space +- Linking directives: https://docs.nvidia.com/cuda/parallel-thread-execution/#linking-directives + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/b-series-arch-matrix.md b/content/cuda/docs/ptx/references/b-series-arch-matrix.md new file mode 100644 index 00000000..c88064e6 --- /dev/null +++ b/content/cuda/docs/ptx/references/b-series-arch-matrix.md @@ -0,0 +1,35 @@ +# B-Series Architecture Capability Matrix (PTX 9.2) + +This page summarizes the target-architecture constraints in the PTX 9.2 documentation that are related to the B-Series, with a focus on `sm_100`/`sm_120` and their `a/f` family conditions. + +## Key Observations (from Official Sections) + +- Multiple instruction variants are explicitly marked as “requires `sm_100` or higher”. +- Some advanced variants use `sm_100a` / `sm_120a` as first-launch requirements, while also noting that `sm_100f` / `sm_120f` in the same family provide support in higher versions. 
+- `cp.async.bulk.tensor` and `cp.reduce.async.bulk.tensor` have type restriction entries for `sm_120a`. +- Certain modifiers related to `.multicast::cluster` and `.cp_mask` provide performance/availability notes on `sm_100+` paths. + +## Capability Matrix (Current Document View) + +| Capability Direction | Key Instructions/Features | Architecture Signals (Official Docs) | +|---|---|---| +| Asynchronous tensor movement | `cp.async.bulk.tensor` | `sm_100`/`sm_100a`/`sm_100f` and `sm_120a` restriction entries | +| Asynchronous reduction movement | `cp.reduce.async.bulk(.tensor)` | `sm_100+` paths + type restriction entries | +| Proxy synchronization enhancements | `fence.proxy.async` | Documented higher architecture thresholds (`sm_90`/`sm_100+` related) | +| Advanced MMA/TensorCore | `wgmma` + `tcgen05` family entry points | Documented new types and qualifier conditions on `sm_120`/`sm_120a` | + +## Usage Suggestions + +- For B-Series-specific paths, perform “target-architecture threshold checks” before generating code. +- Implement `a`/`f` family differences as explicit capability flags in the engineering codebase, rather than scattering them in kernel code. +- Validate all “new types/new qualifiers” via compilation and runtime checks on both `sm_100` and `sm_120` platforms. 
+ +## Official Source Links (Fact Check) + +- PTX main document: https://docs.nvidia.com/cuda/parallel-thread-execution/ +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/b-series-delta-from-hopper.md b/content/cuda/docs/ptx/references/b-series-delta-from-hopper.md new file mode 100644 index 00000000..594bcea8 --- /dev/null +++ b/content/cuda/docs/ptx/references/b-series-delta-from-hopper.md @@ -0,0 +1,32 @@ +# B-Series Delta Index vs. Hopper + +This page records the key PTX differences to focus on when migrating from Hopper (e.g., the `sm_90a` path) to the B-Series (`sm_100`/`sm_120`). + +## Observed Difference Directions + +- More instructions/modifiers are marked as available only under `sm_100+` in the documentation. +- `sm_120a` vs. `sm_120f` includes extra notes on certain types and qualifiers. +- TensorCore 5th Generation and related mixed/alternate-precision conditions are more common on the new-architecture paths. +- Asynchronous tensor movement and reduction paths (TMA / async bulk) include more architecture/type restriction entries. + +## Migration Checklist + +1. Check that `target` and compilation options match the intended target architecture. +2. Check whether any `sm_100+` threshold features are used (e.g., some cache/eviction/async proxy variants). +3. Check whether restricted types on `sm_120a` are triggered. +4. Perform minimal runnable regression tests for WGMMA / tcgen05 / TMA paths. 
+ +## Common Failure Modes + +- Porting instruction syntax while leaving Hopper-specific gating assumptions unchanged. +- Validating only dense compute paths and missing sparse/alternate-FP restrictions. +- Applying one fallback policy across `sm_100*` and `sm_120*` without feature-level checks. + +## Official Source Links (Fact Check) + +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/h-series-special-instructions.md b/content/cuda/docs/ptx/references/h-series-special-instructions.md new file mode 100644 index 00000000..6a692622 --- /dev/null +++ b/content/cuda/docs/ptx/references/h-series-special-instructions.md @@ -0,0 +1,60 @@ +# H-Series (Hopper) Specialized Instructions and Mechanisms (Summary) + +This document is intended for engineering use and organizes capabilities in the PTX documentation that can be classified as first introduced in the H-Series (`sm_90/sm_90a`) or strongly related to it. + +## Key Takeaways + +- Hopper introduced and systematized the **cluster + async proxy + mbarrier + TMA + WGMMA** capability combination. +- Many of these capabilities were later extended in the B-Series, so they should be understood as “**H debuted, later inherited**.” +- For code generation, prefer capability gating over only looking at architecture codenames. + +## A. 
Core Capabilities Debuted in H and Inherited Later + +### 1) WGMMA Asynchronous Matrix Multiply-Accumulate Path + +- Representative instructions: `wgmma.mma_async`, `wgmma.fence`, `wgmma.commit_group`, `wgmma.wait_group` +- Typical meaning: a warpgroup-level asynchronous MMA protocol (initiate / commit / wait) + +### 2) TMA / Tensor Asynchronous Copy Path + +- Representative instruction: `cp.async.bulk.tensor` +- Related objects: `tensormap`, `prefetch.*.tensormap` +- Typical meaning: high-throughput tensor movement + dedicated completion mechanisms + +### 3) mbarrier Completion-Tracking System + +- Representative instructions: `mbarrier.arrive`, `mbarrier.arrive_drop`, `mbarrier.test_wait`, `mbarrier.try_wait` +- Related instruction: `cp.async.mbarrier.arrive` +- Typical meaning: explicitly ties asynchronous completion to visibility synchronization + +### 4) Cluster and Cross-Proxy Synchronization Mechanisms + +- Representative capabilities: `.cluster` scope, `fence.proxy.async` +- Typical meaning: ordering guarantees across paths for generic/async proxies + +## B. Common “requires sm_90+” Signals on the H Path (Examples) + +- Multiple `.cluster` scope instructions/modifiers are marked `requires sm_90 or higher` +- `.tensormap`-related paths are marked `requires sm_90 or higher` +- `fence.proxy.async` is marked `requires sm_90 or higher` +- Some bf16/bf16x2 and mixed-precision variants have explicit thresholds on the H path + +## Engineering Implementation Suggestions + +1. Break capabilities into feature flags (e.g., `has_wgmma`, `has_tma`, `has_mbarrier_cluster`, `has_proxy_async_fence`). +2. Do “capability detection -> instruction template selection” first, then perform kernel generation. +3. Reuse the same semantic checks for H and B, but apply different fallbacks based on `sm`. 
+ +## Official Source Links (Fact Check) + +- PTX main document: https://docs.nvidia.com/cuda/parallel-thread-execution/ +- WGMMA: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- mbarrier family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- cp.async.mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +- membar/fence (including proxy semantics): https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- Target ISA notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/instruction-format-and-operands.md b/content/cuda/docs/ptx/references/instruction-format-and-operands.md new file mode 100644 index 00000000..f40e953f --- /dev/null +++ b/content/cuda/docs/ptx/references/instruction-format-and-operands.md @@ -0,0 +1,36 @@ +# Instruction Format and Operands (9.2) +PTX instructions typically consist of a predicate, opcode, suffix, modifiers, and an operand array. Operand rules are one of the most common sources of errors. 
+ +## Instruction Components + +- Optional predicate: `@p` / `@!p` +- Opcode: e.g., `add`, `ld`, `cp.async` +- Type suffix: e.g., `.s32`, `.f32` +- Semantic modifiers: e.g., `.acquire`, `.release`, `.relaxed` +- Scope modifiers: e.g., `.cta`, `.cluster`, `.gpu`, `.sys` + +## Operand Check List + +- Whether the immediate ranges satisfy the definitions in the section +- Whether address operands come from valid state spaces +- Whether source/destination types permit implicit or explicit conversions +- Whether additional synchronization is required (e.g., waiting for an async copy) + +## Key Facts Related to Asynchronous Instructions + +The PTX documentation clearly states that `cp.async` operations do not provide completion-order guarantees by default; explicit synchronization is required using `cp.async.wait_all` / `cp.async.wait_group` or mbarrier. + +## Common Failure Modes + +- Modifier ordering is syntactically valid but semantically wrong for the intended memory model. +- Operand width and state-space assumptions diverge between template and instantiated code. +- Async instructions are emitted without matching wait or barrier completion paths. 
+ +## Official Source Links (Fact Check) + +- Instruction Statements: https://docs.nvidia.com/cuda/parallel-thread-execution/#instruction-statements +- Instruction Operands: https://docs.nvidia.com/cuda/parallel-thread-execution/#instruction-operands +- Operand Costs: https://docs.nvidia.com/cuda/parallel-thread-execution/#operand-costs +- Asynchronous Data Movement semantics: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-data-movement-and-conversion-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/memory-consistency-model.md b/content/cuda/docs/ptx/references/memory-consistency-model.md new file mode 100644 index 00000000..07a0ea8e --- /dev/null +++ b/content/cuda/docs/ptx/references/memory-consistency-model.md @@ -0,0 +1,35 @@ +# Memory Consistency Model (9.2) +PTX consistency model is defined by “semantics + scope + proxy”. Asynchronous instructions are typically modeled as weak memory operations. + +## Core Concepts + +- Semantics: `relaxed`, `acquire`, `release`, etc. +- Scope: `cta`, `cluster`, `gpu`, `sys` +- Proxies: generic proxy / async proxy, etc. + +## Focus for Asynchronous Paths + +- `cp.async` and `cp.async.bulk` belong to asynchronous copy paths. +- The documentation states that there is no ordering guarantee between `cp.async` operations unless you explicitly synchronize. +- After `cp.async.bulk` / `cp.reduce.async.bulk` completes, an implicit generic-async proxy fence is applied (see the section notes). +- `mbarrier complete-tx` has `.release` at `.cluster` scope semantics in the corresponding description. + +## Practical Recommendations + +- Establish the relationship between “transfer completion” and “visibility to consumers” using the specified mechanisms. +- When mixing `atom`/`fence`/`mbarrier`, draw the happens-before relationships before writing code. + +## Common Failure Modes + +- Using correct scope with wrong semantics (`relaxed` where release/acquire is required). 
+- Assuming async-copy completion implies ordering for unrelated memory operations. +- Combining proxy domains without explicit proxy fence rules. + +## Official Source Links (Fact Check) + +- Memory Consistency Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model +- Scope and applicability: https://docs.nvidia.com/cuda/parallel-thread-execution/#scope-and-applicability-of-the-model +- Parallel sync instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions +- Async operations and ordering notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/programming-model.md b/content/cuda/docs/ptx/references/programming-model.md new file mode 100644 index 00000000..cad10b6a --- /dev/null +++ b/content/cuda/docs/ptx/references/programming-model.md @@ -0,0 +1,31 @@ +# PTX Programming Model (9.2) + +The PTX programming model describes thread organization, execution hierarchy, state spaces, and function boundaries, and it is a prerequisite for understanding instruction semantics and synchronization scopes. + +## Structured Takeaways + +- Thread execution is organized at the CTA / cluster / grid hierarchy levels. +- Synchronization and visibility depend on the scope; you cannot assume visibility across scopes. +- Kernels (`.entry`) and functions (`.func`) differ in parameter and call boundaries. +- Asynchronous instructions (e.g., `cp.async`, `cp.async.bulk`, `wgmma.mma_async`) do not fully follow ordinary program order. + +## Practical Interpretation + +- Before choosing synchronization primitives, first determine which scope the data is shared within. +- When writing asynchronous copies or asynchronous MMA, you must explicitly wait for completion mechanisms (group or mbarrier). 
+- Do not infer cross-thread visibility from the apparent sequential execution behavior in a single thread. + +## Common Failure Modes + +- Selecting block-wide synchronization where cluster or system scope is required. +- Assuming async producer completion implies consumer visibility without explicit protocol completion. +- Mixing scope assumptions across helper kernels in the same pipeline. + +## Official Source Links (Fact Check) + +- Programming Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#programming-model +- Machine Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#machine-model +- State Spaces: https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces +- Asynchronous operations notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/release-notes-ptx-9.2.md b/content/cuda/docs/ptx/references/release-notes-ptx-9.2.md new file mode 100644 index 00000000..e1944d77 --- /dev/null +++ b/content/cuda/docs/ptx/references/release-notes-ptx-9.2.md @@ -0,0 +1,35 @@ +# PTX 9.2 Release Notes Index + +This page tracks PTX 9.2 newly added features, behavior changes, compatibility limitations, and target-architecture requirements. + +## Suggested Review Process + +1. First review the release notes to identify newly added/changed instructions. +2. Then consult the corresponding instruction section’s PTX ISA notes. +3. Finally review the Target ISA notes to determine availability under `.target sm_xx`. + +## Change Categories to Track + +- New instruction families and qualifiers. +- Semantic changes that alter ordering, completion, or undefined-behavior boundaries. +- Target gating updates that change legal deployment architectures. + +## Common Failure Modes + +- Applying syntax updates while missing semantic changes in the same release. +- Updating PTX templates without synchronizing architecture-gating logic. 
+- Treating release notes as optional and relying on historical behavior assumptions. + +## Verification Checklist + +- Re-run architecture-gating tests after release-note-driven template updates. +- Re-run numerical and protocol regression on kernels touched by updated instruction families. +- Revalidate fallback behavior on the oldest supported architecture target. + +## Official Source Links (Fact Check) + +- Release Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/state-spaces-and-types.md b/content/cuda/docs/ptx/references/state-spaces-and-types.md new file mode 100644 index 00000000..3e366b62 --- /dev/null +++ b/content/cuda/docs/ptx/references/state-spaces-and-types.md @@ -0,0 +1,34 @@ +# PTX State Spaces and Types (9.2) + +PTX validity is jointly constrained by “state spaces + the type system”. Being syntactically correct alone is not sufficient to guarantee semantic correctness. + +## Common State Spaces + +- `.reg`: registers +- `.local`: thread-private memory +- `.shared`: CTA/cluster shared memory (depending on modifiers) +- `.global`: global memory +- `.const`: constant memory +- `.param`: parameter space + +## Common Type Families + +- Bit types: `.b8/.b16/.b32/.b64` +- Integer types: `.s*` / `.u*` +- Floating-point types: `.f16/.bf16/.tf32/.f32/.f64` +- Vector and packed types: commonly used in load/store, mma, and tensor operations + +## Practical Constraints + +- The address space for `ld/st/cp` must match the instruction variant. +- Arithmetic type suffixes must be compatible with the register declarations. +- Mixed-precision and tensor instructions often have stricter type/tile combination constraints. 
+ +## Official Source Links (Fact Check) + +- State Spaces: https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces +- Types: https://docs.nvidia.com/cuda/parallel-thread-execution/#types +- Variables: https://docs.nvidia.com/cuda/parallel-thread-execution/#variables +- Parameter State Space: https://docs.nvidia.com/cuda/parallel-thread-execution/#parameter-state-space + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/randomness-and-reproducibility/DOC.md b/content/cuda/docs/randomness-and-reproducibility/DOC.md new file mode 100644 index 00000000..5bf83bfd --- /dev/null +++ b/content/cuda/docs/randomness-and-reproducibility/DOC.md @@ -0,0 +1,66 @@ +--- +name: randomness-and-reproducibility +description: "CUDA randomness and reproducibility essentials: RNG strategy, seed control, deterministic settings, and cross-run consistency pitfalls." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,reproducibility,determinism,randomness,seed,curand,atomic-order,numerical-variance" +--- + +# CUDA Randomness And Reproducibility (C++) + +Use this page when you need stable results across runs, devices, or software versions. + +## Reproducibility Scope + +Define what you need: + +- same run, same machine +- same machine across runs +- across GPUs/driver/toolkit versions + +The stricter the target, the more constraints you must apply. + +## RNG Strategy + +For random-number generation in CUDA pipelines: + +- use explicit seed management +- separate per-thread/sequence state deterministically +- avoid implicit global RNG side effects in hot kernels + +cuRAND is common for production-grade GPU RNG workflows. 
+ +## Determinism Pitfalls + +Even without RNG, floating-point results can vary due to: + +- reduction order changes +- atomic update ordering +- parallel scheduling differences +- precision/mode differences (for example Tensor Core math paths) + +Bitwise reproducibility is usually harder than statistical reproducibility. + +## Practical Checklist + +1. fix seeds and log them. +2. pin algorithm/mode choices that affect operation order. +3. define tolerance-based correctness checks when bitwise identity is unrealistic. +4. isolate non-deterministic kernels and test them separately. + +## Related Topics + +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` + +## Official Source Links (Fact Check) + +- cuRAND documentation: https://docs.nvidia.com/cuda/curand/index.html +- CUDA C++ Programming Guide, floating-point and parallel execution caveats: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/regression-testing-and-ci/DOC.md b/content/cuda/docs/regression-testing-and-ci/DOC.md new file mode 100644 index 00000000..8e95425e --- /dev/null +++ b/content/cuda/docs/regression-testing-and-ci/DOC.md @@ -0,0 +1,67 @@ +--- +name: regression-testing-and-ci +description: "CUDA regression testing and CI essentials: correctness baselines, tolerance strategy, perf guardrails, and multi-arch validation." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,testing,regression,ci,correctness,tolerance,performance-guardrail,multi-arch" +--- + +# CUDA Regression Testing And CI (C++) + +Use this page to keep CUDA kernels stable across optimizations and toolchain updates. 
+ +## Test Layers + +Keep separate layers: + +- functional correctness tests +- numerical tolerance tests +- performance regression tests + +Blending all three into one pass makes failures hard to diagnose. + +## Correctness Baselines + +- keep a trusted reference path (CPU or high-precision GPU) +- compare output shapes, boundary behavior, and representative edge cases +- include deterministic seeds for stochastic paths + +## Tolerance Policy + +Define tolerance per operator class and precision mode. + +- tighter for stable FP32 math +- looser but explicit for FP16/BF16/TF32 or nondeterministic orderings + +Store tolerance policy in code/config, not ad-hoc comments. + +## Performance Guardrails + +- track key benchmarks in CI (or scheduled perf jobs) +- compare against a baseline window, not a single run +- alert on sustained regression beyond threshold + +## Multi-Arch Validation + +When possible, validate across representative GPU classes. + +- architecture differences can expose hidden assumptions +- build matrices should reflect deployment reality + +## Related Topics + +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Randomness and reproducibility: `../randomness-and-reproducibility/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, verification and optimization workflow context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Programming Guide, numerical/ordering considerations: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/runtime/DOC.md b/content/cuda/docs/runtime/DOC.md new file mode 100644 index 00000000..0ebf57e0 --- /dev/null +++ b/content/cuda/docs/runtime/DOC.md @@ -0,0 +1,151 @@ +--- +name: runtime +description: "CUDA 
Runtime API essentials for allocating memory, launching kernels, and managing streams." +metadata: + languages: "cpp" + versions: "12.4" + revision: 1 + updated-on: "2026-03-18" + source: community + tags: "cuda,gpu,kernel,runtime,api" +--- + +# CUDA Runtime API (C++) + +Use the CUDA Runtime API for most application-level kernel development. It provides a simpler model than the Driver API while still exposing streams, events, and device management. + +## Minimal End-to-End Example + +```cpp +#include <cuda_runtime.h> +#include <cstdlib> + +__global__ void saxpy(const float* x, const float* y, float* out, float a, int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) out[i] = a * x[i] + y[i]; +} + +int main() { + const int n = 1 << 20; + const size_t bytes = n * sizeof(float); + float *h_x = (float*)malloc(bytes); + float *h_y = (float*)malloc(bytes); + float *h_out = (float*)malloc(bytes); + + float *d_x = nullptr, *d_y = nullptr, *d_out = nullptr; + cudaMalloc(&d_x, bytes); + cudaMalloc(&d_y, bytes); + cudaMalloc(&d_out, bytes); + + cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice); + + const int threads = 256; + const int blocks = (n + threads - 1) / threads; + saxpy<<<blocks, threads>>>(d_x, d_y, d_out, 2.0f, n); + + cudaDeviceSynchronize(); + cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_out); + free(h_x); + free(h_y); + free(h_out); + return 0; +} +``` + +## Core Runtime APIs + +Use these first when building kernels: + +- `cudaMalloc`, `cudaFree` for device memory +- `cudaMemcpy`, `cudaMemcpyAsync` for transfers +- `cudaMemset` for initialization +- `cudaGetLastError`, `cudaDeviceSynchronize` for error detection +- `cudaStreamCreate`, `cudaStreamDestroy` for async execution +- `cudaEventCreate`, `cudaEventRecord`, `cudaEventElapsedTime` for timing + +## Error Handling Pattern + +Always check errors for: + +- the kernel launch (use `cudaGetLastError`) +- the execution (use 
`cudaDeviceSynchronize` or stream sync) + +See `references/error-handling.md` for a macro-based pattern. + +## Common Pitfalls + +- Forgetting to synchronize before reading results on the host +- Miscomputing grid size (off-by-one on tail elements) +- Assuming host memory is page-locked (use `cudaHostAlloc` if needed) +- Launching with too few blocks to cover all elements + +## When to Use Streams + +Use streams when: + +- You need overlap of copy and compute (`cudaMemcpyAsync`) +- You want concurrent kernels +- You want explicit ordering without global device sync + +## Related Topics + +- Error handling macro and diagnostics: `references/error-handling.md` +- Memory hierarchy overview: `../memory-hierarchy/DOC.md` +- Shared memory overview: `../shared-memory/DOC.md` +- Synchronization overview: `../synchronization/DOC.md` +- Coalescing overview: `../coalescing/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Execution model: `../execution-model/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- Tensor Core usage: `../tensor-cores/DOC.md` +- WMMA kernel patterns: `../wmma-kernel-patterns/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Memory-bound optimization playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound optimization playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound optimization playbook: `../launch-bound-optimization-playbook/DOC.md` +- 
Nsight metrics interpretation cheatsheet: `../nsight-metrics-interpretation-cheatsheet/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Cooperative Groups: `../cooperative-groups/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Thread Block Clusters: `../thread-block-clusters/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- Launch bounds and registers: `../launch-bounds-and-registers/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- Multi-GPU and peer access: `../multi-gpu-and-peer-access/DOC.md` +- Dynamic Parallelism: `../dynamic-parallelism/DOC.md` +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` +- cuBLAS/cuDNN integration patterns: `../cublas-cudnn-integration-patterns/DOC.md` +- NVTX and profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Randomness and reproducibility: `../randomness-and-reproducibility/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Sparse and irregular kernels: `../sparse-and-irregular-kernels/DOC.md` +- Collective communication patterns: `../collective-communication-patterns/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Cache behavior and access policy: `../cache-behavior-and-access-policy/DOC.md` +- Persistent kernels and work queues: `../persistent-kernels-and-work-queues/DOC.md` +- Production readiness checklist: `../production-readiness-checklist/DOC.md` +- Kernel API design 
guidelines: `../kernel-api-design-guidelines/DOC.md` +- Shape specialization and autotuning: `../input-shape-specialization-and-autotuning/DOC.md` +- Fallback strategies and capability detection: `../fallback-strategies-and-capability-detection/DOC.md` +- Incident response and rollback playbook: `../incident-response-and-rollback-playbook/DOC.md` +- PTX shared-memory async path: `../ptx/instructions/data-movement/references/cp-async.md` diff --git a/content/cuda/docs/runtime/references/error-handling.md b/content/cuda/docs/runtime/references/error-handling.md new file mode 100644 index 00000000..658aaf5e --- /dev/null +++ b/content/cuda/docs/runtime/references/error-handling.md @@ -0,0 +1,28 @@ +# CUDA Runtime Error Handling + +Use a small helper to surface errors early. Check both launch errors and runtime errors. + +```cpp +#include <cstdio> +#include <cstdlib> + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(1); \ + } \ + } while (0) + +// Usage +CUDA_CHECK(cudaMalloc(&d_x, bytes)); +// After kernel launch +CUDA_CHECK(cudaGetLastError()); +CUDA_CHECK(cudaDeviceSynchronize()); +``` + +Notes: +- `cudaGetLastError()` catches launch errors. +- `cudaDeviceSynchronize()` surfaces runtime errors. +- For async workflows, prefer `cudaStreamSynchronize(stream)`. diff --git a/content/cuda/docs/shared-memory/DOC.md b/content/cuda/docs/shared-memory/DOC.md new file mode 100644 index 00000000..f35dfa8b --- /dev/null +++ b/content/cuda/docs/shared-memory/DOC.md @@ -0,0 +1,174 @@ +--- +name: shared-memory +description: "CUDA shared memory essentials: __shared__, dynamic shared memory, synchronization, bank conflicts, and async copy." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,shared-memory,sharedmem,smem,__shared__,dynamic-shared-memory,__syncthreads__,bank-conflict,bank-conflicts,bank-conflict-avoidance,padding,shared-memory-tiling,cp.async,mbarrier" +--- + +# CUDA Shared Memory (C++) + +Use this page when you need the CUDA C++ view of shared memory: what `__shared__` means, how dynamic shared memory is declared, when `__syncthreads()` is required, and how bank conflicts affect performance. + +## What Shared Memory Is + +In the CUDA C++ Programming Guide, `__shared__` declares storage that: + +- resides in the shared memory space of a thread block +- has the lifetime of the block +- has a distinct object per block +- is accessible only to threads in the same block + +This makes shared memory the standard scratchpad for cooperation within a block. + +## Static Shared Memory + +Use a compile-time-sized declaration when the storage size is fixed: + +```cpp +__global__ void saxpy_tile(const float* x, const float* y, float* out, int n) { + __shared__ float tile[256]; + + int tid = threadIdx.x; + int i = blockIdx.x * blockDim.x + tid; + + if (i < n) { + tile[tid] = x[i]; + } + __syncthreads(); + + if (i < n) { + out[i] = 2.0f * tile[tid] + y[i]; + } +} +``` + +Use this form when the tile shape is fixed and simple. + +## Dynamic Shared Memory + +Use `extern __shared__` when the size is determined at launch time: + +```cpp +__global__ void reduce_kernel(const float* input, float* output, int n) { + extern __shared__ float smem[]; + + int tid = threadIdx.x; + int i = blockIdx.x * blockDim.x + tid; + + smem[tid] = (i < n) ? 
input[i] : 0.0f; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) smem[tid] += smem[tid + stride]; + __syncthreads(); + } + + if (tid == 0) output[blockIdx.x] = smem[0]; +} + +// Launch with dynamic shared memory bytes: +// reduce_kernel<<<numBlocks, blockSize, smemBytes>>>(...); +``` + +The CUDA C++ Programming Guide notes that all `extern __shared__` variables start at the same address, so if you pack multiple arrays into dynamic shared memory you must manage offsets and alignment explicitly. + +## Synchronization Rule + +Use `__syncthreads()` when one set of threads writes shared memory and another set of threads in the same block will read it later. + +- `__syncthreads()` is a block-wide barrier +- writes to shared memory before the barrier are visible to threads in the block after the barrier +- do not place it in divergent control flow unless the condition is uniform across the whole block + +Typical cases: + +- loading a tile from global memory into shared memory +- reduction steps between iterations +- transpose or stencil phases where threads consume values written by other threads + +## Why Shared Memory Helps + +The Best Practices Guide highlights three common reasons to use shared memory: + +- avoid redundant loads from global memory +- transform global accesses into coalesced accesses +- avoid wasted bandwidth from strided patterns + +Shared memory is especially useful for tiled GEMM, stencil, convolution, reduction, and transpose kernels. + +## Bank Conflicts + +Shared memory performance depends on bank usage. 
+ +- modern devices expose 32 banks for warp accesses +- successive 32-bit words map to successive banks +- if threads in a warp hit distinct banks, accesses can proceed concurrently +- if multiple threads hit the same bank, the access is split and serialized +- one important exception is broadcast: when threads read the same shared location, hardware can serve that efficiently + +The standard remedy for column-wise access on a 32x32 tile is padding: + +```cpp +__shared__ float tile[32][33]; +``` + +The Best Practices Guide uses this pattern to remove many-way bank conflicts in a transpose-like matrix multiply example. + +## Async Copy Path + +For newer CUDA toolchains and architectures, shared memory can also participate in explicit async copy pipelines from global memory. + +- C++ layer: `__pipeline_memcpy_async`, `__pipeline_commit`, `__pipeline_wait_prior` +- PTX layer: `cp.async`, `cp.async.commit_group`, `cp.async.wait_group`, and mbarrier-based completion + +Use this path when you need to overlap global-to-shared transfers with computation and reduce intermediate register traffic. 
+ +## When To Escalate To PTX Docs + +Stay in CUDA C++ docs for: + +- `__shared__` +- dynamic shared memory launch configuration +- `__syncthreads()` +- bank conflict basics + +Jump to PTX docs for: + +- `.shared` state-space rules +- `cp.async` +- `mbarrier` +- TMA and shared-memory layout/swizzling + +See: + +- `../ptx/references/state-spaces-and-types.md` +- `../ptx/instructions/data-movement/references/cp-async.md` +- `../ptx/instructions/sync-comm/DOC.md` +- `../ptx/instructions/tma/DOC.md` + +## Related Topics + +- CUDA Runtime overview: `../runtime/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Memory-space overview: `../memory-hierarchy/DOC.md` +- Global-memory coalescing: `../coalescing/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Tensor Core usage: `../tensor-cores/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Thread Block Clusters / DSM: `../thread-block-clusters/DOC.md` +- PTX ISA overview: `../ptx/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, `__shared__`: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#shared +- CUDA C++ Programming Guide, synchronization functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#synchronization-functions +- CUDA C++ Best Practices Guide, Shared Memory: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#shared-memory +- CUDA C++ Best Practices Guide, Shared Memory and Memory Banks: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#shared-memory-and-memory-banks +- CUDA C++ Best Practices Guide, Async Copy from Global Memory to Shared Memory: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#asynchronous-copy-from-global-memory-to-shared-memory + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/sparse-and-irregular-kernels/DOC.md 
b/content/cuda/docs/sparse-and-irregular-kernels/DOC.md new file mode 100644 index 00000000..b63ebd85 --- /dev/null +++ b/content/cuda/docs/sparse-and-irregular-kernels/DOC.md @@ -0,0 +1,65 @@ +--- +name: sparse-and-irregular-kernels +description: "CUDA sparse/irregular kernel essentials: load imbalance, indirect access, divergence control, and locality-aware data layouts." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,sparse,irregular,load-balance,divergence,indirect-access,gather,scatter" +--- + +# CUDA Sparse And Irregular Kernels (C++) + +Use this page when access patterns are indirect, data-dependent, or highly skewed. + +## Why These Kernels Are Hard + +Sparse/irregular workloads often suffer from: + +- poor coalescing from indirect addressing +- warp divergence from data-dependent control flow +- load imbalance across warps/blocks +- cache inefficiency from weak locality + +## Design Priorities + +1. reduce divergence where possible. +2. improve memory locality through data reordering. +3. balance work granularity to avoid long-tail warps. +4. isolate hot irregular regions from regular compute regions. 
+ +## Common Patterns + +- work queues for dynamic tasks +- segmented processing for variable-length rows/lists +- gather/scatter with index compression/reordering +- two-phase pipelines: count/scan then compact/execute + +## Practical Techniques + +- reorder indices to improve spatial locality +- use warp-level primitives for local compaction and voting +- split heavy/light workloads into separate kernels +- avoid over-synchronizing global progress paths + +## Typical Pitfalls + +- one-thread-per-item mapping with heavy skew +- atomics on hot addresses without privatization +- excessive branch nesting in the main kernel body + +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Warp primitives: `../warp-primitives/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Fused kernel patterns: `../fused-kernel-design-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory behavior and control divergence context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, execution and memory model background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/streams-and-events/DOC.md b/content/cuda/docs/streams-and-events/DOC.md new file mode 100644 index 00000000..1676504b --- /dev/null +++ b/content/cuda/docs/streams-and-events/DOC.md @@ -0,0 +1,91 @@ +--- +name: streams-and-events +description: "CUDA streams and events essentials: ordering, overlap, cudaStreamWaitEvent, timing, and default-stream caveats." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,streams,events,cudaStreamWaitEvent,cudaEventRecord,cudaEventElapsedTime,default-stream,overlap" +--- + +# CUDA Streams And Events (C++) + +Use this page for CUDA work orchestration on the host side: stream ordering, event dependencies, and timing. + +## Streams + +A stream is an ordered sequence of operations on the device. + +- operations in the same stream execute in issue order +- operations in different streams may overlap when dependencies allow +- stream-level concurrency is the basic CUDA mechanism for overlapping copy and compute + +## Events + +Events are lightweight synchronization markers. + +Common uses: + +- record progress in a stream with `cudaEventRecord` +- make another stream wait with `cudaStreamWaitEvent` +- measure elapsed time with `cudaEventElapsedTime` + +Events are the standard tool for cross-stream dependencies. + +## Basic Cross-Stream Dependency + +```cpp +cudaEvent_t done; +cudaEventCreate(&done); + +kernelA<<<grid, block, 0, streamA>>>(...); +cudaEventRecord(done, streamA); +cudaStreamWaitEvent(streamB, done, 0); +kernelB<<<grid, block, 0, streamB>>>(...); +``` + +This keeps the dependency local and avoids device-wide synchronization. + +## Default Stream Caveat + +The default stream has special behavior. + +- legacy default stream semantics can introduce implicit synchronization +- per-thread default stream semantics behave differently + +Do not assume the default stream behaves like an ordinary user-created stream unless you know which mode your application uses. + +## Timing Rule + +For coarse kernel timing: + +1. create start/end events +2. record them in the target stream +3. synchronize on the end event +4. call `cudaEventElapsedTime` + +This is the standard CUDA timing pattern when you want stream-local measurements. 
+ +## Common Mistakes + +- using `cudaDeviceSynchronize()` when a stream or event sync is enough +- assuming different streams imply overlap without checking dependencies or resources +- forgetting that synchronous APIs can force serialization +- timing a stream with events but synchronizing the whole device + +## Related Topics + +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Async copy pipelines: `../async-copy/DOC.md` +- Runtime API overview: `../runtime/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, streams and concurrency: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, events and cross-stream dependencies: https://docs.nvidia.com/cuda/archive/11.7.0/cuda-c-programming-guide/index.html +- CUDA Runtime API, stream and event functions: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/synchronization/DOC.md b/content/cuda/docs/synchronization/DOC.md new file mode 100644 index 00000000..358e3e72 --- /dev/null +++ b/content/cuda/docs/synchronization/DOC.md @@ -0,0 +1,120 @@ +--- +name: synchronization +description: "CUDA synchronization essentials: __syncthreads, __syncwarp, block-wide visibility, and common barrier rules." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,synchronization,syncthreads,syncwarp,block-barrier,barrier-divergence,__syncthreads__,__syncwarp,barrier,warp,thread-block,memory-ordering" +--- + +# CUDA Synchronization (C++) + +Use this page for CUDA C++ synchronization rules at the thread-block and warp levels. + +## Thread-Block Synchronization + +`__syncthreads()` is the standard block-wide barrier. 
+ +- every non-exited thread in the block must reach it +- it waits until all threads in the block arrive +- global and shared memory accesses before the barrier become visible to threads in the block after the barrier + +Use it when threads in a block communicate through memory. + +Typical cases: + +- one phase writes shared memory and a later phase reads it +- reduction loops between strides +- transpose, stencil, or tiled GEMM phases + +## Conditional Barrier Rule + +Do not place `__syncthreads()` in divergent control flow unless the condition is uniform across the entire block. + +Unsafe pattern: + +```cpp +if (threadIdx.x < 16) { + __syncthreads(); // Wrong unless every thread takes the same branch +} +``` + +Safe pattern: + +```cpp +bool active = threadIdx.x < 16; +if (active) { + // work +} +__syncthreads(); +``` + +## Variants of `__syncthreads()` + +CUDA also provides block-wide variants that combine a barrier with a predicate reduction: + +- `__syncthreads_count(predicate)` +- `__syncthreads_and(predicate)` +- `__syncthreads_or(predicate)` + +Use them when you need a collective decision at block scope without adding a separate reduction pass. + +## Warp-Level Synchronization + +`__syncwarp(mask)` synchronizes participating lanes in a warp. + +- every participating lane must use the same mask +- each calling lane must have its own bit set in the mask +- it provides memory ordering among participating threads + +Use `__syncwarp()` when: + +- threads communicate only within one warp +- you want a lighter-weight barrier than `__syncthreads()` +- you are using warp-specialized code paths + +## Important Distinction: Warp Vote vs Barrier + +Warp vote intrinsics such as: + +- `__all_sync` +- `__any_sync` +- `__ballot_sync` + +do not imply a memory barrier by themselves. Use `__syncwarp()` when lanes must safely communicate through memory. 
+ +## Common Mistakes + +- assuming warp-synchronous execution without an explicit warp barrier +- using `__syncthreads()` in a branch that only some threads take +- reading shared memory written by other threads before a barrier +- using block-wide barriers when the communication scope is only one warp + +## Rule of Thumb + +- use `__syncthreads()` for cross-warp communication inside a block +- use `__syncwarp()` for intra-warp communication +- if the communication path uses shared memory, place the barrier between the producer and consumer phases + +## Related Topics + +- Shared memory usage: `../shared-memory/DOC.md` +- Memory space overview: `../memory-hierarchy/DOC.md` +- Coalesced global access: `../coalescing/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Cooperative Groups: `../cooperative-groups/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` +- PTX synchronization primitives: `../ptx/instructions/sync-comm/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, thread hierarchy and cooperation: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, synchronization functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#synchronization-functions +- CUDA C++ Programming Guide, warp vote and match functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/tensor-core-numerical-validation/DOC.md b/content/cuda/docs/tensor-core-numerical-validation/DOC.md new file mode 100644 index 00000000..eb0e9552 --- /dev/null +++ b/content/cuda/docs/tensor-core-numerical-validation/DOC.md @@ -0,0 +1,71 @@ +--- +name: tensor-core-numerical-validation +description: "Tensor Core numerical validation workflow: 
baseline comparison, tolerance policy, shape coverage, and regression gates." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,tensor-core,numerics,validation,tolerance,baseline,wmma,tf32,fp16,bf16,regression" +--- + +# Tensor Core Numerical Validation (C++) + +Use this page when enabling WMMA/Tensor Core paths and you need a defensible numerical-validation process. + +## Baseline Strategy + +- Keep a trusted reference path (often FP32 accumulate). +- Run identical input tensors through baseline and Tensor Core paths. +- Compare per-output error and aggregate metrics. + +## Tolerance Policy + +Define tolerance before tuning: + +- absolute tolerance +- relative tolerance +- special-case handling for near-zero regions + +Document tolerance by workload category, not by one benchmark snapshot. + +## Coverage Requirements + +Validate across: + +- representative shapes (small, medium, large) +- boundary shapes (tail tiles, non-multiple dimensions) +- realistic value ranges (not only unit random data) +- production-like batch distributions + +## Failure Triage + +If error exceeds policy: + +- check dtype/accumulator configuration first +- check layout and tile mapping consistency +- check whether a supposedly Tensor Core path silently falls back or changes instruction mix +- re-run with deterministic seeds and fixed launch configs + +## Regression Gates + +- Add numerical checks into CI for key shapes. +- Keep per-architecture baselines where behavior differs by hardware mode. +- Block performance-only changes when they break agreed numeric policy. 
+ +## Related Topics + +- Tensor Cores: `../tensor-cores/DOC.md` +- WMMA patterns: `../wmma-kernel-patterns/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, floating-point behavior and Tensor Core context: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, verification guidance: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/tensor-core-pipeline-patterns/DOC.md b/content/cuda/docs/tensor-core-pipeline-patterns/DOC.md new file mode 100644 index 00000000..5f1d25db --- /dev/null +++ b/content/cuda/docs/tensor-core-pipeline-patterns/DOC.md @@ -0,0 +1,102 @@ +--- +name: tensor-core-pipeline-patterns +description: "Tensor Core pipeline patterns: global-to-shared staging, multi-stage K loops, async copy synchronization, and escalation to WGMMA/TMA." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,tensor-core,tensorcore,pipeline,pipelining,multi-stage-pipeline,cp.async,async-copy,shared-memory,mbarrier,wmma,wgmma,tma,double-buffering,stage-depth" +--- + +# Tensor Core Pipeline Patterns (C++) + +Use this page for end-to-end Tensor Core kernel structure, not just a single `mma_sync` call. + +## Why Pipeline Design Dominates + +In real GEMM-like kernels, arithmetic throughput is often high enough that data staging and synchronization decide final performance. 
+ +A strong Tensor Core kernel usually needs: + +- global-memory tile fetch +- shared-memory staging and layout control +- fragment load and matrix instruction issue +- overlapped staging for the next K tile + +## Canonical Multi-Stage Loop + +A practical loop has at least two stages: + +1. Stage N: copy tile data for current compute. +2. Stage N+1: prefetch tile data for next compute step. + +With larger K, three-stage pipelines can smooth latency at the cost of more shared memory and register pressure. + +## Synchronization Boundaries + +You need explicit boundaries between: + +- producer writes to shared memory +- consumer fragment loads +- matrix instruction issue +- buffer reuse for next stage + +At C++ level this usually means structured barrier usage. At lower levels it can include async-copy wait semantics and mbarrier protocols. + +## Shared-Memory Layout Rules + +Tensor Core pipelines fail or slow down when shared layout is wrong. + +- align tile rows/strides for load requirements +- avoid severe bank conflicts in the staging pattern +- keep layout choices consistent with fragment load layout expectations + +## Stage-Depth Tradeoff + +More stages can hide memory latency better, but also: + +- increase shared-memory footprint per block +- reduce occupancy +- increase control complexity + +Tune stage count jointly with block-level warp count and tile shapes. 
+ +## WMMA vs WGMMA/TMA Escalation + +Stay with WMMA-focused C++ pipeline when: + +- supported tile shapes and types fit +- performance is acceptable after staging and synchronization tuning + +Escalate toward lower-level PTX workflows when: + +- you need architecture-specific warpgroup matrix instructions +- you need advanced async tensor movement control +- your kernel requires fine-grained control beyond C++ WMMA surface area + +## Profiling Checks + +- matrix instruction activity is present and dominant in hot loops +- shared-memory pressure is not causing severe bank-serialization stalls +- memory pipeline overlaps compute in timeline and stall analysis +- occupancy remains sufficient for latency hiding + +## Related Topics + +- Tensor Core API overview: `../tensor-cores/DOC.md` +- WMMA practical patterns: `../wmma-kernel-patterns/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- PTX TMA: `../ptx/instructions/tma/DOC.md` +- PTX WGMMA: `../ptx/instructions/wgmma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, asynchronous data movement and pipelines: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide, async copy and memory staging: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- PTX ISA docs for advanced matrix/tensor movement paths: https://docs.nvidia.com/cuda/parallel-thread-execution/ + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/tensor-cores/DOC.md b/content/cuda/docs/tensor-cores/DOC.md new file mode 100644 index 00000000..d804fbb2 --- /dev/null +++ b/content/cuda/docs/tensor-cores/DOC.md @@ -0,0 +1,144 @@ +--- +name: tensor-cores +description: "CUDA Tensor Core essentials: WMMA fragments, load/store rules, mma_sync, and when to drop to PTX WGMMA." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 2 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,tensor-cores,tensor-core,tensorcore,wmma,nvcuda::wmma,warp-matrix-multiply-accumulate,warp-mma,load_matrix_sync,mma_sync,store_matrix_sync,wgmma,mma,matrix-multiply-accumulate,fragment" +--- + +# CUDA Tensor Cores (C++) + +Use this page for the CUDA C++ API view of Tensor Cores. It is the correct first stop for `wmma` questions. + +## Primary API Namespace + +CUDA exposes the warp-level matrix API in `nvcuda::wmma`. + +Core concepts: + +- `wmma::fragment` +- `wmma::load_matrix_sync` +- `wmma::store_matrix_sync` +- `wmma::fill_fragment` +- `wmma::mma_sync` + +All of these are warp-synchronous interfaces. + +## Mental Model + +Each warp collaborates on a matrix tile. + +- matrix A and B tiles are loaded into fragments +- an accumulator fragment holds C / D +- `mma_sync` performs `D = A * B + C` +- results are written back with `store_matrix_sync` + +## Minimal Workflow + +```cpp +using namespace nvcuda; + +wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; +wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; +wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag; + +wmma::fill_fragment(c_frag, 0.0f); +wmma::load_matrix_sync(a_frag, a_ptr, lda); +wmma::load_matrix_sync(b_frag, b_ptr, ldb); +wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); +wmma::store_matrix_sync(d_ptr, c_frag, ldd, wmma::mem_row_major); +``` + +## Usage Rules + +- all threads in the warp must participate +- `mptr`, `ldm`, layout, and template parameters must match across the warp +- memory pointers for matrix loads/stores must satisfy the documented alignment and leading-dimension requirements +- fragment element mapping across lanes is opaque; do not assume a stable per-lane layout + +## Alignment And Stride Constraints + +`load_matrix_sync` and `store_matrix_sync` have strict requirements. 
+ +- the pointer must meet the documented alignment requirement +- `ldm` must satisfy the documented stride constraint in elements +- all lanes in the warp must agree on the arguments + +If these conditions are violated, behavior is undefined or performance will collapse around the staging path. + +## Supported Types And Shapes + +WMMA does not mean "any matrix multiply on Tensor Cores". + +- only specific tile shapes are supported +- only specific multiplicand and accumulator type combinations are supported +- support varies by architecture and API subset + +When the type / shape combination is outside the documented WMMA set, you either stay on the ordinary arithmetic path or move to a lower-level PTX path if the hardware and toolchain support it. + +## Shared Memory Staging Is Common + +High-performance Tensor Core kernels usually do more than call `mma_sync`. + +Typical structure: + +1. move tiles from global memory +2. stage or reorder them in shared memory if needed +3. load fragments +4. execute `mma_sync` +5. store accumulators back to memory + +So Tensor Core performance is often gated by shared-memory layout, coalescing, and synchronization as much as by the MMA instruction itself. 
+ +## Restrictions That Matter + +- fragment layout is architecture-specific +- passing fragments across separately compiled code for different architectures is unsafe +- if fragments must cross an interface boundary, store to memory first and pass ordinary pointers instead + +## When WMMA Is The Right Layer + +Stay with WMMA when: + +- you are writing CUDA C++ kernels +- you want a supported high-level Tensor Core interface +- the problem maps naturally to documented WMMA tile shapes and types + +Drop to PTX when: + +- you need `wgmma` +- you need architecture-specific async MMA protocols +- you are working with TMA, mbarrier, or lower-level Hopper/Blackwell Tensor Core workflows + +## WMMA vs "CUDA Core" Arithmetic + +If a matrix multiply is written as ordinary nested scalar FMAs, it usually runs on the ordinary arithmetic path rather than the Tensor Core path. + +To reliably target Tensor Cores from CUDA C++, use the documented WMMA interfaces or an equivalent library path that emits the required matrix instructions. 
+ +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- Compute throughput model: `../compute-throughput/DOC.md` +- Warp-level execution model: `../warp-primitives/DOC.md` +- Shared memory staging: `../shared-memory/DOC.md` +- WMMA practical patterns: `../wmma-kernel-patterns/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- PTX WGMMA entry: `../ptx/instructions/wgmma/DOC.md` +- PTX TMA entry: `../ptx/instructions/tma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, WMMA API and fragments: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, `load_matrix_sync` / `store_matrix_sync` / `mma_sync`: https://docs.nvidia.com/cuda/archive/9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, Tensor Core restrictions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/thread-block-clusters/DOC.md b/content/cuda/docs/thread-block-clusters/DOC.md new file mode 100644 index 00000000..a4ed8bba --- /dev/null +++ b/content/cuda/docs/thread-block-clusters/DOC.md @@ -0,0 +1,99 @@ +--- +name: thread-block-clusters +description: "CUDA thread block cluster essentials: cluster launch, cluster.sync, distributed shared memory, and portable cluster-size rules." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,thread-block-clusters,cluster,distributed-shared-memory,dsm,cluster.sync,__cluster_dims__,cudaLaunchKernelEx" +--- + +# CUDA Thread Block Clusters (C++) + +Use this page for the CUDA C++ view of cluster launch, cluster-level synchronization, and distributed shared memory. + +## What A Cluster Is + +Thread Block Clusters add an optional hierarchy level above blocks. + +- multiple blocks form one cluster +- blocks in a cluster are co-scheduled on the same GPC +- blocks in the cluster can synchronize and communicate more directly than unrelated blocks + +This feature is available on compute capability 9.0 and higher. + +## Launch Mechanisms + +Clusters can be specified either: + +- at compile time with `__cluster_dims__(x, y, z)` +- at launch time with `cudaLaunchKernelEx` and a cluster-dimension attribute + +Important: + +- `gridDim` still counts blocks, not clusters +- the grid should be compatible with the cluster dimensions + +## Cluster Synchronization + +CUDA exposes cluster-level synchronization through the Cooperative Groups cluster API. + +Typical pattern: + +- obtain the cluster handle +- coordinate phases with `cluster.sync()` + +This is the cluster-scope analogue of block synchronization, but for blocks that belong to the same cluster. + +## Distributed Shared Memory + +Blocks in a cluster can access distributed shared memory. + +That means: + +- a block can read or write shared memory owned by another block in the same cluster +- atomics can also target addresses in distributed shared memory + +This is useful when one block's normal shared memory is too small, but full global-memory communication would be too expensive. + +## Portable Cluster Size Rule + +CUDA documentation describes 8 blocks as the portable maximum cluster size. 
+ +- some hardware or configurations may support less +- some architectures can support larger nonportable sizes +- query support instead of hard-coding assumptions + +Relevant APIs include occupancy helpers such as `cudaOccupancyMaxPotentialClusterSize`. + +## When To Use Clusters + +Clusters are a good fit when: + +- communication across several neighboring blocks is frequent +- distributed shared memory removes expensive global-memory round trips +- the algorithm naturally decomposes into a few tightly coupled blocks + +Avoid them when: + +- the kernel is simple enough for ordinary per-block decomposition +- portability matters more than architecture-specific optimization +- the communication pattern is weak or irregular + +## Related Topics + +- Cooperative Groups: `../cooperative-groups/DOC.md` +- Shared memory usage: `../shared-memory/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Async copy and TMA: `../async-copy/DOC.md` +- PTX cluster / mbarrier / TMA path: `../ptx/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, Thread Block Clusters: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA Programming Guide, modern programming-model introduction to clusters: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/01-introduction/programming-model.html +- Hopper Tuning Guide, distributed shared memory and cluster notes: https://docs.nvidia.com/cuda/archive/12.4.0/hopper-tuning-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/unified-memory/DOC.md b/content/cuda/docs/unified-memory/DOC.md new file mode 100644 index 00000000..6bf76c52 --- /dev/null +++ b/content/cuda/docs/unified-memory/DOC.md @@ -0,0 +1,71 @@ +--- +name: unified-memory +description: "CUDA Unified Memory essentials: managed allocations, migration behavior, prefetch/advice, and common performance pitfalls." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,unified-memory,managed-memory,cudaMallocManaged,cudaMemPrefetchAsync,cudaMemAdvise,page-migration" +--- + +# CUDA Unified Memory (C++) + +Use this page when you need a single pointer model across CPU and GPU with on-demand migration. + +## Core API + +Unified Memory is commonly allocated with: + +- `cudaMallocManaged` + +The runtime and driver can migrate pages between host and device as memory is accessed. + +## Why It Helps + +- simpler programming model for heterogeneous memory access +- easier incremental porting from CPU-oriented code +- fewer explicit memcpy calls in basic workflows + +## Why It Can Be Slow + +On-demand page migration can stall kernels if data is not resident on the device when accessed. + +Symptoms: + +- unpredictable first-touch latency +- page-fault-driven migration overhead +- lower effective bandwidth than explicit transfer pipelines + +## Performance Controls + +Use: + +- `cudaMemPrefetchAsync` to place data near expected access +- `cudaMemAdvise` hints for access patterns and preferred location + +These often reduce migration faults and smooth performance. + +## When To Prefer Explicit Transfers + +Prefer explicit host/device transfers when: + +- access pattern is stable and predictable +- maximum throughput is required +- migration overhead dominates runtime + +Unified Memory is often best for productivity first, then selectively optimized for hot paths. 
+ +## Related Topics + +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, Unified Memory programming: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA Runtime API, managed-memory and memory-advice APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/warp-primitives/DOC.md b/content/cuda/docs/warp-primitives/DOC.md new file mode 100644 index 00000000..d2edd596 --- /dev/null +++ b/content/cuda/docs/warp-primitives/DOC.md @@ -0,0 +1,105 @@ +--- +name: warp-primitives +description: "CUDA warp-level primitives: shuffle, ballot, active masks, syncwarp, and when to replace shared memory with warp collectives." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,warp,warp-primitives,warp-collectives,warp-synchronous,shuffle,ballot,warp-vote,__shfl_sync,__ballot_sync,__activemask,__syncwarp,warp-reduction" +--- + +# CUDA Warp Primitives (C++) + +Use this page for warp-scope communication patterns that avoid block-wide synchronization and often reduce shared-memory traffic. + +## Core Warp Primitives + +Common warp-level intrinsics include: + +- `__shfl_sync` +- `__shfl_down_sync` +- `__shfl_xor_sync` +- `__ballot_sync` +- `__all_sync` +- `__any_sync` +- `__activemask` +- `__syncwarp` + +These operate on the active lanes of a warp and require a consistent participation mask. 
+ +## When Warp Primitives Help + +Use warp primitives when: + +- communication stays within one warp +- you want to avoid shared memory for a small reduction or exchange +- a block-wide barrier would be too expensive or unnecessary + +Typical cases: + +- warp reductions +- prefix-like exchanges within a warp +- voting and mask construction +- lane permutation for register-resident data + +## Shuffle vs Shared Memory + +Shuffle intrinsics move register values directly between lanes. + +Prefer shuffle when: + +- the communication scope is one warp +- data volume is small +- you want to avoid shared-memory stores, loads, and `__syncthreads()` + +Prefer shared memory when: + +- communication crosses warp boundaries +- the data footprint exceeds what is comfortable in registers +- the access pattern spans the whole block + +## Memory Ordering Rule + +- `__syncwarp()` provides warp-scope synchronization and memory ordering for participating lanes +- vote intrinsics such as `__ballot_sync` do not by themselves imply a memory barrier + +If lanes communicate through memory, insert `__syncwarp()`. + +## Minimal Warp Reduction Pattern + +```cpp +float x = value; +for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_down_sync(0xffffffff, x, offset); +} +``` + +This is the standard first step before reducing across warps with shared memory or atomics. + +## Mask Discipline + +For `_sync` intrinsics: + +- every participating lane must use the same mask +- each calling lane must have its own bit set in the mask +- all named non-exited lanes must execute the same intrinsic with the same mask + +Violating mask discipline leads to undefined behavior. 
+ +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- Block and warp synchronization: `../synchronization/DOC.md` +- Shared memory alternatives: `../shared-memory/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Tensor Core warp-level usage: `../tensor-cores/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, warp vote and match functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, shuffle functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, `__syncwarp()`: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#synchronization-functions + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/wmma-debugging-checklist/DOC.md b/content/cuda/docs/wmma-debugging-checklist/DOC.md new file mode 100644 index 00000000..b2712d02 --- /dev/null +++ b/content/cuda/docs/wmma-debugging-checklist/DOC.md @@ -0,0 +1,61 @@ +--- +name: wmma-debugging-checklist +description: "WMMA debugging checklist: fragment/layout mismatches, leading-dimension issues, warp participation errors, and profiling verification." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,wmma,debugging,checklist,tensor-core,fragment,load_matrix_sync,mma_sync,store_matrix_sync,ldm,alignment" +--- + +# WMMA Debugging Checklist (C++) + +Use this page when a WMMA kernel is incorrect, unstable, or unexpectedly slow. + +## Correctness Checklist + +- Warp participation is complete for every WMMA call. +- `matrix_a` / `matrix_b` layout templates match actual memory layout. +- `ldm` values are in elements and match tensor strides. +- Load/store pointers satisfy required alignment. +- Accumulator type and final store type match intended precision policy. 
+ +## Common Failure Signatures + +- Output full of zeros or repeated blocks: wrong pointer arithmetic or tile mapping. +- Numerically wrong but stable shape: wrong layout or `ldm` mismatch. +- Intermittent corruption: partial-warp execution or out-of-bounds tile guards. +- Correct output but poor speed: data staging dominates, not matrix instruction issue. + +## Profiling Checklist + +- Confirm matrix instruction activity is present. +- Confirm expected hot kernels use Tensor Core-capable instruction mix. +- Check shared-memory staging quality and bank-conflict pressure. +- Check occupancy/register pressure after unrolling and staging changes. + +## Minimal Debug Order + +1. Validate one warp, one tile, one K-step. +2. Validate full K-loop accumulation. +3. Scale to multi-warp block mapping. +4. Add pipelining/staging optimizations only after correctness is stable. + +## Related Topics + +- Tensor Core overview: `../tensor-cores/DOC.md` +- WMMA kernel patterns: `../wmma-kernel-patterns/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, WMMA APIs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Programming Guide, Tensor Core restrictions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/wmma-kernel-patterns/DOC.md b/content/cuda/docs/wmma-kernel-patterns/DOC.md new file mode 100644 index 00000000..7766e5ea --- /dev/null +++ b/content/cuda/docs/wmma-kernel-patterns/DOC.md @@ -0,0 +1,107 @@ +--- +name: wmma-kernel-patterns +description: "Practical WMMA kernel patterns: warp-to-tile mapping, 
fragment loading rules, accumulator handling, and common failure modes." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,wmma,tensor-core,tensorcore,nvcuda::wmma,warp-matrix-multiply-accumulate,warp-mma,matrix-multiply-accumulate,fragment,mma_sync,load_matrix_sync,store_matrix_sync,gemm" +--- + +# WMMA Kernel Patterns (C++) + +Use this page when you need a practical implementation pattern for `nvcuda::wmma`, not just API names. + +## Warp-To-Tile Mapping + +The baseline mapping is one warp per output tile: + +- one warp loads A/B tile fragments +- one warp keeps the accumulator fragment +- one warp stores results back + +Scale to larger problems by assigning multiple warps per block and iterating over K tiles. + +## Minimal Pattern Skeleton + +```cpp +using namespace nvcuda; + +__global__ void wmma_gemm_kernel(const half* A, const half* B, float* C, + int M, int N, int K, + int lda, int ldb, int ldc) { + int warp_id_in_block = threadIdx.x / 32; + int lane_id = threadIdx.x % 32; + + int warp_m = (blockIdx.y * (blockDim.x / 32) + warp_id_in_block); + int warp_n = blockIdx.x; + + if (warp_m * 16 >= M || warp_n * 16 >= N) return; + + wmma::fragment c_frag; + wmma::fill_fragment(c_frag, 0.0f); + + for (int k0 = 0; k0 < K; k0 += 16) { + wmma::fragment a_frag; + wmma::fragment b_frag; + + const half* a_ptr = A + (warp_m * 16) * lda + k0; + const half* b_ptr = B + k0 * ldb + (warp_n * 16); + + wmma::load_matrix_sync(a_frag, a_ptr, lda); + wmma::load_matrix_sync(b_frag, b_ptr, ldb); + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + } + + float* c_ptr = C + (warp_m * 16) * ldc + (warp_n * 16); + wmma::store_matrix_sync(c_ptr, c_frag, ldc, wmma::mem_row_major); +} +``` + +This skeleton is intentionally simple. Production kernels usually add shared-memory staging and pipelining. 
+ +## Critical Correctness Rules + +- All lanes in the warp must execute the WMMA calls with consistent arguments. +- Layout and leading-dimension parameters must match fragment template expectations. +- Pointer alignment and stride constraints for load/store must satisfy API requirements. +- Fragment internal lane mapping is opaque; do not index fragment storage with custom lane assumptions. + +## High-Value Performance Patterns + +- Stage A/B tiles in shared memory to reduce uncoalesced global traffic. +- Use double-buffered tile staging when K is large. +- Keep one accumulator fragment alive across K-loop iterations. +- Control register pressure before adding heavy unrolling. + +## Common Failure Modes + +- Wrong `row_major`/`col_major` choice for multiplicands. +- Incorrect `lda`/`ldb`/`ldc` in element units. +- Partial-warp execution due to guard branches around WMMA calls. +- Correct output with low speed because data movement dominates MMA throughput. + +## Verification Workflow + +1. Compare numerics against a trusted GEMM baseline. +2. Confirm matrix instruction activity in profiler output. +3. Confirm shared-memory staging efficiency and low bank-conflict pressure. +4. Sweep block-level warp count and K-step scheduling for throughput. 
+ +## Related Topics + +- Tensor Core overview: `../tensor-cores/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- PTX WGMMA: `../ptx/instructions/wgmma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, WMMA API: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Programming Guide, Tensor Core usage restrictions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/docs/features/search-regression.md b/docs/features/search-regression.md new file mode 100644 index 00000000..d4884ab0 --- /dev/null +++ b/docs/features/search-regression.md @@ -0,0 +1,43 @@ +# Search Regression + +This workflow provides repeatable search-quality checks for local Context Hub content. + +## Files + +- `scripts/search_regression.py`: regression runner. +- `scripts/search_regression_cases.json`: query cases and expectations. +- `scripts/search_regression_baseline.json`: generated snapshot (current top results). + +## Run + +From repository root: + +```bash +python3 scripts/search_regression.py --mode check +``` + +Generate a fresh snapshot/baseline: + +```bash +python3 scripts/search_regression.py --mode snapshot +``` + +## Case Format + +Each case in `search_regression_cases.json` supports: + +- `id`: stable case identifier +- `query`: search query text +- `tags`: optional `--tags` value +- `lang`: optional `--lang` value +- `limit`: search result count +- `top_k`: range used for assertions +- `expect_top1`: expected id at rank 1 +- `expect_all`: all expected ids must appear in top-k +- `expect_any`: at least one id must appear in top-k +- `expect_absent`: ids that must not appear in top-k + +## CI Suggestion + +- Run `python3 scripts/search_regression.py --mode check` after `chub build`. 
+- Store `scripts/search_regression_baseline.json` as an artifact to track ranking drift. diff --git a/scripts/search_regression.py b/scripts/search_regression.py new file mode 100644 index 00000000..20d8f7c4 --- /dev/null +++ b/scripts/search_regression.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""Run search regression checks for Context Hub.""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +@dataclass +class CaseResult: + case_id: str + query: str + passed: bool + message: str + top_ids: list[str] + raw_results: list[dict[str, Any]] + + +def _load_json(path: Path) -> Any: + return json.loads(path.read_text(encoding="utf-8")) + + +def _run_search( + chub: str, query: str, tags: str | None, lang: str | None, limit: int +) -> list[dict[str, Any]]: + cmd = [chub, "search", query, "--limit", str(limit), "--json"] + if tags: + cmd.extend(["--tags", tags]) + if lang: + cmd.extend(["--lang", lang]) + + proc = subprocess.run(cmd, capture_output=True, text=True, check=False) + if proc.returncode != 0: + raise RuntimeError( + f"command failed ({proc.returncode}): {' '.join(cmd)}\n{proc.stderr.strip()}" + ) + + text = proc.stdout.strip() + if not text: + raise RuntimeError(f"empty output for query: {query}") + + try: + payload = json.loads(text) + except json.JSONDecodeError as exc: + raise RuntimeError(f"invalid JSON output for query: {query}\n{text}") from exc + + results = payload.get("results") + if not isinstance(results, list): + raise RuntimeError(f"missing `results` in output for query: {query}") + return results + + +def _evaluate_case(case: dict[str, Any], results: list[dict[str, Any]]) -> CaseResult: + case_id = str(case["id"]) + query = str(case["query"]) + top_k = int(case.get("top_k", len(results))) + top_ids = [str(r.get("id", "")) for r in results[:top_k]] + + expect_top1 = case.get("expect_top1") + expect_all = 
[str(x) for x in case.get("expect_all", [])] + expect_any = [str(x) for x in case.get("expect_any", [])] + expect_absent = [str(x) for x in case.get("expect_absent", [])] + + failures: list[str] = [] + if expect_top1 and (not top_ids or top_ids[0] != expect_top1): + got = top_ids[0] if top_ids else "" + failures.append(f"top1 expected `{expect_top1}`, got `{got}`") + + missing_all = [x for x in expect_all if x not in top_ids] + if missing_all: + failures.append(f"missing expected ids in top-{top_k}: {missing_all}") + + if expect_any and not any(x in top_ids for x in expect_any): + failures.append(f"none of expect_any found in top-{top_k}: {expect_any}") + + present_absent = [x for x in expect_absent if x in top_ids] + if present_absent: + failures.append(f"unexpected ids found in top-{top_k}: {present_absent}") + + if failures: + return CaseResult( + case_id=case_id, + query=query, + passed=False, + message="; ".join(failures), + top_ids=top_ids, + raw_results=results, + ) + return CaseResult( + case_id=case_id, + query=query, + passed=True, + message="ok", + top_ids=top_ids, + raw_results=results, + ) + + +def _snapshot_payload(run_results: list[CaseResult]) -> dict[str, Any]: + cases: list[dict[str, Any]] = [] + for r in run_results: + top_items = [] + for item in r.raw_results[:10]: + top_items.append( + { + "id": item.get("id"), + "name": item.get("name"), + "score": item.get("_score"), + } + ) + cases.append( + { + "id": r.case_id, + "query": r.query, + "passed": r.passed, + "top_ids": r.top_ids, + "top_items": top_items, + } + ) + return {"cases": cases} + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run Context Hub search regressions.") + parser.add_argument( + "--cases", + default="scripts/search_regression_cases.json", + help="Path to regression case JSON file.", + ) + parser.add_argument( + "--chub", + default="./cli/bin/chub", + help="Path to chub executable.", + ) + parser.add_argument( + "--mode", + choices=["check", 
"snapshot"], + default="check", + help="check: assert expectations; snapshot: emit current top results", + ) + parser.add_argument( + "--snapshot-out", + default="scripts/search_regression_baseline.json", + help="Where to write snapshot/baseline JSON.", + ) + args = parser.parse_args() + + cases = _load_json(Path(args.cases)) + if not isinstance(cases, list) or not cases: + print("error: cases file must be a non-empty JSON array", file=sys.stderr) + return 2 + + run_results: list[CaseResult] = [] + hard_failures = 0 + + for case in cases: + try: + query = str(case["query"]) + limit = int(case.get("limit", 5)) + tags = case.get("tags") + lang = case.get("lang") + results = _run_search(args.chub, query, tags, lang, limit) + result = _evaluate_case(case, results) + except Exception as exc: # pragma: no cover + hard_failures += 1 + case_id = str(case.get("id", "")) + query = str(case.get("query", "")) + result = CaseResult( + case_id=case_id, + query=query, + passed=False, + message=str(exc), + top_ids=[], + raw_results=[], + ) + run_results.append(result) + + pass_count = sum(1 for r in run_results if r.passed) + fail_count = len(run_results) - pass_count + + for r in run_results: + status = "PASS" if r.passed else "FAIL" + print(f"[{status}] {r.case_id}: {r.message}") + if not r.passed: + print(f" query={r.query}") + if r.top_ids: + print(f" top_ids={r.top_ids}") + + snapshot = _snapshot_payload(run_results) + out_path = Path(args.snapshot_out) + if args.mode == "snapshot" or fail_count: + out_path.write_text(json.dumps(snapshot, indent=2), encoding="utf-8") + print(f"wrote snapshot: {out_path}") + + print( + f"summary: total={len(run_results)} pass={pass_count} fail={fail_count} hard_failures={hard_failures}" + ) + if args.mode == "snapshot": + return 0 + return 0 if fail_count == 0 else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/search_regression_baseline.json b/scripts/search_regression_baseline.json new file mode 100644 
index 00000000..f9638351 --- /dev/null +++ b/scripts/search_regression_baseline.json @@ -0,0 +1,2458 @@ +{ + "cases": [ + { + "id": "wmma-how-to-use", + "query": "how to use wmma", + "passed": true, + "top_ids": [ + "cuda/wmma-kernel-patterns", + "cuda/wmma-debugging-checklist", + "cuda/tensor-cores" + ], + "top_items": [ + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 43.61714540227329 + }, + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 41.959203565519395 + }, + { + "id": "cuda/tensor-cores", + "name": "tensor-cores", + "score": 12.674713742843819 + }, + { + "id": "cuda/ptx-wgmma-instructions", + "name": "ptx-wgmma-instructions", + "score": 8.150786522578006 + }, + { + "id": "cuda/tensor-core-numerical-validation", + "name": "tensor-core-numerical-validation", + "score": 7.805048644402254 + } + ] + }, + { + "id": "shared-memory-core", + "query": "shared memory cuda", + "passed": true, + "top_ids": [ + "cuda/shared-memory", + "cuda/memory-hierarchy", + "cuda/unified-memory", + "cuda/memory-bound-kernel-optimization-playbook", + "cuda/memory-fences-and-ordering" + ], + "top_items": [ + { + "id": "cuda/shared-memory", + "name": "shared-memory", + "score": 657.484807905159 + }, + { + "id": "cuda/memory-hierarchy", + "name": "memory-hierarchy", + "score": 71.98408450085644 + }, + { + "id": "cuda/unified-memory", + "name": "unified-memory", + "score": 59.236138761936736 + }, + { + "id": "cuda/memory-bound-kernel-optimization-playbook", + "name": "memory-bound-kernel-optimization-playbook", + "score": 56.79127964172355 + }, + { + "id": "cuda/memory-fences-and-ordering", + "name": "memory-fences-and-ordering", + "score": 53.301943695582 + } + ] + }, + { + "id": "tensor-core-pipeline", + "query": "tensor core pipeline", + "passed": true, + "top_ids": [ + "cuda/tensor-core-pipeline-patterns", + "cuda/tensor-core-numerical-validation", + "cuda/cuda-core-vs-tensor-core-path-selection", + 
"cuda/tensor-cores", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/tensor-core-pipeline-patterns", + "name": "tensor-core-pipeline-patterns", + "score": 640.7984395849124 + }, + { + "id": "cuda/tensor-core-numerical-validation", + "name": "tensor-core-numerical-validation", + "score": 61.14991223278459 + }, + { + "id": "cuda/cuda-core-vs-tensor-core-path-selection", + "name": "cuda-core-vs-tensor-core-path-selection", + "score": 60.62135318417785 + }, + { + "id": "cuda/tensor-cores", + "name": "tensor-cores", + "score": 46.70057996177767 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 34.07011220539216 + } + ] + }, + { + "id": "cuda-core-checklist", + "query": "cuda core optimization checklist", + "passed": true, + "top_ids": [ + "cuda/cuda-core-optimization-checklist", + "cuda/cuda-core", + "cuda/production-readiness-checklist", + "cuda/wmma-debugging-checklist", + "cuda/compute-bound-kernel-optimization-playbook" + ], + "top_items": [ + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 751.9375339114197 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 70.84057076343724 + }, + { + "id": "cuda/production-readiness-checklist", + "name": "production-readiness-checklist", + "score": 62.75340373370601 + }, + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 59.63545206984803 + }, + { + "id": "cuda/compute-bound-kernel-optimization-playbook", + "name": "compute-bound-kernel-optimization-playbook", + "score": 56.75325281572377 + } + ] + }, + { + "id": "ptx-mbarrier-patterns", + "query": "ptx cp.async mbarrier", + "passed": true, + "top_ids": [ + "cuda/ptx-mbarrier-protocol-patterns", + "cuda/ptx", + "cuda/ptx-data-movement-instructions" + ], + "top_items": [ + { + "id": "cuda/ptx-mbarrier-protocol-patterns", + "name": "ptx-mbarrier-protocol-patterns", + "score": 99.09204708236842 + }, + { + "id": "cuda/ptx", + "name": 
"ptx", + "score": 63.73408756781064 + }, + { + "id": "cuda/ptx-data-movement-instructions", + "name": "ptx-data-movement-instructions", + "score": 53.065078148339296 + }, + { + "id": "cuda/ptx-sync-comm-instructions", + "name": "ptx-sync-comm-instructions", + "score": 52.96398634389052 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 49.76827473487636 + } + ] + }, + { + "id": "wmma-debugging", + "query": "wmma debugging checklist", + "passed": true, + "top_ids": [ + "cuda/wmma-debugging-checklist", + "cuda/performance-debugging", + "cuda/wmma-kernel-patterns" + ], + "top_items": [ + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 740.5267488953604 + }, + { + "id": "cuda/performance-debugging", + "name": "performance-debugging", + "score": 43.61799897347794 + }, + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 43.61714540227329 + }, + { + "id": "cuda/production-readiness-checklist", + "name": "production-readiness-checklist", + "score": 43.036883602177625 + }, + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 39.59373341109431 + } + ] + }, + { + "id": "warp-primitives", + "query": "warp primitives shuffle ballot syncwarp", + "passed": true, + "top_ids": [ + "cuda/warp-primitives", + "cuda/ptx-warp-synchronization-patterns", + "cuda/synchronization", + "cuda/atomics-and-reductions", + "cuda/wmma-kernel-patterns" + ], + "top_items": [ + { + "id": "cuda/warp-primitives", + "name": "warp-primitives", + "score": 150.80478939459522 + }, + { + "id": "cuda/ptx-warp-synchronization-patterns", + "name": "ptx-warp-synchronization-patterns", + "score": 42.69307820298248 + }, + { + "id": "cuda/synchronization", + "name": "synchronization", + "score": 26.19282758027902 + }, + { + "id": "cuda/atomics-and-reductions", + "name": "atomics-and-reductions", + "score": 12.974093484309805 + }, + { + "id": 
"cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 12.04094824018038 + } + ] + }, + { + "id": "sync-basics", + "query": "cuda synchronization syncthreads syncwarp", + "passed": true, + "top_ids": [ + "cuda/synchronization", + "cuda/ptx-warp-synchronization-patterns", + "cuda/cuda-graphs", + "cuda/ptx-sync-comm-instructions", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/synchronization", + "name": "synchronization", + "score": 109.96763789392514 + }, + { + "id": "cuda/ptx-warp-synchronization-patterns", + "name": "ptx-warp-synchronization-patterns", + "score": 47.14551457940909 + }, + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 33.21805884552287 + }, + { + "id": "cuda/ptx-sync-comm-instructions", + "name": "ptx-sync-comm-instructions", + "score": 33.201092367183755 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 32.33557499378493 + } + ] + }, + { + "id": "coalescing", + "query": "cuda memory coalescing global load store", + "passed": true, + "top_ids": [ + "cuda/coalescing", + "cuda/memory-hierarchy", + "cuda/memory-bound-kernel-optimization-playbook" + ], + "top_items": [ + { + "id": "cuda/coalescing", + "name": "coalescing", + "score": 115.17972318690447 + }, + { + "id": "cuda/memory-hierarchy", + "name": "memory-hierarchy", + "score": 74.83896537094957 + }, + { + "id": "cuda/memory-bound-kernel-optimization-playbook", + "name": "memory-bound-kernel-optimization-playbook", + "score": 60.915971003945096 + }, + { + "id": "cuda/unified-memory", + "name": "unified-memory", + "score": 59.236138761936736 + }, + { + "id": "cuda/shared-memory", + "name": "shared-memory", + "score": 57.612339389511895 + } + ] + }, + { + "id": "occupancy-and-launch-bounds", + "query": "occupancy register pressure launch bounds", + "passed": true, + "top_ids": [ + "cuda/launch-bounds-and-registers", + "cuda/occupancy", + "cuda/launch-bound-optimization-playbook", + 
"cuda/compute-bound-kernel-optimization-playbook" + ], + "top_items": [ + { + "id": "cuda/launch-bounds-and-registers", + "name": "launch-bounds-and-registers", + "score": 140.30542341076642 + }, + { + "id": "cuda/occupancy", + "name": "occupancy", + "score": 86.08574569491097 + }, + { + "id": "cuda/launch-bound-optimization-playbook", + "name": "launch-bound-optimization-playbook", + "score": 40.07254299673837 + }, + { + "id": "cuda/compute-bound-kernel-optimization-playbook", + "name": "compute-bound-kernel-optimization-playbook", + "score": 37.7438974501102 + }, + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 28.46629509303759 + } + ] + }, + { + "id": "unified-memory", + "query": "cuda unified memory prefetch advise", + "passed": true, + "top_ids": [ + "cuda/unified-memory", + "cuda/memory-hierarchy", + "cuda/shared-memory", + "cuda/memory-fences-and-ordering", + "cuda/pinned-memory-and-transfers" + ], + "top_items": [ + { + "id": "cuda/unified-memory", + "name": "unified-memory", + "score": 125.58254962497524 + }, + { + "id": "cuda/memory-hierarchy", + "name": "memory-hierarchy", + "score": 62.368554193218 + }, + { + "id": "cuda/shared-memory", + "name": "shared-memory", + "score": 57.612339389511895 + }, + { + "id": "cuda/memory-fences-and-ordering", + "name": "memory-fences-and-ordering", + "score": 53.301943695582 + }, + { + "id": "cuda/pinned-memory-and-transfers", + "name": "pinned-memory-and-transfers", + "score": 52.80653268638846 + } + ] + }, + { + "id": "graphs", + "query": "cuda graphs stream capture", + "passed": true, + "top_ids": [ + "cuda/cuda-graphs", + "cuda/launch-bound-optimization-playbook", + "cuda/streams-and-events", + "cuda/cuda-core", + "cuda/cublas-cudnn-integration-patterns" + ], + "top_items": [ + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 124.38204740075595 + }, + { + "id": "cuda/launch-bound-optimization-playbook", + "name": 
"launch-bound-optimization-playbook", + "score": 41.93322479720487 + }, + { + "id": "cuda/streams-and-events", + "name": "streams-and-events", + "score": 34.91091385571819 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 32.33557499378493 + }, + { + "id": "cuda/cublas-cudnn-integration-patterns", + "name": "cublas-cudnn-integration-patterns", + "score": 30.954401006440616 + } + ] + }, + { + "id": "numerics", + "query": "cuda numerics precision tf32 fp16 bf16", + "passed": true, + "top_ids": [ + "cuda/numerics-and-precision", + "cuda/tensor-core-numerical-validation", + "cuda/cuda-graphs", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/numerics-and-precision", + "name": "numerics-and-precision", + "score": 177.0308253013444 + }, + { + "id": "cuda/tensor-core-numerical-validation", + "name": "tensor-core-numerical-validation", + "score": 51.97943487991361 + }, + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 33.21805884552287 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 32.33557499378493 + }, + { + "id": "cuda/cuda-core-vs-tensor-core-path-selection", + "name": "cuda-core-vs-tensor-core-path-selection", + "score": 31.350730159755678 + } + ] + }, + { + "id": "bottleneck-workflow", + "query": "kernel bottleneck diagnosis workflow", + "passed": true, + "top_ids": [ + "cuda/kernel-bottleneck-diagnosis-workflow", + "cuda/nvtx-and-profiling-workflow", + "cuda/fused-kernel-design-patterns", + "cuda/kernel-api-design-guidelines", + "cuda/wmma-kernel-patterns" + ], + "top_items": [ + { + "id": "cuda/kernel-bottleneck-diagnosis-workflow", + "name": "kernel-bottleneck-diagnosis-workflow", + "score": 770.585046598412 + }, + { + "id": "cuda/nvtx-and-profiling-workflow", + "name": "nvtx-and-profiling-workflow", + "score": 32.23575901211767 + }, + { + "id": "cuda/fused-kernel-design-patterns", + "name": "fused-kernel-design-patterns", + "score": 26.4992951571028 + }, + { + "id": 
"cuda/kernel-api-design-guidelines", + "name": "kernel-api-design-guidelines", + "score": 24.28432268394546 + }, + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 24.246978382914747 + } + ] + }, + { + "id": "production-readiness", + "query": "cuda production readiness checklist", + "passed": true, + "top_ids": [ + "cuda/production-readiness-checklist", + "cuda/cuda-core-optimization-checklist", + "cuda/wmma-debugging-checklist", + "cuda/cuda-graphs", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/production-readiness-checklist", + "name": "production-readiness-checklist", + "score": 763.855058540744 + }, + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 66.71264663506466 + }, + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 55.240305689543874 + }, + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 33.21805884552287 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 32.33557499378493 + } + ] + }, + { + "id": "ptx-wgmma", + "query": "ptx wgmma commit wait fence", + "passed": true, + "top_ids": [ + "cuda/ptx-wgmma-instructions", + "cuda/ptx-mbarrier-protocol-patterns", + "cuda/ptx" + ], + "top_items": [ + { + "id": "cuda/ptx-wgmma-instructions", + "name": "ptx-wgmma-instructions", + "score": 81.40918180665194 + }, + { + "id": "cuda/ptx-mbarrier-protocol-patterns", + "name": "ptx-mbarrier-protocol-patterns", + "score": 50.39363273422481 + }, + { + "id": "cuda/ptx", + "name": "ptx", + "score": 48.234964787326334 + }, + { + "id": "cuda/ptx-integer-instructions", + "name": "ptx-integer-instructions", + "score": 43.47704292979509 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 42.05795217677962 + } + ] + }, + { + "id": "ptx-atomics", + "query": "ptx atom cas red redux", + "passed": true, + "top_ids": [ + "cuda/ptx-atomic-and-reduction-patterns", 
+ "cuda/ptx", + "cuda/ptx-integer-instructions" + ], + "top_items": [ + { + "id": "cuda/ptx-atomic-and-reduction-patterns", + "name": "ptx-atomic-and-reduction-patterns", + "score": 99.5493879622509 + }, + { + "id": "cuda/ptx", + "name": "ptx", + "score": 48.234964787326334 + }, + { + "id": "cuda/ptx-integer-instructions", + "name": "ptx-integer-instructions", + "score": 43.47704292979509 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 42.05795217677962 + }, + { + "id": "cuda/ptx-special-registers", + "name": "ptx-special-registers", + "score": 41.54763657890468 + } + ] + }, + { + "id": "ptx-integer-bitops", + "query": "ptx integer bit manipulation lop3 bfe bfi", + "passed": true, + "top_ids": [ + "cuda/ptx-integer-bit-manipulation-patterns", + "cuda/ptx-integer-instructions", + "cuda/ptx" + ], + "top_items": [ + { + "id": "cuda/ptx-integer-bit-manipulation-patterns", + "name": "ptx-integer-bit-manipulation-patterns", + "score": 203.76661290339422 + }, + { + "id": "cuda/ptx-integer-instructions", + "name": "ptx-integer-instructions", + "score": 105.4968343941433 + }, + { + "id": "cuda/ptx", + "name": "ptx", + "score": 48.234964787326334 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 42.05795217677962 + }, + { + "id": "cuda/ptx-special-registers", + "name": "ptx-special-registers", + "score": 41.54763657890468 + } + ] + }, + { + "id": "metal-kernel-basics", + "query": "metal kernel thread_position_in_grid threadgroup", + "passed": true, + "top_ids": [ + "apple/metal-kernel-basics", + "apple/metal-memory-and-threadgroup", + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-image-and-2d-kernel-patterns", + "apple/metal-kernel-debugging-checklist" + ], + "top_items": [ + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 106.4300456192966 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 
75.96656405137797 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 65.93255891081415 + }, + { + "id": "apple/metal-image-and-2d-kernel-patterns", + "name": "metal-image-and-2d-kernel-patterns", + "score": 65.58969661795014 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 54.475122634881814 + } + ] + }, + { + "id": "metal-compute-launch", + "query": "metal compute pipeline dispatchthreads threadsperthreadgroup", + "passed": true, + "top_ids": [ + "apple/metal-compute-launch-patterns", + "apple/metal-library-and-pipeline-compilation", + "apple/metal-double-buffered-pipeline-patterns", + "apple/metal-multistage-tensor-pipeline-patterns", + "apple/metal-performance-tuning" + ], + "top_items": [ + { + "id": "apple/metal-compute-launch-patterns", + "name": "metal-compute-launch-patterns", + "score": 81.20799849169319 + }, + { + "id": "apple/metal-library-and-pipeline-compilation", + "name": "metal-library-and-pipeline-compilation", + "score": 67.40609263535065 + }, + { + "id": "apple/metal-double-buffered-pipeline-patterns", + "name": "metal-double-buffered-pipeline-patterns", + "score": 64.63681625621987 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 62.65374604665615 + }, + { + "id": "apple/metal-performance-tuning", + "name": "metal-performance-tuning", + "score": 58.68451844518415 + } + ] + }, + { + "id": "metal-debugging", + "query": "metal kernel debugging validation resource binding dispatch", + "passed": true, + "top_ids": [ + "apple/metal-kernel-debugging-checklist", + "apple/metal-resource-binding-checklist", + "apple/metal-kernel-basics", + "apple/metal-validation-and-profiling-workflow", + "apple/metal-numerical-drift-debugging-checklist" + ], + "top_items": [ + { + "id": "apple/metal-kernel-debugging-checklist", + "name": 
"metal-kernel-debugging-checklist", + "score": 126.74782895167914 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 110.21135302749931 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 60.64701989263922 + }, + { + "id": "apple/metal-validation-and-profiling-workflow", + "name": "metal-validation-and-profiling-workflow", + "score": 60.419950929159796 + }, + { + "id": "apple/metal-numerical-drift-debugging-checklist", + "name": "metal-numerical-drift-debugging-checklist", + "score": 58.368586933075974 + } + ] + }, + { + "id": "metal-buffer-alignment", + "query": "metal buffer alignment bytesperrow texture buffer", + "passed": true, + "top_ids": [ + "apple/metal-buffer-layout-and-alignment", + "apple/metal-texture-vs-buffer-path-selection", + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-resource-binding-checklist", + "apple/metal-image-and-2d-kernel-patterns" + ], + "top_items": [ + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 166.45136159680763 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 123.42774808519891 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 102.71776579884074 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 69.7832516255418 + }, + { + "id": "apple/metal-image-and-2d-kernel-patterns", + "name": "metal-image-and-2d-kernel-patterns", + "score": 50.4802844899155 + } + ] + }, + { + "id": "metal-simdgroup", + "query": "metal simdgroup subgroup warp-like", + "passed": true, + "top_ids": [ + "apple/metal-simdgroup-patterns", + "apple/metal-kernel-basics", + "apple/metal-memory-and-threadgroup", + "apple/metal-reduction-patterns", + 
"apple/metal-validation-and-profiling-workflow" + ], + "top_items": [ + { + "id": "apple/metal-simdgroup-patterns", + "name": "metal-simdgroup-patterns", + "score": 113.35485504328487 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 42.407947067917846 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 37.69803028218896 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 34.75251511203153 + }, + { + "id": "apple/metal-validation-and-profiling-workflow", + "name": "metal-validation-and-profiling-workflow", + "score": 31.512843247131073 + } + ] + }, + { + "id": "metal-performance", + "query": "metal performance tuning dispatchthreads command buffer pipeline reuse", + "passed": true, + "top_ids": [ + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-performance-tuning", + "apple/metal-persistent-objects-and-submission-overhead" + ], + "top_items": [ + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 139.77734751758967 + }, + { + "id": "apple/metal-performance-tuning", + "name": "metal-performance-tuning", + "score": 138.58925265403658 + }, + { + "id": "apple/metal-persistent-objects-and-submission-overhead", + "name": "metal-persistent-objects-and-submission-overhead", + "score": 74.5000704874895 + }, + { + "id": "apple/metal-prefetch-and-reuse-heuristics", + "name": "metal-prefetch-and-reuse-heuristics", + "score": 69.2617883114066 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 68.70864987858005 + } + ] + }, + { + "id": "pytorch-custom-metal", + "query": "pytorch mps vs custom metal", + "passed": true, + "top_ids": [ + "apple/pytorch-mps-vs-custom-metal" + ], + "top_items": [ + { + "id": "apple/pytorch-mps-vs-custom-metal", + "name": 
"pytorch-mps-vs-custom-metal", + "score": 790.3555392604801 + } + ] + }, + { + "id": "metal-host-wrapper", + "query": "metal host wrapper pipeline cache command encoder", + "passed": true, + "top_ids": [ + "apple/metal-host-wrapper-patterns", + "apple/metal-host-device-synchronization-checklist", + "apple/metal-pytorch-custom-op-host-patterns", + "apple/metal-multistage-tensor-pipeline-patterns", + "apple/metal-command-buffer-reuse-and-batching" + ], + "top_items": [ + { + "id": "apple/metal-host-wrapper-patterns", + "name": "metal-host-wrapper-patterns", + "score": 132.7524568879407 + }, + { + "id": "apple/metal-host-device-synchronization-checklist", + "name": "metal-host-device-synchronization-checklist", + "score": 87.00808259041237 + }, + { + "id": "apple/metal-pytorch-custom-op-host-patterns", + "name": "metal-pytorch-custom-op-host-patterns", + "score": 71.01670881286532 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 66.13947891294225 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 63.30606693686744 + } + ] + }, + { + "id": "metal-threadgroup-sizing", + "query": "metal threadExecutionWidth maxTotalThreadsPerThreadgroup threadgroup sizing", + "passed": true, + "top_ids": [ + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-memory-and-threadgroup", + "apple/metal-kernel-basics", + "apple/metal-reduction-patterns", + "apple/metal-convolution-tiling-playbook" + ], + "top_items": [ + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 128.21645678208895 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 75.96656405137797 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 44.74356414166591 + }, + { + "id": 
"apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 41.78311004032571 + }, + { + "id": "apple/metal-convolution-tiling-playbook", + "name": "metal-convolution-tiling-playbook", + "score": 39.64145803058623 + } + ] + }, + { + "id": "metal-resource-binding", + "query": "metal resource binding useResource useHeap buffer index", + "passed": true, + "top_ids": [ + "apple/metal-resource-binding-checklist", + "apple/metal-argument-buffers-and-residency", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-texture-vs-buffer-path-selection" + ], + "top_items": [ + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 143.4289522065164 + }, + { + "id": "apple/metal-argument-buffers-and-residency", + "name": "metal-argument-buffers-and-residency", + "score": 81.36249271181883 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 70.0484385243684 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 63.58700873679091 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 60.70840833316848 + } + ] + }, + { + "id": "metal-tiled-matmul", + "query": "metal tiled matmul threadgroup memory gemm", + "passed": true, + "top_ids": [ + "apple/metal-tiled-matmul-patterns", + "apple/metal-memory-and-threadgroup", + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-memory-pressure-checklist", + "apple/metal-prefetch-and-reuse-heuristics" + ], + "top_items": [ + { + "id": "apple/metal-tiled-matmul-patterns", + "name": "metal-tiled-matmul-patterns", + "score": 163.6543176262664 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 114.11877190134864 + }, + { + "id": 
"apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 65.93255891081415 + }, + { + "id": "apple/metal-memory-pressure-checklist", + "name": "metal-memory-pressure-checklist", + "score": 59.94297701986353 + }, + { + "id": "apple/metal-prefetch-and-reuse-heuristics", + "name": "metal-prefetch-and-reuse-heuristics", + "score": 48.79749186715945 + } + ] + }, + { + "id": "metal-reduction", + "query": "metal reduction threadgroup accumulation barrier", + "passed": true, + "top_ids": [ + "apple/metal-reduction-patterns", + "apple/metal-memory-and-threadgroup", + "apple/metal-segmented-reduction-patterns", + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-simdgroup-patterns" + ], + "top_items": [ + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 111.92728244075192 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 88.5962118737391 + }, + { + "id": "apple/metal-segmented-reduction-patterns", + "name": "metal-segmented-reduction-patterns", + "score": 78.16171806593684 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 65.93255891081415 + }, + { + "id": "apple/metal-simdgroup-patterns", + "name": "metal-simdgroup-patterns", + "score": 45.025485787684175 + } + ] + }, + { + "id": "metal-image-2d", + "query": "metal texture 2d kernel thread_position_in_grid read_write texture", + "passed": true, + "top_ids": [ + "apple/metal-image-and-2d-kernel-patterns", + "apple/metal-texture-vs-buffer-path-selection", + "apple/metal-kernel-basics", + "apple/metal-edge-tile-and-bounds-check-playbook", + "apple/metal-resource-binding-checklist" + ], + "top_items": [ + { + "id": "apple/metal-image-and-2d-kernel-patterns", + "name": "metal-image-and-2d-kernel-patterns", + "score": 169.86614142384167 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": 
"metal-texture-vs-buffer-path-selection", + "score": 101.18397124049206 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 94.97890809241845 + }, + { + "id": "apple/metal-edge-tile-and-bounds-check-playbook", + "name": "metal-edge-tile-and-bounds-check-playbook", + "score": 59.009726275496575 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 52.53746932417654 + } + ] + }, + { + "id": "metal-command-buffer-batching", + "query": "metal command buffer batching indirect command buffer persistent objects", + "passed": true, + "top_ids": [ + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-persistent-objects-and-submission-overhead", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-texture-vs-buffer-path-selection", + "apple/metal-gpu-driven-work-generation-patterns" + ], + "top_items": [ + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 264.1869058051941 + }, + { + "id": "apple/metal-persistent-objects-and-submission-overhead", + "name": "metal-persistent-objects-and-submission-overhead", + "score": 139.10586514378645 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 105.51978885520943 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 86.85473513749503 + }, + { + "id": "apple/metal-gpu-driven-work-generation-patterns", + "name": "metal-gpu-driven-work-generation-patterns", + "score": 75.04030126091419 + } + ] + }, + { + "id": "metal-library-compilation", + "query": "metal metallib pipeline compilation runtime loading", + "passed": true, + "top_ids": [ + "apple/metal-library-and-pipeline-compilation", + "apple/metal-double-buffered-pipeline-patterns", + "apple/metal-multistage-tensor-pipeline-patterns", + 
"apple/metal-compute-launch-patterns", + "apple/metal-host-wrapper-patterns" + ], + "top_items": [ + { + "id": "apple/metal-library-and-pipeline-compilation", + "name": "metal-library-and-pipeline-compilation", + "score": 143.01523554813485 + }, + { + "id": "apple/metal-double-buffered-pipeline-patterns", + "name": "metal-double-buffered-pipeline-patterns", + "score": 55.69176722258639 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 54.32008190548861 + }, + { + "id": "apple/metal-compute-launch-patterns", + "name": "metal-compute-launch-patterns", + "score": 43.97926973348613 + }, + { + "id": "apple/metal-host-wrapper-patterns", + "name": "metal-host-wrapper-patterns", + "score": 41.061402187483125 + } + ] + }, + { + "id": "metal-texture-vs-buffer", + "query": "metal texture vs buffer resource selection image linear memory", + "passed": true, + "top_ids": [ + "apple/metal-texture-vs-buffer-path-selection", + "apple/metal-resource-binding-checklist", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-image-and-2d-kernel-patterns", + "apple/metal-memory-and-threadgroup" + ], + "top_items": [ + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 183.40123218511377 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 82.98829603911443 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 75.40613088250826 + }, + { + "id": "apple/metal-image-and-2d-kernel-patterns", + "name": "metal-image-and-2d-kernel-patterns", + "score": 67.69608112600764 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 67.1056435483948 + } + ] + }, + { + "id": "metal-edge-bounds", + "query": "metal edge tile bounds check rounded dispatch partial tile", + 
"passed": true, + "top_ids": [ + "apple/metal-edge-tile-and-bounds-check-playbook", + "apple/metal-transpose-and-layout-reorder-patterns", + "apple/metal-ragged-tensors-and-masked-kernels", + "apple/metal-kernel-debugging-checklist", + "apple/metal-tiled-matmul-patterns" + ], + "top_items": [ + { + "id": "apple/metal-edge-tile-and-bounds-check-playbook", + "name": "metal-edge-tile-and-bounds-check-playbook", + "score": 242.22676875164458 + }, + { + "id": "apple/metal-transpose-and-layout-reorder-patterns", + "name": "metal-transpose-and-layout-reorder-patterns", + "score": 54.86959411549956 + }, + { + "id": "apple/metal-ragged-tensors-and-masked-kernels", + "name": "metal-ragged-tensors-and-masked-kernels", + "score": 51.15890023987147 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 40.376813084387415 + }, + { + "id": "apple/metal-tiled-matmul-patterns", + "name": "metal-tiled-matmul-patterns", + "score": 37.42948073600262 + } + ] + }, + { + "id": "metal-validation-profiling", + "query": "metal validation profiling workflow debugger instruments system trace", + "passed": true, + "top_ids": [ + "apple/metal-validation-and-profiling-workflow", + "apple/metal-performance-tuning", + "apple/metal-kernel-debugging-checklist", + "apple/metal-resource-binding-checklist", + "apple/metal-kernel-basics" + ], + "top_items": [ + { + "id": "apple/metal-validation-and-profiling-workflow", + "name": "metal-validation-and-profiling-workflow", + "score": 174.4107461569549 + }, + { + "id": "apple/metal-performance-tuning", + "name": "metal-performance-tuning", + "score": 52.0949683296992 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 45.74568829840129 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 34.912452202453196 + }, + { + "id": "apple/metal-kernel-basics", + "name": 
"metal-kernel-basics", + "score": 33.29242661478774 + } + ] + }, + { + "id": "metal-argument-buffers", + "query": "metal argument buffers residency useresource indirect resources", + "passed": true, + "top_ids": [ + "apple/metal-argument-buffers-and-residency", + "apple/metal-gather-scatter-and-indirect-access-patterns", + "apple/metal-gpu-driven-work-generation-patterns", + "apple/metal-resource-binding-checklist", + "apple/metal-multistage-tensor-pipeline-patterns" + ], + "top_items": [ + { + "id": "apple/metal-argument-buffers-and-residency", + "name": "metal-argument-buffers-and-residency", + "score": 178.4525473483073 + }, + { + "id": "apple/metal-gather-scatter-and-indirect-access-patterns", + "name": "metal-gather-scatter-and-indirect-access-patterns", + "score": 75.97243491230272 + }, + { + "id": "apple/metal-gpu-driven-work-generation-patterns", + "name": "metal-gpu-driven-work-generation-patterns", + "score": 66.3899838497021 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 55.33343614673806 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 37.68469841245517 + } + ] + }, + { + "id": "metal-heaps-events", + "query": "metal heaps fences events temporary resources multistage pipeline", + "passed": true, + "top_ids": [ + "apple/metal-heaps-fences-and-events", + "apple/metal-multistage-tensor-pipeline-patterns", + "apple/metal-library-and-pipeline-compilation", + "apple/metal-double-buffered-pipeline-patterns", + "apple/metal-performance-tuning" + ], + "top_items": [ + { + "id": "apple/metal-heaps-fences-and-events", + "name": "metal-heaps-fences-and-events", + "score": 168.7816019804878 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 110.2710922059951 + }, + { + "id": "apple/metal-library-and-pipeline-compilation", + "name": 
"metal-library-and-pipeline-compilation", + "score": 62.52504358444493 + }, + { + "id": "apple/metal-double-buffered-pipeline-patterns", + "name": "metal-double-buffered-pipeline-patterns", + "score": 58.790096121034786 + }, + { + "id": "apple/metal-performance-tuning", + "name": "metal-performance-tuning", + "score": 40.033514413789916 + } + ] + }, + { + "id": "metal-convolution-stencil", + "query": "metal convolution stencil image filter neighborhood texture compute", + "passed": true, + "top_ids": [ + "apple/metal-convolution-and-stencil-patterns", + "apple/metal-convolution-tiling-playbook", + "apple/metal-texture-vs-buffer-path-selection", + "apple/metal-image-and-2d-kernel-patterns", + "apple/metal-compute-launch-patterns" + ], + "top_items": [ + { + "id": "apple/metal-convolution-and-stencil-patterns", + "name": "metal-convolution-and-stencil-patterns", + "score": 175.3338993233565 + }, + { + "id": "apple/metal-convolution-tiling-playbook", + "name": "metal-convolution-tiling-playbook", + "score": 116.38397646191258 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 73.18644075778882 + }, + { + "id": "apple/metal-image-and-2d-kernel-patterns", + "name": "metal-image-and-2d-kernel-patterns", + "score": 72.57713017691336 + }, + { + "id": "apple/metal-compute-launch-patterns", + "name": "metal-compute-launch-patterns", + "score": 55.36946487822595 + } + ] + }, + { + "id": "metal-prefix-scan", + "query": "metal prefix scan exclusive inclusive scan threadgroup", + "passed": true, + "top_ids": [ + "apple/metal-prefix-scan-patterns", + "apple/metal-memory-and-threadgroup", + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-segmented-reduction-patterns", + "apple/metal-kernel-basics" + ], + "top_items": [ + { + "id": "apple/metal-prefix-scan-patterns", + "name": "metal-prefix-scan-patterns", + "score": 235.96894388239755 + }, + { + "id": "apple/metal-memory-and-threadgroup", + 
"name": "metal-memory-and-threadgroup", + "score": 75.96656405137797 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 65.93255891081415 + }, + { + "id": "apple/metal-segmented-reduction-patterns", + "name": "metal-segmented-reduction-patterns", + "score": 55.48851747296195 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 44.74356414166591 + } + ] + }, + { + "id": "metal-transpose-layout", + "query": "metal transpose layout reorder tile threadgroup", + "passed": true, + "top_ids": [ + "apple/metal-transpose-and-layout-reorder-patterns", + "apple/metal-transpose-free-layout-choice-playbook", + "apple/metal-memory-and-threadgroup", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-threadgroup-sizing-playbook" + ], + "top_items": [ + { + "id": "apple/metal-transpose-and-layout-reorder-patterns", + "name": "metal-transpose-and-layout-reorder-patterns", + "score": 177.6252728378576 + }, + { + "id": "apple/metal-transpose-free-layout-choice-playbook", + "name": "metal-transpose-free-layout-choice-playbook", + "score": 114.30339472753842 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 75.96656405137797 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 66.38165771657762 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 65.93255891081415 + } + ] + }, + { + "id": "metal-histogram-binning", + "query": "metal histogram binning atomic threadgroup local bins", + "passed": true, + "top_ids": [ + "apple/metal-histogram-and-binning-patterns", + "apple/metal-memory-and-threadgroup", + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-kernel-basics", + "apple/metal-reduction-patterns" + ], + "top_items": [ + { + "id": "apple/metal-histogram-and-binning-patterns", + 
"name": "metal-histogram-and-binning-patterns", + "score": 155.40066724934215 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 79.8437752658241 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 65.93255891081415 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 44.74356414166591 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 41.78311004032571 + } + ] + }, + { + "id": "metal-gather-scatter", + "query": "metal gather scatter indirect access argument buffers index table", + "passed": true, + "top_ids": [ + "apple/metal-gather-scatter-and-indirect-access-patterns", + "apple/metal-argument-buffers-and-residency", + "apple/metal-scatter-conflict-resolution-patterns", + "apple/metal-gpu-driven-work-generation-patterns", + "apple/metal-resource-binding-checklist" + ], + "top_items": [ + { + "id": "apple/metal-gather-scatter-and-indirect-access-patterns", + "name": "metal-gather-scatter-and-indirect-access-patterns", + "score": 194.92228704485223 + }, + { + "id": "apple/metal-argument-buffers-and-residency", + "name": "metal-argument-buffers-and-residency", + "score": 121.1849558048 + }, + { + "id": "apple/metal-scatter-conflict-resolution-patterns", + "name": "metal-scatter-conflict-resolution-patterns", + "score": 84.13158489347497 + }, + { + "id": "apple/metal-gpu-driven-work-generation-patterns", + "name": "metal-gpu-driven-work-generation-patterns", + "score": 61.566863377073304 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 48.45453331309322 + } + ] + }, + { + "id": "metal-convolution-tiling", + "query": "metal convolution tiling halo threadgroup separable", + "passed": true, + "top_ids": [ + "apple/metal-convolution-tiling-playbook", + "apple/metal-convolution-and-stencil-patterns", 
+ "apple/metal-threadgroup-sizing-playbook", + "apple/metal-memory-and-threadgroup", + "apple/metal-tiled-matmul-patterns" + ], + "top_items": [ + { + "id": "apple/metal-convolution-tiling-playbook", + "name": "metal-convolution-tiling-playbook", + "score": 171.88096146075546 + }, + { + "id": "apple/metal-convolution-and-stencil-patterns", + "name": "metal-convolution-and-stencil-patterns", + "score": 82.04077409804944 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 77.03069881974126 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 75.96656405137797 + }, + { + "id": "apple/metal-tiled-matmul-patterns", + "name": "metal-tiled-matmul-patterns", + "score": 48.03254055094126 + } + ] + }, + { + "id": "metal-persistent-objects", + "query": "metal persistent objects command buffer queue pipeline reuse submission overhead", + "passed": true, + "top_ids": [ + "apple/metal-persistent-objects-and-submission-overhead", + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-prefetch-and-reuse-heuristics", + "apple/metal-multistage-tensor-pipeline-patterns", + "apple/metal-library-and-pipeline-compilation" + ], + "top_items": [ + { + "id": "apple/metal-persistent-objects-and-submission-overhead", + "name": "metal-persistent-objects-and-submission-overhead", + "score": 218.43281057582067 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 181.6451750233534 + }, + { + "id": "apple/metal-prefetch-and-reuse-heuristics", + "name": "metal-prefetch-and-reuse-heuristics", + "score": 69.2617883114066 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 68.70864987858005 + }, + { + "id": "apple/metal-library-and-pipeline-compilation", + "name": "metal-library-and-pipeline-compilation", + 
"score": 67.87858234963244 + } + ] + }, + { + "id": "metal-gpu-driven-work", + "query": "metal gpu driven work generation indirect command buffer indirect dispatch", + "passed": true, + "top_ids": [ + "apple/metal-gpu-driven-work-generation-patterns", + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-gather-scatter-and-indirect-access-patterns", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-texture-vs-buffer-path-selection" + ], + "top_items": [ + { + "id": "apple/metal-gpu-driven-work-generation-patterns", + "name": "metal-gpu-driven-work-generation-patterns", + "score": 204.25861611725966 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 127.90806975745879 + }, + { + "id": "apple/metal-gather-scatter-and-indirect-access-patterns", + "name": "metal-gather-scatter-and-indirect-access-patterns", + "score": 99.80158872946285 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 66.38165771657762 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 55.09550804658624 + } + ] + }, + { + "id": "metal-multistage-pipeline", + "query": "metal multistage tensor pipeline intermediate buffers stages", + "passed": true, + "top_ids": [ + "apple/metal-multistage-tensor-pipeline-patterns", + "apple/metal-library-and-pipeline-compilation", + "apple/metal-argument-buffers-and-residency", + "apple/metal-tensor-packing-and-unpacking-patterns", + "apple/metal-double-buffered-pipeline-patterns" + ], + "top_items": [ + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 152.2795976282822 + }, + { + "id": "apple/metal-library-and-pipeline-compilation", + "name": "metal-library-and-pipeline-compilation", + "score": 62.52504358444493 + }, + { + "id": 
"apple/metal-argument-buffers-and-residency", + "name": "metal-argument-buffers-and-residency", + "score": 62.036536914105525 + }, + { + "id": "apple/metal-tensor-packing-and-unpacking-patterns", + "name": "metal-tensor-packing-and-unpacking-patterns", + "score": 55.998142853971345 + }, + { + "id": "apple/metal-double-buffered-pipeline-patterns", + "name": "metal-double-buffered-pipeline-patterns", + "score": 55.69176722258639 + } + ] + }, + { + "id": "metal-numerical-drift", + "query": "metal numerical drift precision debugging half float accumulation", + "passed": true, + "top_ids": [ + "apple/metal-numerical-drift-debugging-checklist", + "apple/metal-kernel-debugging-checklist", + "apple/metal-silent-nan-inf-debugging-checklist", + "apple/metal-softmax-and-logsumexp-stability-patterns", + "apple/metal-reduction-patterns" + ], + "top_items": [ + { + "id": "apple/metal-numerical-drift-debugging-checklist", + "name": "metal-numerical-drift-debugging-checklist", + "score": 165.9583497146746 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 64.60023602910067 + }, + { + "id": "apple/metal-silent-nan-inf-debugging-checklist", + "name": "metal-silent-nan-inf-debugging-checklist", + "score": 55.58927323671709 + }, + { + "id": "apple/metal-softmax-and-logsumexp-stability-patterns", + "name": "metal-softmax-and-logsumexp-stability-patterns", + "score": 54.89007295097145 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 44.80032659455576 + } + ] + }, + { + "id": "metal-segmented-reduction", + "query": "metal segmented reduction grouped irregular segments carry", + "passed": true, + "top_ids": [ + "apple/metal-segmented-reduction-patterns", + "apple/metal-reduction-patterns", + "apple/metal-simdgroup-patterns", + "apple/metal-gather-scatter-and-indirect-access-patterns", + "apple/metal-softmax-and-logsumexp-stability-patterns" + ], + "top_items": [ + { + "id": 
"apple/metal-segmented-reduction-patterns", + "name": "metal-segmented-reduction-patterns", + "score": 583.9550520495274 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 75.17852744676036 + }, + { + "id": "apple/metal-simdgroup-patterns", + "name": "metal-simdgroup-patterns", + "score": 37.3588272947278 + }, + { + "id": "apple/metal-gather-scatter-and-indirect-access-patterns", + "name": "metal-gather-scatter-and-indirect-access-patterns", + "score": 36.92825443466229 + }, + { + "id": "apple/metal-softmax-and-logsumexp-stability-patterns", + "name": "metal-softmax-and-logsumexp-stability-patterns", + "score": 33.9613402852602 + } + ] + }, + { + "id": "metal-ragged-masked", + "query": "metal ragged tensor masked kernel variable length bounds check", + "passed": true, + "top_ids": [ + "apple/metal-ragged-tensors-and-masked-kernels", + "apple/metal-edge-tile-and-bounds-check-playbook", + "apple/metal-broadcast-kernel-patterns", + "apple/metal-kernel-basics", + "apple/metal-multistage-tensor-pipeline-patterns" + ], + "top_items": [ + { + "id": "apple/metal-ragged-tensors-and-masked-kernels", + "name": "metal-ragged-tensors-and-masked-kernels", + "score": 185.42135356247962 + }, + { + "id": "apple/metal-edge-tile-and-bounds-check-playbook", + "name": "metal-edge-tile-and-bounds-check-playbook", + "score": 99.79725202558359 + }, + { + "id": "apple/metal-broadcast-kernel-patterns", + "name": "metal-broadcast-kernel-patterns", + "score": 70.18290277216843 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 60.64701989263922 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 56.205395844574234 + } + ] + }, + { + "id": "metal-streaming-online", + "query": "metal streaming online kernel chunked rolling buffer latency", + "passed": true, + "top_ids": [ + "apple/metal-streaming-and-online-kernel-patterns", + 
"apple/metal-buffer-layout-and-alignment", + "apple/metal-command-buffer-reuse-and-batching", + "apple/metal-kernel-basics", + "apple/metal-texture-vs-buffer-path-selection" + ], + "top_items": [ + { + "id": "apple/metal-streaming-and-online-kernel-patterns", + "name": "metal-streaming-and-online-kernel-patterns", + "score": 166.74575345933403 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 69.45314872256657 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 63.58700873679091 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 60.64701989263922 + }, + { + "id": "apple/metal-texture-vs-buffer-path-selection", + "name": "metal-texture-vs-buffer-path-selection", + "score": 59.79717243599304 + } + ] + }, + { + "id": "metal-memory-pressure", + "query": "metal memory pressure checklist heaps storage mode transient allocation", + "passed": true, + "top_ids": [ + "apple/metal-memory-pressure-checklist", + "apple/metal-heaps-fences-and-events", + "apple/metal-memory-and-threadgroup", + "apple/metal-kernel-debugging-checklist", + "apple/metal-resource-binding-checklist" + ], + "top_items": [ + { + "id": "apple/metal-memory-pressure-checklist", + "name": "metal-memory-pressure-checklist", + "score": 177.019950326042 + }, + { + "id": "apple/metal-heaps-fences-and-events", + "name": "metal-heaps-fences-and-events", + "score": 67.93944814341806 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 67.1056435483948 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 58.353436539648804 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 57.09735715441359 + } + ] + }, + { + "id": "metal-double-buffered-pipeline", + 
"query": "metal double buffered pipeline overlap producer consumer alternating buffers", + "passed": true, + "top_ids": [ + "apple/metal-double-buffered-pipeline-patterns", + "apple/metal-producer-consumer-staging-playbook", + "apple/metal-multistage-tensor-pipeline-patterns", + "apple/metal-library-and-pipeline-compilation", + "apple/metal-argument-buffers-and-residency" + ], + "top_items": [ + { + "id": "apple/metal-double-buffered-pipeline-patterns", + "name": "metal-double-buffered-pipeline-patterns", + "score": 191.46930042951595 + }, + { + "id": "apple/metal-producer-consumer-staging-playbook", + "name": "metal-producer-consumer-staging-playbook", + "score": 110.1379674994217 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 67.10378411998875 + }, + { + "id": "apple/metal-library-and-pipeline-compilation", + "name": "metal-library-and-pipeline-compilation", + "score": 62.52504358444493 + }, + { + "id": "apple/metal-argument-buffers-and-residency", + "name": "metal-argument-buffers-and-residency", + "score": 62.036536914105525 + } + ] + }, + { + "id": "metal-producer-consumer-staging", + "query": "metal producer consumer staging handoff intermediate resource synchronization", + "passed": true, + "top_ids": [ + "apple/metal-producer-consumer-staging-playbook", + "apple/metal-double-buffered-pipeline-patterns", + "apple/metal-host-device-synchronization-checklist", + "apple/metal-resource-binding-checklist", + "apple/metal-multistage-tensor-pipeline-patterns" + ], + "top_items": [ + { + "id": "apple/metal-producer-consumer-staging-playbook", + "name": "metal-producer-consumer-staging-playbook", + "score": 185.5695519160555 + }, + { + "id": "apple/metal-double-buffered-pipeline-patterns", + "name": "metal-double-buffered-pipeline-patterns", + "score": 74.0156541652276 + }, + { + "id": "apple/metal-host-device-synchronization-checklist", + "name": 
"metal-host-device-synchronization-checklist", + "score": 64.27858563021897 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 54.788253017723825 + }, + { + "id": "apple/metal-multistage-tensor-pipeline-patterns", + "name": "metal-multistage-tensor-pipeline-patterns", + "score": 47.04653752759393 + } + ] + }, + { + "id": "metal-scatter-conflict", + "query": "metal scatter conflict resolution atomics staged merge duplicate destinations", + "passed": true, + "top_ids": [ + "apple/metal-scatter-conflict-resolution-patterns", + "apple/metal-gather-scatter-and-indirect-access-patterns", + "apple/metal-histogram-and-binning-patterns", + "apple/metal-validation-and-profiling-workflow", + "apple/metal-reduction-patterns" + ], + "top_items": [ + { + "id": "apple/metal-scatter-conflict-resolution-patterns", + "name": "metal-scatter-conflict-resolution-patterns", + "score": 185.19508402979073 + }, + { + "id": "apple/metal-gather-scatter-and-indirect-access-patterns", + "name": "metal-gather-scatter-and-indirect-access-patterns", + "score": 63.467072041997625 + }, + { + "id": "apple/metal-histogram-and-binning-patterns", + "name": "metal-histogram-and-binning-patterns", + "score": 37.720360318710746 + }, + { + "id": "apple/metal-validation-and-profiling-workflow", + "name": "metal-validation-and-profiling-workflow", + "score": 37.05637887256782 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 34.89058388143134 + } + ] + }, + { + "id": "metal-kernel-fusion", + "query": "metal kernel fusion tradeoff bandwidth occupancy intermediate resources", + "passed": true, + "top_ids": [ + "apple/metal-kernel-fusion-tradeoff-checklist", + "apple/metal-kernel-basics", + "apple/metal-kernel-debugging-checklist", + "apple/metal-broadcast-kernel-patterns", + "apple/metal-producer-consumer-staging-playbook" + ], + "top_items": [ + { + "id": 
"apple/metal-kernel-fusion-tradeoff-checklist", + "name": "metal-kernel-fusion-tradeoff-checklist", + "score": 186.86371755264193 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 60.64701989263922 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 46.808464141925434 + }, + { + "id": "apple/metal-broadcast-kernel-patterns", + "name": "metal-broadcast-kernel-patterns", + "score": 46.09385744779122 + }, + { + "id": "apple/metal-producer-consumer-staging-playbook", + "name": "metal-producer-consumer-staging-playbook", + "score": 42.753400012758156 + } + ] + }, + { + "id": "metal-tensor-packing", + "query": "metal tensor packing unpacking layout transform contiguous staging", + "passed": true, + "top_ids": [ + "apple/metal-tensor-packing-and-unpacking-patterns", + "apple/metal-transpose-free-layout-choice-playbook", + "apple/metal-transpose-and-layout-reorder-patterns", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-producer-consumer-staging-playbook" + ], + "top_items": [ + { + "id": "apple/metal-tensor-packing-and-unpacking-patterns", + "name": "metal-tensor-packing-and-unpacking-patterns", + "score": 177.8135478780065 + }, + { + "id": "apple/metal-transpose-free-layout-choice-playbook", + "name": "metal-transpose-free-layout-choice-playbook", + "score": 72.01651516569339 + }, + { + "id": "apple/metal-transpose-and-layout-reorder-patterns", + "name": "metal-transpose-and-layout-reorder-patterns", + "score": 68.838954982908 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 66.38165771657762 + }, + { + "id": "apple/metal-producer-consumer-staging-playbook", + "name": "metal-producer-consumer-staging-playbook", + "score": 61.40919955630339 + } + ] + }, + { + "id": "metal-strided-views", + "query": "metal strided views subtensor slice offset math noncontiguous", + "passed": true, + 
"top_ids": [ + "apple/metal-strided-views-and-subtensor-access-patterns", + "apple/metal-kernel-basics", + "apple/metal-validation-and-profiling-workflow", + "apple/metal-reduction-patterns", + "apple/metal-numerical-drift-debugging-checklist" + ], + "top_items": [ + { + "id": "apple/metal-strided-views-and-subtensor-access-patterns", + "name": "metal-strided-views-and-subtensor-access-patterns", + "score": 201.22460104223939 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 33.29242661478774 + }, + { + "id": "apple/metal-validation-and-profiling-workflow", + "name": "metal-validation-and-profiling-workflow", + "score": 31.512843247131073 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 29.53289152329146 + }, + { + "id": "apple/metal-numerical-drift-debugging-checklist", + "name": "metal-numerical-drift-debugging-checklist", + "score": 29.449220853380268 + } + ] + }, + { + "id": "metal-broadcast-kernels", + "query": "metal broadcast kernel shape alignment scalar vector expansion", + "passed": true, + "top_ids": [ + "apple/metal-broadcast-kernel-patterns", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-kernel-basics", + "apple/metal-kernel-fusion-tradeoff-checklist", + "apple/metal-kernel-debugging-checklist" + ], + "top_items": [ + { + "id": "apple/metal-broadcast-kernel-patterns", + "name": "metal-broadcast-kernel-patterns", + "score": 146.28247047446843 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 75.38536431450298 + }, + { + "id": "apple/metal-kernel-basics", + "name": "metal-kernel-basics", + "score": 60.64701989263922 + }, + { + "id": "apple/metal-kernel-fusion-tradeoff-checklist", + "name": "metal-kernel-fusion-tradeoff-checklist", + "score": 47.58018896338181 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 
46.808464141925434 + } + ] + }, + { + "id": "metal-host-device-sync", + "query": "metal host device synchronization completion resource lifetime wrapper", + "passed": true, + "top_ids": [ + "apple/metal-host-device-synchronization-checklist", + "apple/metal-host-wrapper-patterns", + "apple/metal-pytorch-custom-op-host-patterns", + "apple/metal-resource-binding-checklist", + "apple/metal-memory-and-threadgroup" + ], + "top_items": [ + { + "id": "apple/metal-host-device-synchronization-checklist", + "name": "metal-host-device-synchronization-checklist", + "score": 180.54403556794583 + }, + { + "id": "apple/metal-host-wrapper-patterns", + "name": "metal-host-wrapper-patterns", + "score": 111.34495763583598 + }, + { + "id": "apple/metal-pytorch-custom-op-host-patterns", + "name": "metal-pytorch-custom-op-host-patterns", + "score": 71.01670881286532 + }, + { + "id": "apple/metal-resource-binding-checklist", + "name": "metal-resource-binding-checklist", + "score": 54.788253017723825 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": "metal-memory-and-threadgroup", + "score": 46.6496098844782 + } + ] + }, + { + "id": "metal-softmax-logsumexp", + "query": "metal softmax logsumexp stability max subtraction precision", + "passed": true, + "top_ids": [ + "apple/metal-softmax-and-logsumexp-stability-patterns", + "apple/metal-reduction-patterns", + "apple/metal-numerical-drift-debugging-checklist", + "apple/metal-kernel-basics", + "apple/metal-validation-and-profiling-workflow" + ], + "top_items": [ + { + "id": "apple/metal-softmax-and-logsumexp-stability-patterns", + "name": "metal-softmax-and-logsumexp-stability-patterns", + "score": 220.2557695796513 + }, + { + "id": "apple/metal-reduction-patterns", + "name": "metal-reduction-patterns", + "score": 45.44838119089637 + }, + { + "id": "apple/metal-numerical-drift-debugging-checklist", + "name": "metal-numerical-drift-debugging-checklist", + "score": 38.27141495322089 + }, + { + "id": "apple/metal-kernel-basics", 
+ "name": "metal-kernel-basics", + "score": 33.29242661478774 + }, + { + "id": "apple/metal-validation-and-profiling-workflow", + "name": "metal-validation-and-profiling-workflow", + "score": 31.512843247131073 + } + ] + }, + { + "id": "metal-nan-inf-debugging", + "query": "metal silent nan inf debugging overflow first bad intermediate", + "passed": true, + "top_ids": [ + "apple/metal-silent-nan-inf-debugging-checklist", + "apple/metal-numerical-drift-debugging-checklist", + "apple/metal-kernel-debugging-checklist", + "apple/metal-kernel-fusion-tradeoff-checklist", + "apple/metal-producer-consumer-staging-playbook" + ], + "top_items": [ + { + "id": "apple/metal-silent-nan-inf-debugging-checklist", + "name": "metal-silent-nan-inf-debugging-checklist", + "score": 225.44914220830498 + }, + { + "id": "apple/metal-numerical-drift-debugging-checklist", + "name": "metal-numerical-drift-debugging-checklist", + "score": 67.19078103291659 + }, + { + "id": "apple/metal-kernel-debugging-checklist", + "name": "metal-kernel-debugging-checklist", + "score": 64.60023602910067 + }, + { + "id": "apple/metal-kernel-fusion-tradeoff-checklist", + "name": "metal-kernel-fusion-tradeoff-checklist", + "score": 43.2622700841287 + }, + { + "id": "apple/metal-producer-consumer-staging-playbook", + "name": "metal-producer-consumer-staging-playbook", + "score": 39.65507111430976 + } + ] + }, + { + "id": "metal-prefetch-reuse", + "query": "metal prefetch reuse heuristics threadgroup memory locality", + "passed": true, + "top_ids": [ + "apple/metal-prefetch-and-reuse-heuristics", + "apple/metal-memory-and-threadgroup", + "apple/metal-threadgroup-sizing-playbook", + "apple/metal-memory-pressure-checklist", + "apple/metal-command-buffer-reuse-and-batching" + ], + "top_items": [ + { + "id": "apple/metal-prefetch-and-reuse-heuristics", + "name": "metal-prefetch-and-reuse-heuristics", + "score": 207.27893996788106 + }, + { + "id": "apple/metal-memory-and-threadgroup", + "name": 
"metal-memory-and-threadgroup", + "score": 114.11877190134864 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 71.29025126895404 + }, + { + "id": "apple/metal-memory-pressure-checklist", + "name": "metal-memory-pressure-checklist", + "score": 64.68561904857324 + }, + { + "id": "apple/metal-command-buffer-reuse-and-batching", + "name": "metal-command-buffer-reuse-and-batching", + "score": 61.796775193413495 + } + ] + }, + { + "id": "metal-transpose-free-layout", + "query": "metal transpose free layout choice playbook packing reorder avoidance", + "passed": true, + "top_ids": [ + "apple/metal-transpose-free-layout-choice-playbook", + "apple/metal-transpose-and-layout-reorder-patterns", + "apple/metal-tensor-packing-and-unpacking-patterns", + "apple/metal-buffer-layout-and-alignment", + "apple/metal-threadgroup-sizing-playbook" + ], + "top_items": [ + { + "id": "apple/metal-transpose-free-layout-choice-playbook", + "name": "metal-transpose-free-layout-choice-playbook", + "score": 776.4597362998956 + }, + { + "id": "apple/metal-transpose-and-layout-reorder-patterns", + "name": "metal-transpose-and-layout-reorder-patterns", + "score": 153.3103504378608 + }, + { + "id": "apple/metal-tensor-packing-and-unpacking-patterns", + "name": "metal-tensor-packing-and-unpacking-patterns", + "score": 72.29749762985034 + }, + { + "id": "apple/metal-buffer-layout-and-alignment", + "name": "metal-buffer-layout-and-alignment", + "score": 66.38165771657762 + }, + { + "id": "apple/metal-threadgroup-sizing-playbook", + "name": "metal-threadgroup-sizing-playbook", + "score": 59.596596128581076 + } + ] + } + ] +} \ No newline at end of file diff --git a/scripts/search_regression_cases.json b/scripts/search_regression_cases.json new file mode 100644 index 00000000..e3a654a1 --- /dev/null +++ b/scripts/search_regression_cases.json @@ -0,0 +1,546 @@ +[ + { + "id": "wmma-how-to-use", + "query": "how to use wmma", + "tags": 
"cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_top1": "cuda/wmma-kernel-patterns", + "expect_any": ["cuda/tensor-cores"] + }, + { + "id": "shared-memory-core", + "query": "shared memory cuda", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/shared-memory" + }, + { + "id": "tensor-core-pipeline", + "query": "tensor core pipeline", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/tensor-core-pipeline-patterns" + }, + { + "id": "cuda-core-checklist", + "query": "cuda core optimization checklist", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/cuda-core-optimization-checklist" + }, + { + "id": "ptx-mbarrier-patterns", + "query": "ptx cp.async mbarrier", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_top1": "cuda/ptx-mbarrier-protocol-patterns", + "expect_any": [ + "cuda/ptx-sync-comm-instructions", + "cuda/ptx-data-movement-instructions", + "cuda/ptx" + ] + }, + { + "id": "wmma-debugging", + "query": "wmma debugging checklist", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "cuda/wmma-debugging-checklist", + "cuda/wmma-kernel-patterns" + ] + }, + { + "id": "warp-primitives", + "query": "warp primitives shuffle ballot syncwarp", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/warp-primitives" + }, + { + "id": "sync-basics", + "query": "cuda synchronization syncthreads syncwarp", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/synchronization" + }, + { + "id": "coalescing", + "query": "cuda memory coalescing global load store", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": ["cuda/coalescing", "cuda/memory-bound-kernel-optimization-playbook"] + }, + { + "id": "occupancy-and-launch-bounds", + "query": "occupancy register pressure launch bounds", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 4, + "expect_any": ["cuda/occupancy", 
"cuda/launch-bounds-and-registers"] + }, + { + "id": "unified-memory", + "query": "cuda unified memory prefetch advise", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/unified-memory" + }, + { + "id": "graphs", + "query": "cuda graphs stream capture", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/cuda-graphs" + }, + { + "id": "numerics", + "query": "cuda numerics precision tf32 fp16 bf16", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 4, + "expect_any": ["cuda/numerics-and-precision", "cuda/tensor-core-numerical-validation"] + }, + { + "id": "bottleneck-workflow", + "query": "kernel bottleneck diagnosis workflow", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/kernel-bottleneck-diagnosis-workflow" + }, + { + "id": "production-readiness", + "query": "cuda production readiness checklist", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/production-readiness-checklist" + }, + { + "id": "ptx-wgmma", + "query": "ptx wgmma commit wait fence", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": ["cuda/ptx-wgmma-instructions", "cuda/tensor-core-pipeline-patterns"] + }, + { + "id": "ptx-atomics", + "query": "ptx atom cas red redux", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "cuda/ptx-atomic-and-reduction-patterns", + "cuda/ptx-sync-comm-instructions" + ] + }, + { + "id": "ptx-integer-bitops", + "query": "ptx integer bit manipulation lop3 bfe bfi", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "cuda/ptx-integer-bit-manipulation-patterns", + "cuda/ptx-integer-instructions" + ] + }, + { + "id": "metal-kernel-basics", + "query": "metal kernel thread_position_in_grid threadgroup", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-kernel-basics" + }, + { + "id": "metal-compute-launch", + "query": "metal compute pipeline dispatchthreads 
threadsperthreadgroup", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-compute-launch-patterns" + }, + { + "id": "metal-debugging", + "query": "metal kernel debugging validation resource binding dispatch", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-kernel-debugging-checklist" + }, + { + "id": "metal-buffer-alignment", + "query": "metal buffer alignment bytesperrow texture buffer", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-buffer-layout-and-alignment" + }, + { + "id": "metal-simdgroup", + "query": "metal simdgroup subgroup warp-like", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-simdgroup-patterns" + }, + { + "id": "metal-performance", + "query": "metal performance tuning dispatchthreads command buffer pipeline reuse", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "apple/metal-performance-tuning", + "apple/metal-command-buffer-reuse-and-batching" + ] + }, + { + "id": "pytorch-custom-metal", + "query": "pytorch mps vs custom metal", + "tags": "apple", + "lang": "python", + "limit": 5, + "top_k": 2, + "expect_any": [ + "apple/pytorch-mps-vs-custom-metal", + "apple/metal-pytorch-custom-op-host-patterns" + ] + }, + { + "id": "metal-host-wrapper", + "query": "metal host wrapper pipeline cache command encoder", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-host-wrapper-patterns" + }, + { + "id": "metal-threadgroup-sizing", + "query": "metal threadExecutionWidth maxTotalThreadsPerThreadgroup threadgroup sizing", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-threadgroup-sizing-playbook" + }, + { + "id": "metal-resource-binding", + "query": "metal resource binding useResource useHeap buffer index", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-resource-binding-checklist" + }, + { + "id": 
"metal-tiled-matmul", + "query": "metal tiled matmul threadgroup memory gemm", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-tiled-matmul-patterns" + }, + { + "id": "metal-reduction", + "query": "metal reduction threadgroup accumulation barrier", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-reduction-patterns" + }, + { + "id": "metal-image-2d", + "query": "metal texture 2d kernel thread_position_in_grid read_write texture", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-image-and-2d-kernel-patterns" + }, + { + "id": "metal-command-buffer-batching", + "query": "metal command buffer batching indirect command buffer persistent objects", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-command-buffer-reuse-and-batching" + }, + { + "id": "metal-library-compilation", + "query": "metal metallib pipeline compilation runtime loading", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-library-and-pipeline-compilation" + }, + { + "id": "metal-texture-vs-buffer", + "query": "metal texture vs buffer resource selection image linear memory", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-texture-vs-buffer-path-selection" + }, + { + "id": "metal-edge-bounds", + "query": "metal edge tile bounds check rounded dispatch partial tile", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-edge-tile-and-bounds-check-playbook" + }, + { + "id": "metal-validation-profiling", + "query": "metal validation profiling workflow debugger instruments system trace", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-validation-and-profiling-workflow" + }, + { + "id": "metal-argument-buffers", + "query": "metal argument buffers residency useresource indirect resources", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": 
"apple/metal-argument-buffers-and-residency" + }, + { + "id": "metal-heaps-events", + "query": "metal heaps fences events temporary resources multistage pipeline", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-heaps-fences-and-events" + }, + { + "id": "metal-convolution-stencil", + "query": "metal convolution stencil image filter neighborhood texture compute", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-convolution-and-stencil-patterns" + }, + { + "id": "metal-prefix-scan", + "query": "metal prefix scan exclusive inclusive scan threadgroup", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-prefix-scan-patterns" + }, + { + "id": "metal-transpose-layout", + "query": "metal transpose layout reorder tile threadgroup", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-transpose-and-layout-reorder-patterns" + }, + { + "id": "metal-histogram-binning", + "query": "metal histogram binning atomic threadgroup local bins", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-histogram-and-binning-patterns" + }, + { + "id": "metal-gather-scatter", + "query": "metal gather scatter indirect access argument buffers index table", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-gather-scatter-and-indirect-access-patterns" + }, + { + "id": "metal-convolution-tiling", + "query": "metal convolution tiling halo threadgroup separable", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-convolution-tiling-playbook" + }, + { + "id": "metal-persistent-objects", + "query": "metal persistent objects command buffer queue pipeline reuse submission overhead", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-persistent-objects-and-submission-overhead" + }, + { + "id": "metal-gpu-driven-work", + "query": "metal gpu driven work generation indirect 
command buffer indirect dispatch", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-gpu-driven-work-generation-patterns" + }, + { + "id": "metal-multistage-pipeline", + "query": "metal multistage tensor pipeline intermediate buffers stages", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-multistage-tensor-pipeline-patterns" + }, + { + "id": "metal-numerical-drift", + "query": "metal numerical drift precision debugging half float accumulation", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-numerical-drift-debugging-checklist" + }, + { + "id": "metal-segmented-reduction", + "query": "metal segmented reduction grouped irregular segments carry", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-segmented-reduction-patterns" + }, + { + "id": "metal-ragged-masked", + "query": "metal ragged tensor masked kernel variable length bounds check", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-ragged-tensors-and-masked-kernels" + }, + { + "id": "metal-streaming-online", + "query": "metal streaming online kernel chunked rolling buffer latency", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-streaming-and-online-kernel-patterns" + }, + { + "id": "metal-memory-pressure", + "query": "metal memory pressure checklist heaps storage mode transient allocation", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-memory-pressure-checklist" + }, + { + "id": "metal-double-buffered-pipeline", + "query": "metal double buffered pipeline overlap producer consumer alternating buffers", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-double-buffered-pipeline-patterns" + }, + { + "id": "metal-producer-consumer-staging", + "query": "metal producer consumer staging handoff intermediate resource synchronization", + "tags": "apple", + "lang": "cpp", + 
"limit": 5, + "expect_top1": "apple/metal-producer-consumer-staging-playbook" + }, + { + "id": "metal-scatter-conflict", + "query": "metal scatter conflict resolution atomics staged merge duplicate destinations", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-scatter-conflict-resolution-patterns" + }, + { + "id": "metal-kernel-fusion", + "query": "metal kernel fusion tradeoff bandwidth occupancy intermediate resources", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-kernel-fusion-tradeoff-checklist" + }, + { + "id": "metal-tensor-packing", + "query": "metal tensor packing unpacking layout transform contiguous staging", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-tensor-packing-and-unpacking-patterns" + }, + { + "id": "metal-strided-views", + "query": "metal strided views subtensor slice offset math noncontiguous", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-strided-views-and-subtensor-access-patterns" + }, + { + "id": "metal-broadcast-kernels", + "query": "metal broadcast kernel shape alignment scalar vector expansion", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-broadcast-kernel-patterns" + }, + { + "id": "metal-host-device-sync", + "query": "metal host device synchronization completion resource lifetime wrapper", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-host-device-synchronization-checklist" + }, + { + "id": "metal-softmax-logsumexp", + "query": "metal softmax logsumexp stability max subtraction precision", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-softmax-and-logsumexp-stability-patterns" + }, + { + "id": "metal-nan-inf-debugging", + "query": "metal silent nan inf debugging overflow first bad intermediate", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": 
"apple/metal-silent-nan-inf-debugging-checklist" + }, + { + "id": "metal-prefetch-reuse", + "query": "metal prefetch reuse heuristics threadgroup memory locality", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-prefetch-and-reuse-heuristics" + }, + { + "id": "metal-transpose-free-layout", + "query": "metal transpose free layout choice playbook packing reorder avoidance", + "tags": "apple", + "lang": "cpp", + "limit": 5, + "expect_top1": "apple/metal-transpose-free-layout-choice-playbook" + } +]