diff --git a/content/cuda/docs/async-copy/DOC.md b/content/cuda/docs/async-copy/DOC.md new file mode 100644 index 00000000..354b8cfe --- /dev/null +++ b/content/cuda/docs/async-copy/DOC.md @@ -0,0 +1,117 @@ +--- +name: async-copy +description: "CUDA async copy essentials: cooperative_groups::memcpy_async, cuda::pipeline, wait rules, and the bridge to cp.async/TMA." +metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,async-copy,memcpy_async,cuda::pipeline,cuda::barrier,cp.async,tma,shared-memory" +--- + +# CUDA Async Copy (C++) + +Use this page for the CUDA C++ view of asynchronous copies from global memory to shared memory and the synchronization rules around them. + +## What Problem It Solves + +A conventional copy into shared memory: + +```cpp +shared[idx] = global[idx]; +``` + +typically expands into: + +1. load from global memory into a register +2. store from register into shared memory + +Async copy can avoid that register staging path on supported hardware and can overlap data movement with computation. + +## Main CUDA C++ Entry Points + +Two common interfaces appear in NVIDIA documentation: + +- `cooperative_groups::memcpy_async(...)` +- `cuda::memcpy_async(...)` together with `cuda::pipeline` or `cuda::barrier` + +At a high level, both start an async transfer and require an explicit wait before the data in shared memory is consumed. + +## Fundamental Safety Rule + +After initiating the async copy: + +- do not read the destination shared memory until the corresponding wait completes +- do not modify the source or destination participating region while the transfer is in flight + +Until the wait completes, reading or writing the participating data can create a data race. 
+ +## Cooperative Groups Pattern + +```cpp +namespace cg = cooperative_groups; + +auto block = cg::this_thread_block(); +extern __shared__ float smem[]; + +cg::memcpy_async(block, smem, gmem_ptr, bytes); +cg::wait(block); +``` + +Use `cg::wait(group)` before consuming the copied shared-memory data; it both completes the outstanding copies and synchronizes the group, so no separate `block.sync()` is needed afterward. + +## Pipeline Pattern + +For newer CUDA C++ paths, `cuda::pipeline` can express staged copy/compute overlap. + +The common structure is: + +1. acquire / start pipeline stage +2. issue `cuda::memcpy_async` +3. commit or advance the stage +4. wait for the prior stage +5. compute on the completed shared-memory tile + +This is the higher-level CUDA C++ bridge to lower-level async copy hardware behavior. + +## When Hardware Acceleration Matters + +NVIDIA documents that on compute capability 8.0 and higher, async copies from global to shared memory can benefit from hardware acceleration that avoids an intermediate register path. + +That does not remove the need for: + +- alignment discipline +- correct wait behavior +- sensible shared-memory layout + +## When To Escalate To PTX / TMA + +Stay in CUDA C++ docs when: + +- you are using `memcpy_async` +- you need pipeline-level copy/compute overlap +- you want a supported C++ interface + +Drop to PTX / TMA docs when: + +- you need precise `cp.async` group semantics +- you need bulk async copies or TMA +- you need `mbarrier` or cluster-scope completion behavior + +## Related Topics + +- Shared memory usage: `../shared-memory/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Cooperative Groups: `../cooperative-groups/DOC.md` +- PTX `cp.async`: `../ptx/instructions/data-movement/references/cp-async.md` +- PTX TMA: `../ptx/instructions/tma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, Asynchronous Data Copies: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/04-special-topics/async-copies.html +- CUDA Programming Guide, Cooperative Groups async copy 
examples: https://docs.nvidia.com/cuda/archive/11.8.0/cuda-c-programming-guide/index.html +- CUDA Programming Guide, `memcpy_async` and `cuda::pipeline`: https://docs.nvidia.com/cuda/archive/11.6.2/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/atomics-and-reductions/DOC.md b/content/cuda/docs/atomics-and-reductions/DOC.md new file mode 100644 index 00000000..aa8b6b91 --- /dev/null +++ b/content/cuda/docs/atomics-and-reductions/DOC.md @@ -0,0 +1,94 @@ +--- +name: atomics-and-reductions +description: "CUDA atomics and reduction essentials: atomicAdd, shared/global scope, warp-first reduction, and common tradeoffs." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,atomics,reduction,atomicAdd,atomicCAS,shared-memory,warp-reduction" +--- + +# CUDA Atomics And Reductions (C++) + +Use this page when deciding between direct atomics, shared-memory reductions, and warp-first reduction patterns. + +## Atomic Basics + +An atomic operation performs a read-modify-write sequence as one atomic transaction on a word in global or shared memory. + +Common examples: + +- `atomicAdd` +- `atomicCAS` +- `atomicMax` +- `atomicMin` + +Atomics are correct tools for contention-sensitive updates, but they can serialize hot spots. + +## Scope Choice + +- shared-memory atomics are useful for contention within one block +- global-memory atomics are visible across blocks but usually cost more under heavy contention + +A common pattern is: + +1. reduce within a warp +2. reduce within a block using shared memory +3. emit one global atomic per block + +## Preferred Reduction Structure + +For many reductions, do not start with one atomic per thread. + +Better default: + +- first use warp shuffle reduction +- then combine warp results in shared memory +- then write one value per block or one atomic per block + +This reduces contention and memory traffic. 
+ +## When Direct Atomics Are Fine + +Direct global atomics are often acceptable when: + +- the output has low contention +- the kernel is not dominated by the atomic path +- simplicity matters more than peak throughput + +Examples: + +- histogram with many bins and good distribution +- sparse accumulation with low collision probability + +## When Atomics Become A Problem + +Expect trouble when: + +- many threads update the same location +- the output space is very small +- the kernel becomes serialization-bound + +In those cases, switch to hierarchical reduction or privatization. + +## Minimal Strategy Guide + +- one scalar result per block: block reduction in shared memory +- one scalar result for the whole grid: block reduction plus final stage +- many bins with moderate collisions: shared-memory privatization, then flush +- warp-local aggregation: use shuffle before touching shared or global memory + +## Related Topics + +- Shared memory staging: `../shared-memory/DOC.md` +- Warp-level collectives: `../warp-primitives/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, atomic functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, reduction and shared-memory patterns: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/benchmarking-methodology/DOC.md b/content/cuda/docs/benchmarking-methodology/DOC.md new file mode 100644 index 00000000..4a41274b --- /dev/null +++ b/content/cuda/docs/benchmarking-methodology/DOC.md @@ -0,0 +1,74 @@ +--- +name: benchmarking-methodology +description: "CUDA benchmarking methodology essentials: warmup, synchronization discipline, stable inputs, percentile reporting, and fair comparisons." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,benchmark,methodology,warmup,timing,percentile,variance,fair-comparison" +--- + +# CUDA Benchmarking Methodology (C++) + +Use this page when you need benchmark numbers that are comparable and reproducible. + +## Core Rules + +1. measure steady state, not cold start. +2. use correct synchronization for the scope being measured. +3. keep input shapes and distributions fixed across variants. +4. report variability, not just one best run. + +## Warmup + +Always include warmup iterations before measurement to absorb: + +- JIT or first-use overheads +- cache/allocator/transient startup effects + +## Timing Discipline + +For kernel timing: + +- use event-based timing around the measured stream segment +- avoid mixing host wall-clock timing with unsynchronized device work + +For end-to-end latency: + +- include all relevant host/device stages intentionally +- document what is excluded + +## Comparison Hygiene + +- same hardware and driver/toolkit +- same input set and batch strategy +- same precision and algorithm settings +- same determinism flags where relevant + +Any mismatch here can invalidate claimed speedups. + +## Reporting + +Report at least: + +- median +- p90/p95 (or similar tail percentile) +- run-to-run variance + +Single minimum time is not sufficient for production-facing claims. 
+ +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- NVTX profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, measurement and optimization workflow context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Runtime API, event timing APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/build-and-abi-compatibility/DOC.md b/content/cuda/docs/build-and-abi-compatibility/DOC.md new file mode 100644 index 00000000..2c69e868 --- /dev/null +++ b/content/cuda/docs/build-and-abi-compatibility/DOC.md @@ -0,0 +1,72 @@ +--- +name: build-and-abi-compatibility +description: "CUDA build and ABI compatibility essentials: arch targets, PTX/SASS forward-compat strategy, runtime/driver constraints, and packaging hygiene." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,build,abi,compatibility,sm-arch,ptx,sass,nvcc,driver-runtime" +--- + +# CUDA Build And ABI Compatibility (C++) + +Use this page when shipping CUDA binaries across different GPU architectures and deployment environments. 
+ +## Targeting Strategy + +Build artifacts can include: + +- SASS for specific SM architectures +- PTX for forward compatibility via JIT on newer compatible drivers + +A common practical strategy is to include both: + +- native SASS for known deployment GPUs +- PTX fallback for future-compatible targets + +## Why Compatibility Breaks + +Typical mismatch classes: + +- runtime-toolkit vs driver capability mismatch +- missing arch target in build flags +- ABI or dependency mismatch in host integration + +Treat compatibility as part of release engineering, not a last-minute fix. + +## NVCC Arch Hygiene + +Use explicit arch targets and document them in build config. + +- keep `-gencode` matrix aligned with actual fleet GPUs +- avoid shipping only one narrow arch unless environment is fixed + +## Runtime/Driver Considerations + +- new toolkits can require minimum driver versions +- deployment systems may lag driver updates + +Validate on representative driver/toolkit combinations before release. 
+ +## Package-Level Practices + +- pin toolkit version in CI +- record compile flags in build metadata +- verify cold-start JIT overhead if PTX fallback is expected +- add smoke tests per target GPU class + +## Related Topics + +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` +- Runtime API overview: `../runtime/DOC.md` +- PTX ISA overview: `../ptx/DOC.md` + +## Official Source Links (Fact Check) + +- NVCC Compiler Driver documentation: https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html +- CUDA Compatibility documentation: https://docs.nvidia.com/deploy/cuda-compatibility/index.html +- CUDA Installation Guide (version/driver context): https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cache-behavior-and-access-policy/DOC.md b/content/cuda/docs/cache-behavior-and-access-policy/DOC.md new file mode 100644 index 00000000..24ca3df3 --- /dev/null +++ b/content/cuda/docs/cache-behavior-and-access-policy/DOC.md @@ -0,0 +1,73 @@ +--- +name: cache-behavior-and-access-policy +description: "CUDA cache-behavior essentials: locality patterns, read-only paths, L2 persistence windows, and access-policy tradeoffs." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cache,l2,access-policy,persistence-window,read-only-cache,locality,stream-attributes" +--- + +# CUDA Cache Behavior And Access Policy (C++) + +Use this page when kernels are bandwidth-limited and cache behavior is the next bottleneck. + +## First Principle + +No cache hint compensates for fundamentally poor locality. + +Always fix: + +- coalescing +- reuse distance +- working set shape + +before tuning cache policy knobs. + +## Read-Only And Locality-Aware Access + +Read-only paths and locality-aware layouts can reduce memory traffic pressure. 
+ +- group neighboring accesses by neighboring threads +- avoid random scatter in the hottest loops +- keep reused regions compact when possible + +## L2 Access Policy Window + +CUDA exposes stream-level access-policy controls for L2 persistence behavior. + +- set stream attributes for persistence windows +- use them only for demonstrably reused regions +- tune hit ratio assumptions carefully + +Overusing persistence windows can hurt other traffic and reduce global efficiency. + +## Practical Workflow + +1. identify hotspot kernels. +2. confirm memory-bound behavior with profiling. +3. improve layout/coalescing first. +4. test cache/access-policy changes incrementally. +5. keep only changes that improve end-to-end latency. + +## Common Pitfalls + +- setting cache policy globally without per-kernel evidence +- treating cache hints as deterministic guarantees +- ignoring multi-stream interference in shared cache resources + +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, L2 persistence/access-policy window APIs: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, memory-system optimization context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/coalescing/DOC.md b/content/cuda/docs/coalescing/DOC.md new file mode 100644 index 00000000..296a5271 --- /dev/null +++ b/content/cuda/docs/coalescing/DOC.md @@ -0,0 +1,132 @@ +--- +name: coalescing +description: "CUDA global-memory coalescing essentials: contiguous access, pitch, striding, and when shared memory helps." 
+metadata: + languages: "cpp" + versions: "13.0" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,coalescing,memory-coalescing,coalesced-access,uncoalesced-access,global-memory,memory-bandwidth,stride,pitch,shared-memory,transpose" +--- + +# CUDA Memory Coalescing (C++) + +Use this page for global-memory access-pattern rules that determine whether a kernel uses bandwidth efficiently. + +## What Coalescing Means + +Coalescing is the hardware combining a warp's global-memory accesses into as few memory transactions as possible. + +At a high level: + +- adjacent threads should usually access adjacent addresses +- strided or scattered access wastes bandwidth +- good coalescing matters most in memory-bound kernels + +## Best Default Pattern + +For a 1D array, prefer: + +```cpp +int i = blockIdx.x * blockDim.x + threadIdx.x; +value = input[i]; +``` + +This maps neighboring threads to neighboring elements. + +## Common Bad Pattern + +Patterns like this often destroy coalescing: + +```cpp +int i = blockIdx.x * blockDim.x + threadIdx.x; +value = input[i * stride]; +``` + +Large stride across a warp usually turns one efficient transaction pattern into many inefficient ones. + +## 2D Arrays and Pitch + +For 2D row-major arrays, accesses are most efficient when: + +- threads move along the row dimension together +- row width is aligned well for warp-based access + +If width is not naturally aligned for the hardware, use pitched allocation: + +- `cudaMallocPitch` +- `cudaMemcpy2D` + +This is the standard fix when row width is awkward and rows need padding. 
+ +## Shared Memory As A Reordering Tool + +Shared memory is often used together with coalescing: + +- load from global memory in a coalesced pattern +- reorder in shared memory +- consume in the algorithm's preferred order + +This is a common pattern for: + +- transpose +- tiled GEMM +- stencil halos +- gather/scatter restructuring + +## Coalescing vs Bank Conflicts + +These are different problems: + +- coalescing concerns global-memory transactions +- bank conflicts concern shared-memory accesses + +A kernel can have good coalescing and bad shared-memory banking, or the reverse. + +## Practical Heuristics + +- if a warp reads a row of contiguous elements, that is usually good +- if a warp reads a column from a row-major array directly, that is usually bad +- if a transpose-like pattern is needed, use shared memory to convert the access pattern +- align vectorized loads when using `float2` / `float4` + +## Minimal Tiling Pattern + +```cpp +__shared__ float tile[32][33]; + +int x = blockIdx.x * 32 + threadIdx.x; +int y = blockIdx.y * 32 + threadIdx.y; + +tile[threadIdx.y][threadIdx.x] = input[y * width + x]; +__syncthreads(); +``` + +This style is common because: + +- the global load can be coalesced +- the padded shared tile helps avoid bank conflicts during transposed access + +## When To Suspect Coalescing Problems + +- bandwidth is far below expectation +- profiling shows many global-memory transactions per requested byte +- a transpose or gather/scatter kernel is unexpectedly slow +- changing block shape changes performance dramatically + +## Related Topics + +- Shared memory usage: `../shared-memory/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Memory-space selection: `../memory-hierarchy/DOC.md` +- Runtime API overview: `../runtime/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, optimizing memory access: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Best 
Practices Guide, coalesced access to global memory: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#coalesced-access-to-global-memory +- CUDA C++ Best Practices Guide, shared memory and matrix multiplication examples: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#shared-memory +- CUDA C++ Programming Guide, 2D arrays and pitched allocation: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/collective-communication-patterns/DOC.md b/content/cuda/docs/collective-communication-patterns/DOC.md new file mode 100644 index 00000000..f7d55132 --- /dev/null +++ b/content/cuda/docs/collective-communication-patterns/DOC.md @@ -0,0 +1,66 @@ +--- +name: collective-communication-patterns +description: "CUDA collective communication essentials: reductions, scans, histogram-like updates, and hierarchical aggregation patterns." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,collective,reduction,scan,histogram,aggregation,warp-collective,block-collective" +--- + +# CUDA Collective Communication Patterns (C++) + +Use this page for patterns where many threads combine, distribute, or summarize values. + +## Common Collective Types + +- reduction (sum/max/min/etc.) +- scan/prefix sum +- histogram and bucketized accumulation +- vote/ballot-based filtering + +## Hierarchical Strategy + +A standard high-performance pattern is hierarchical: + +1. intra-warp collective (shuffle/vote) +2. intra-block collective (shared memory) +3. cross-block aggregation (global memory or multi-stage kernel) + +This minimizes global contention. 
+ +## Reduction Pattern + +- reduce in warp first with `__shfl*_sync` +- write one value per warp to shared memory +- final block reduction +- optionally one global write/atomic per block + +## Scan Pattern + +- use block-local scan primitives +- stitch block boundaries in a second phase when global prefix is required + +Avoid forcing a single global synchronization model in one monolithic kernel. + +## Histogram-Like Pattern + +- privatize bins per warp/block when feasible +- merge privately accumulated bins later + +Direct global atomics on a small bin set are usually the worst-case path. + +## Related Topics + +- Warp primitives: `../warp-primitives/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Synchronization: `../synchronization/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, warp intrinsics and synchronization primitives: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, reduction and memory optimization context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/compute-bound-kernel-optimization-playbook/DOC.md b/content/cuda/docs/compute-bound-kernel-optimization-playbook/DOC.md new file mode 100644 index 00000000..9a6ae433 --- /dev/null +++ b/content/cuda/docs/compute-bound-kernel-optimization-playbook/DOC.md @@ -0,0 +1,65 @@ +--- +name: compute-bound-kernel-optimization-playbook +description: "Compute-bound kernel optimization playbook: instruction mix, occupancy/ILP balance, register pressure control, and path selection." 
+metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,compute-bound,optimization,instruction-mix,occupancy,ilp,register-pressure,cuda-core,tensor-core" +--- + +# Compute-Bound Kernel Optimization Playbook (C++) + +Use this page after profiling indicates arithmetic throughput is the dominant limiter. + +## Primary Objectives + +- Improve useful instruction issue rate. +- Reduce dependency and scheduling stalls. +- Select the right arithmetic path (CUDA Core vs Tensor Core). + +## High-Impact Levers + +- Improve instruction mix in hot loops. +- Balance occupancy and ILP. +- Control register usage to avoid spill-driven regressions. +- Evaluate Tensor Core migration only when workload shape supports it. + +## Triage Sequence + +1. Confirm the kernel is truly compute-bound after memory cleanup. +2. Inspect stall reasons related to dependencies and issue efficiency. +3. Tune unroll depth and block geometry together. +4. Re-evaluate path selection (`cuda-core` vs `wmma`/Tensor Core). + +## Common Failure Modes + +- Aggressive unrolling increases spills and slows kernel. +- Occupancy chasing hurts per-warp progress. +- Tensor Core migration applied to non-matrix-like workloads. + +## Verification Checklist + +- Throughput metrics improve with stable correctness. +- Register spills do not increase unexpectedly. +- End-to-end runtime improves for production-representative shapes. 
+ +## Related Topics + +- Compute throughput: `../compute-throughput/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- Tensor Cores: `../tensor-cores/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, arithmetic throughput context: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/compute-throughput/DOC.md b/content/cuda/docs/compute-throughput/DOC.md new file mode 100644 index 00000000..1c4e9d3a --- /dev/null +++ b/content/cuda/docs/compute-throughput/DOC.md @@ -0,0 +1,105 @@ +--- +name: compute-throughput +description: "CUDA compute-throughput essentials: arithmetic throughput tables, latency hiding, and when Tensor Cores beat ordinary arithmetic paths." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,throughput,compute-bound,fp32,fp16,int32,cuda-core,tensor-core,latency-hiding" +--- + +# CUDA Compute Throughput (C++) + +Use this page to reason about whether a kernel is limited by ordinary arithmetic throughput, Tensor Core throughput, or memory behavior. + +## The First Split + +Ask this first: + +- is the kernel memory-bound? +- or is it compute-bound? + +If memory traffic dominates, moving from ordinary arithmetic to Tensor Cores may not help much until memory behavior is fixed. + +## Ordinary Arithmetic Path + +The CUDA Programming Guide publishes per-SM throughput tables for native arithmetic instructions. 
+ +These tables show that: + +- throughput depends strongly on architecture +- FP32, FP16, INT32, and FP64 do not have the same peak rates +- per-SM throughput must be multiplied by SM count for whole-device peak + +So a generic "CUDA Core throughput" number is not enough by itself. The relevant question is which instruction family the kernel actually uses. + +## Tensor Core Path + +Tensor Cores can provide much higher matrix-multiply-accumulate throughput than ordinary scalar arithmetic paths when: + +- the algorithm is matrix-multiply-like +- supported data types are acceptable +- tile shapes and layouts match the API and hardware requirements +- data staging overhead does not erase the gains + +This is why GEMM, attention, and convolution-like kernels are common Tensor Core candidates, while control-heavy kernels usually are not. + +## Throughput Is Not Just Peak Math + +A kernel can miss peak throughput because of: + +- dependency chains that the scheduler cannot hide +- low occupancy +- poor instruction mix +- register pressure +- memory stalls before arithmetic units are saturated + +So "Tensor Core capable" does not imply "Tensor Core efficient". 
+ +## Practical Decision Rule + +Stay on the ordinary arithmetic path when: + +- the operation is elementwise or irregular +- there is too much branching or indexing complexity +- supported Tensor Core types or layouts do not fit the problem + +Move toward Tensor Cores when: + +- the kernel is dominated by dense matrix multiply-accumulate +- the math can be tiled at warp granularity +- data movement can be organized cleanly + +## What To Check In Practice + +- achieved memory bandwidth +- achieved occupancy +- instruction mix +- whether warp-level matrix instructions are present +- whether the kernel is actually compute-bound after memory optimization + +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Tensor Core API usage: `../tensor-cores/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Shared memory staging: `../shared-memory/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, arithmetic instruction throughput tables: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, instruction-throughput interpretation: https://docs.nvidia.com/cuda/archive/11.7.0/cuda-c-programming-guide/index.html +- Turing Tuning Guide, SM execution resources and latency hiding discussion: 
https://docs.nvidia.com/cuda/archive/12.4.0/turing-tuning-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cooperative-groups/DOC.md b/content/cuda/docs/cooperative-groups/DOC.md new file mode 100644 index 00000000..1a775076 --- /dev/null +++ b/content/cuda/docs/cooperative-groups/DOC.md @@ -0,0 +1,104 @@ +--- +name: cooperative-groups +description: "CUDA Cooperative Groups essentials: thread_block, tiled_partition, coalesced_threads, cluster groups, and collective participation rules." +metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cooperative-groups,thread_block,tiled_partition,coalesced_threads,this_grid,this_cluster,group-sync" +--- + +# CUDA Cooperative Groups (C++) + +Use this page when kernels need explicit group objects rather than hard-coding assumptions about blocks and warps. + +## Why Cooperative Groups Exists + +Cooperative Groups makes the participating set of threads explicit. + +Instead of assuming "all threads in the block" or "one warp", code can pass a group object into a helper and make the collective scope explicit. + +This improves: + +- software composition +- readability +- portability across newer GPU behaviors + +## Common Group Handles + +Frequently used accessors include: + +- `this_thread_block()` +- `this_grid()` +- `coalesced_threads()` +- `this_cluster()` + +Common types and concepts include: + +- `thread_group` +- `thread_block` +- tiled partitions +- cluster groups + +## Basic Thread Block Example + +```cpp +namespace cg = cooperative_groups; + +cg::thread_block block = cg::this_thread_block(); +block.sync(); +``` + +`block.sync()` is the Cooperative Groups form of block-wide synchronization. 
+ +## Tiled Partition + +Use `tiled_partition()` to decompose a block into smaller groups: + +```cpp +auto block = cg::this_thread_block(); +auto tile32 = cg::tiled_partition<32>(block); +``` + +The compile-time tile size yields a `thread_block_tile<32>`, which exposes warp-level collectives (shuffle, ballot, and similar). This is useful for warp-sized or sub-warp collectives without manually reasoning about lane groups everywhere in the code. The runtime-sized overload `cg::tiled_partition(block, 32)` instead returns a generic `thread_group` without those collective members. + +## Participation Rule + +Collective operations require correct participation. + +- all threads in the group must participate in collective operations +- the group handle should be created consistently +- it is best to obtain implicit groups early, before divergence + +Violating participation assumptions leads to undefined behavior. + +## Practical Guidance + +- pass group handles by reference into helper functions +- prefer specialized groups instead of over-generic abstractions when performance matters +- create implicit handles early in the kernel + +## Where It Connects To Other Features + +Cooperative Groups is the user-facing bridge for several advanced CUDA features: + +- tiled warp/block decomposition +- async copy collectives like `memcpy_async` +- cluster groups with `this_cluster()` + +## Related Topics + +- Synchronization rules: `../synchronization/DOC.md` +- Warp primitives: `../warp-primitives/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Thread Block Clusters: `../thread-block-clusters/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, Cooperative Groups: https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/cooperative-groups.html +- CUDA Programming Guide, classic Cooperative Groups overview: https://docs.nvidia.com/cuda/archive/9.2/cuda-c-programming-guide/ +- CUDA Programming Guide, modern cluster and implicit-group accessors: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/01-introduction/programming-model.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cublas-cudnn-integration-patterns/DOC.md 
b/content/cuda/docs/cublas-cudnn-integration-patterns/DOC.md new file mode 100644 index 00000000..9c486942 --- /dev/null +++ b/content/cuda/docs/cublas-cudnn-integration-patterns/DOC.md @@ -0,0 +1,71 @@ +--- +name: cublas-cudnn-integration-patterns +description: "CUDA library integration essentials: cuBLAS/cuDNN handle lifecycle, stream binding, workspace policy, and mixed custom-kernel pipelines." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cublas,cudnn,integration,handle,stream-binding,workspace,mixed-pipeline" +--- + +# cuBLAS/cuDNN Integration Patterns (C++) + +Use this page when combining custom CUDA kernels with cuBLAS or cuDNN calls. + +## Handle Lifecycle + +Library handles should usually be: + +- created once per host thread/context +- reused across iterations +- destroyed at controlled shutdown + +Frequent create/destroy in hot paths adds overhead. + +## Stream Binding Rule + +Bind library handles to the correct stream before issuing calls. + +- cuBLAS/cuDNN work should run in the intended stream +- stream mismatch causes accidental serialization or race-like ordering bugs + +## Workspace Strategy + +Many cuDNN and some cuBLAS paths use temporary workspace. + +- allocate and reuse workspace buffers where possible +- avoid repeated malloc/free during steady-state loops +- keep workspace sizing policy consistent with algorithm selection + +## Mixed Pipelines + +Common pattern: + +1. pre/post-processing in custom kernels +2. dense math in cuBLAS/cuDNN +3. follow-up custom kernels + +Use events/stream ordering rather than global synchronization between stages. + +## Determinism And Performance + +Algorithm choices can trade determinism and speed. 
+ +- production training/inference pipelines should explicitly document determinism expectations +- benchmark with the exact settings that production will use + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- cuBLAS documentation: https://docs.nvidia.com/cuda/cublas/index.html +- cuDNN documentation: https://docs.nvidia.com/deeplearning/cudnn/latest/ +- CUDA Runtime API (stream interoperability): https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/cuda-core-optimization-checklist/DOC.md b/content/cuda/docs/cuda-core-optimization-checklist/DOC.md new file mode 100644 index 00000000..71d9ed4f --- /dev/null +++ b/content/cuda/docs/cuda-core-optimization-checklist/DOC.md @@ -0,0 +1,73 @@ +--- +name: cuda-core-optimization-checklist +description: "CUDA Core optimization checklist: coalescing, divergence control, occupancy/ILP balancing, and measurement-first tuning." +metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-core,optimization,checklist,coalescing,divergence,occupancy,ilp,register-pressure,latency-hiding" +--- + +# CUDA Core Optimization Checklist (C++) + +Use this page when a kernel is intentionally on the ordinary arithmetic path and needs systematic optimization. + +## Step 1: Confirm The Bottleneck Class + +Before changing code, classify the kernel: + +- memory-bound +- compute-bound +- launch/orchestration-bound + +Use profiling first. Do not optimize blind. + +## Step 2: Memory Access Quality + +- Ensure global-memory accesses are coalesced. +- Reduce redundant loads with reuse (register/shared memory where appropriate). +- Avoid severe shared-memory bank conflicts in staging buffers. 
+ +## Step 3: Control Flow Quality + +- Reduce divergence in hot warps. +- Make branch conditions uniform where possible. +- Move rare-path logic off hot loops when feasible. + +## Step 4: Occupancy And ILP Balance + +- Avoid maximizing occupancy as a standalone goal. +- Tune block size, unroll depth, and register footprint together. +- Improve ILP when scoreboard/dependency stalls dominate. + +## Step 5: Validate Every Optimization + +- Reprofile after each major change. +- Track throughput, stall mix, occupancy, and memory metrics together. +- Keep correctness checks and numerical checks in the loop. + +## Common Anti-Patterns + +- Chasing one metric (for example occupancy) while total throughput worsens. +- Heavy unrolling that increases register spills. +- Introducing shared memory without fixing access pattern quality. + +## Related Topics + +- CUDA Core path overview: `../cuda-core/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Occupancy: `../occupancy/DOC.md` +- Coalescing: `../coalescing/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- Bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/cuda-core-vs-tensor-core-path-selection/DOC.md b/content/cuda/docs/cuda-core-vs-tensor-core-path-selection/DOC.md new file mode 100644 index 00000000..9bf072b5 --- /dev/null +++ b/content/cuda/docs/cuda-core-vs-tensor-core-path-selection/DOC.md @@ -0,0 +1,92 @@ +--- +name: cuda-core-vs-tensor-core-path-selection +description: "Path selection guide: deciding between CUDA Core and Tensor Core execution using workload 
shape, dtype, layout, and numerical constraints." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-core,tensor-core,path-selection,wmma,wgmma,dtype,layout,precision,fallback" +--- + +# CUDA Core vs Tensor Core Path Selection (C++) + +Use this page when deciding whether to implement or keep a kernel on ordinary arithmetic pipelines or move it to Tensor Core matrix instructions. + +## Fast Decision Matrix + +Choose CUDA Core path first when: + +- operation is elementwise, reduction-heavy, sparse, or control-heavy +- matrix structure is weak or tile reuse is poor +- required dtype/layout does not map cleanly to Tensor Core-supported combinations + +Choose Tensor Core path first when: + +- workload is dominated by dense matrix-multiply-accumulate +- shape and layout can be tiled consistently at warp or warpgroup granularity +- allowed dtype/accumulation policy matches supported Tensor Core paths + +## Data-Type And Numerics Gate + +Before migration, verify: + +- multiplicand and accumulator types are supported by the target path +- error budget tolerates the chosen precision policy +- baseline parity tests pass with realistic input distributions + +If these checks fail, forcing Tensor Core instructions can create unstable numerics or hidden fallback behavior. + +## Layout And Staging Gate + +Tensor Core speedups depend on movement cost. + +Require: + +- consistent layout contracts (`row_major`/`col_major`, leading dimensions) +- efficient shared-memory staging plan +- synchronization protocol that does not serialize hot loops + +If memory behavior remains dominant after staging optimization, keep CUDA Core path and optimize arithmetic/memory overlap there. + +## Performance Validation Protocol + +1. Build a correctness baseline. +2. Profile CUDA Core implementation to identify real bottlenecks. +3. Implement Tensor Core path candidate. +4. 
Compare throughput, memory pressure, occupancy, and stall behavior. +5. Keep the faster path under expected production shapes, not just synthetic peak cases. + +## Fallback Strategy + +Production kernels should keep explicit fallback behavior: + +- capability checks for architecture/toolchain support +- shape or dtype guards for unsupported combinations +- deterministic fallback to CUDA Core implementation + +This avoids silent behavior drift across deployment environments. + +## Practical Rule Of Thumb + +- Default to CUDA Core path for generality and low complexity. +- Move to Tensor Core path for matrix-dense hotspots after profiling confirms arithmetic throughput is the limiting factor. +- Keep both paths when workload diversity is high. + +## Related Topics + +- CUDA Core path: `../cuda-core/DOC.md` +- Tensor Core overview: `../tensor-cores/DOC.md` +- WMMA practical patterns: `../wmma-kernel-patterns/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Fallback/capability detection: `../fallback-strategies-and-capability-detection/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide (execution model, WMMA, memory model): https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide (memory and throughput guidance): https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/cuda-core/DOC.md b/content/cuda/docs/cuda-core/DOC.md new file mode 100644 index 00000000..e63a79b8 --- /dev/null +++ b/content/cuda/docs/cuda-core/DOC.md @@ -0,0 +1,91 @@ +--- +name: cuda-core +description: "CUDA Core path essentials: SIMT arithmetic pipelines, warp scheduling, ILP/occupancy tradeoffs, and practical optimization workflow." 
+metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-core,simt,fp32,int32,warp,scheduler,ilp,occupancy,latency-hiding" +--- + +# CUDA Core Path (C++) + +Use this page for kernels that run on ordinary SM arithmetic pipelines (the path developers usually call "CUDA Core path"), not Tensor Core matrix instructions. + +## What This Means In Practice + +For CUDA C++ kernels, "CUDA Core path" usually means: + +- ordinary scalar or vector arithmetic instructions (FP32, INT32, FP64, and related ops) +- SIMT warp execution on standard SM arithmetic pipelines +- no explicit warp-matrix API (`wmma`) and no PTX warpgroup matrix instructions (`wgmma`) + +There is no separate CUDA C++ API named "CUDA Core". The distinction is a performance and execution-model distinction. + +## Typical Workloads + +Kernels that usually remain on this path: + +- elementwise transforms +- reductions and scans with limited matrix structure +- indexing-heavy or branch-heavy kernels +- irregular sparse kernels + +Even in ML workloads, many preprocessing, activation, normalization, and indexing phases are CUDA Core dominated. + +## Optimization Checklist + +1. Make global memory access coalesced. +2. Remove avoidable divergence in hot warps. +3. Balance occupancy and register pressure instead of maximizing occupancy blindly. +4. Increase instruction-level parallelism where dependency chains are long. +5. Validate cache and shared-memory behavior before deep unrolling. + +## Occupancy vs ILP Tradeoff + +Two common failure modes: + +- **High occupancy, low per-warp progress:** too little ILP, frequent dependency stalls. +- **High ILP, low occupancy:** register usage or shared-memory usage blocks enough resident warps. + +Tune block size, unroll factors, and register usage together. Treat occupancy as a means to hide latency, not as the final objective. 
+ +## How To Verify You Are On This Path + +In profiler output, check whether runtime is dominated by ordinary arithmetic instruction activity and not matrix instruction activity. Also check: + +- warp stall reasons (dependency, memory throttling, execution dependency) +- achieved occupancy +- memory throughput utilization +- instruction mix consistency with kernel intent + +If your intended Tensor Core kernel shows only ordinary arithmetic activity, the path selection is wrong. + +## When To Escalate To Tensor Cores + +Move to Tensor Cores when all are true: + +- workload is dominated by dense matrix-multiply-accumulate +- data types and layouts match supported matrix instruction paths +- staging and synchronization overhead can be controlled +- numerical policy is acceptable (for example FP16/BF16/TF32 with chosen accumulation) + +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Occupancy: `../occupancy/DOC.md` +- Warp primitives: `../warp-primitives/DOC.md` +- Tensor Cores: `../tensor-cores/DOC.md` +- Path selection guide: `../cuda-core-vs-tensor-core-path-selection/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, SIMT and warp execution: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Programming Guide, arithmetic instruction throughput interpretation: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- Turing Tuning Guide, latency hiding and scheduler behavior: https://docs.nvidia.com/cuda/archive/12.4.0/turing-tuning-guide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/cuda-graphs/DOC.md b/content/cuda/docs/cuda-graphs/DOC.md new file mode 100644 index 00000000..027478a1 --- /dev/null +++ b/content/cuda/docs/cuda-graphs/DOC.md @@ -0,0 +1,104 @@ +--- +name: cuda-graphs +description: "CUDA Graphs essentials: definition, instantiation, execution, stream capture, 
cross-stream event capture, and update rules." +metadata: + languages: "cpp" + versions: "12.6" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,cuda-graphs,graph,stream-capture,cudaStreamBeginCapture,cudaGraphLaunch,cudaGraphInstantiate" +--- + +# CUDA Graphs (C++) + +Use this page when the same workflow launches repeatedly and CPU launch overhead from streams becomes significant. + +## Why Graphs Exist + +CUDA Graphs separate work submission into: + +1. definition +2. instantiation +3. execution + +This amortizes setup work and can reduce CPU launch overhead compared with issuing many short kernels one by one into streams. + +## Two Creation Paths + +Graphs can be created by: + +- explicit graph APIs +- stream capture + +Stream capture is often the easiest migration path for existing stream-based code. + +## Stream Capture + +Typical pattern: + +```cpp +cudaGraph_t graph; + +cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal); +kernelA<<<grid, block>>>(...); +kernelB<<<grid, block>>>(...); +cudaStreamEndCapture(stream, &graph); +``` + +During capture, work is appended to a graph instead of being immediately enqueued for execution. + +## Event-Based Cross-Stream Capture + +CUDA documents that stream capture can preserve cross-stream dependencies expressed with: + +- `cudaEventRecord()` +- `cudaStreamWaitEvent()` + +provided the waited-on event belongs to the same capture graph. + +## Execution Lifecycle + +After a graph is defined: + +- instantiate it into an executable graph +- launch the executable graph into a stream +- reuse it many times if the workflow is stable + +Graphs help most when the structure is repeated often enough to amortize instantiation. 
+ +## Common Capture Hazards + +- using unsupported APIs during capture +- mixing captured and non-captured dependencies incorrectly +- synchronizing captured streams or captured events in invalid ways +- relying on legacy default stream behavior during capture + +When a capture is invalidated, the graph becomes unusable and capture must be ended. + +## When Graphs Help + +Graphs are especially useful when: + +- kernels are short and launch overhead is material +- the workflow topology repeats +- stream orchestration logic is otherwise host-heavy + +They are less useful when: + +- the workload shape changes every iteration +- the overhead is dominated by kernel execution, not launch + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Runtime API overview: `../runtime/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, CUDA Graphs overview: https://docs.nvidia.com/cuda/archive/12.6.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, stream capture and cross-stream events: https://docs.nvidia.com/cuda/archive/11.7.0/cuda-c-programming-guide/index.html +- CUDA Programming Guide, earlier graph API examples: https://docs.nvidia.com/cuda/archive/12.2.0/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/data-layout-and-alignment/DOC.md b/content/cuda/docs/data-layout-and-alignment/DOC.md new file mode 100644 index 00000000..05281964 --- /dev/null +++ b/content/cuda/docs/data-layout-and-alignment/DOC.md @@ -0,0 +1,80 @@ +--- +name: data-layout-and-alignment +description: "CUDA data-layout and alignment essentials: struct packing, vectorized loads/stores, pitch/stride choices, and alignment-driven performance." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,data-layout,alignment,vectorized-load,vectorized-store,pitch,stride,coalescing" +--- + +# CUDA Data Layout And Alignment (C++) + +Use this page when kernel performance depends on memory layout details. + +## Why Layout Matters + +On CUDA GPUs, layout affects: + +- coalescing behavior +- transaction count +- shared-memory bank behavior +- feasibility of vectorized loads/stores + +Poor layout can dominate runtime even when arithmetic is optimized. + +## Alignment Basics + +Prefer natural alignment for data types and vectorized access. + +- align pointers and base addresses to vector width +- keep struct fields ordered to reduce padding surprises +- avoid accidental misalignment from custom allocators or byte offsets + +## AoS vs SoA + +For many throughput-oriented kernels: + +- SoA (structure of arrays) is often better for coalesced parallel access +- AoS (array of structs) can be easier semantically but may scatter accessed fields + +Choose based on the access pattern of active threads, not only code convenience. + +## Vectorized Access + +Vectorized loads/stores (`float2`, `float4`, etc.) are useful when: + +- data is aligned to the vector width +- adjacent threads follow contiguous access +- vectorization does not introduce awkward tail handling overhead + +Always verify achieved bandwidth after vectorization; assumptions are often wrong. 
+ +## 2D Layouts + +For 2D tensors/arrays: + +- row-major contiguous row access is usually easiest to coalesce +- use pitched allocation when row width alignment is problematic +- treat logical shape and physical stride as separate concepts in APIs + +## Common Pitfalls + +- hidden misalignment from packed/byte-offset structs +- mixing row-major assumptions with column-oriented access +- forcing vectorized access on unaligned data + +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Memory hierarchy: `../memory-hierarchy/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory access patterns and alignment context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, memory model and type/layout background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/driver/DOC.md b/content/cuda/docs/driver/DOC.md new file mode 100644 index 00000000..ee388665 --- /dev/null +++ b/content/cuda/docs/driver/DOC.md @@ -0,0 +1,73 @@ +--- +name: driver +description: "CUDA Driver API essentials: explicit context management, module loading, and kernel launch." +metadata: + languages: "cpp" + versions: "12.4" + revision: 1 + updated-on: "2026-03-18" + source: community + tags: "cuda,gpu,kernel,driver,api,ptx" +--- + +# CUDA Driver API (C++) + +Use the Driver API when you need explicit control over contexts, modules, and dynamic kernel loading. It is lower-level than the Runtime API. + +## Basic Flow + +1. Initialize the driver and pick a device +2. Create a context +3. Load a module (PTX or cubin) +4. Get the kernel function +5. Allocate memory and launch +6. 
Cleanup + +```cpp +#include <cuda.h> +#include <cstdio> + +int main() { + CUdevice dev; + CUcontext ctx; + cuInit(0); + cuDeviceGet(&dev, 0); + cuCtxCreate(&ctx, 0, dev); + + CUmodule module; + CUfunction func; + cuModuleLoad(&module, "kernel.ptx"); + cuModuleGetFunction(&func, module, "my_kernel"); + + CUdeviceptr d_out; + cuMemAlloc(&d_out, 1024); + + void* args[] = { &d_out }; + cuLaunchKernel(func, + 1, 1, 1, + 256, 1, 1, + 0, 0, args, 0); + + cuMemFree(d_out); + cuModuleUnload(module); + cuCtxDestroy(ctx); + return 0; +} +``` + +## Core Driver APIs + +- Context: `cuInit`, `cuDeviceGet`, `cuCtxCreate`, `cuCtxDestroy` +- Module: `cuModuleLoad`, `cuModuleLoadData`, `cuModuleGetFunction` +- Memory: `cuMemAlloc`, `cuMemFree`, `cuMemcpyHtoD`, `cuMemcpyDtoH` +- Launch: `cuLaunchKernel` + +## Common Pitfalls + +- Forgetting to create a context before module operations +- Using mismatched kernel names between PTX and host code +- Not checking return codes (Driver API returns `CUresult`) + +## Related Topics + +- Module loading details: `references/module-loading.md` diff --git a/content/cuda/docs/driver/references/module-loading.md b/content/cuda/docs/driver/references/module-loading.md new file mode 100644 index 00000000..c7634b1a --- /dev/null +++ b/content/cuda/docs/driver/references/module-loading.md @@ -0,0 +1,19 @@ +# CUDA Driver Module Loading + +You can load modules from: + +- PTX text (JIT compiled): `cuModuleLoadData` or `cuModuleLoadDataEx` +- Cubin file (precompiled): `cuModuleLoad` + +Common patterns: + +```cpp +CUmodule module = nullptr; +CUresult r = cuModuleLoad(&module, "kernel.cubin"); +// or +r = cuModuleLoadData(&module, ptx_string); +``` + +Notes: +- `cuModuleLoadDataEx` lets you pass JIT options for diagnostics or optimization. +- Always unload modules with `cuModuleUnload` when done. 
diff --git a/content/cuda/docs/dynamic-parallelism/DOC.md b/content/cuda/docs/dynamic-parallelism/DOC.md new file mode 100644 index 00000000..068410f6 --- /dev/null +++ b/content/cuda/docs/dynamic-parallelism/DOC.md @@ -0,0 +1,65 @@ +--- +name: dynamic-parallelism +description: "CUDA Dynamic Parallelism essentials: device-side kernel launch semantics, synchronization behavior, and memory-space constraints." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,dynamic-parallelism,cdp,device-side-launch,child-kernel,cudaDeviceSynchronize,memory-coherence" +--- + +# CUDA Dynamic Parallelism (C++) + +Use this page when kernels launch other kernels on the device. + +## What It Is + +Dynamic Parallelism (CDP) lets device code launch child grids. + +- parent and child execute on the device +- launch configuration is provided from device code +- useful for irregular recursion-like or adaptive decomposition patterns + +## Core Semantics + +- child launch is asynchronous with respect to the launching thread by default +- synchronization choices in parent code determine when child results are consumed +- launch overhead is non-trivial; avoid using CDP for tiny kernels in hot loops + +## Memory-Space Coherence + +Key memory-space rule from CUDA docs: + +- parent and child share global/constant memory +- local and shared memory are private to their respective thread/block contexts + +Do not assume parent shared memory is visible to child kernels. + +## Typical Use Cases + +- adaptive refinement +- irregular tree/graph traversal +- work generation discovered during device execution + +For regular dense workloads, host-side launch or CUDA Graphs is usually better. 
+ +## Common Pitfalls + +- launching too many tiny child kernels +- misunderstanding parent/child visibility boundaries +- relying on implicit ordering that is not guaranteed + +## Related Topics + +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, Dynamic Parallelism: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, memory coherence in CDP: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/error-handling-and-debug-build/DOC.md b/content/cuda/docs/error-handling-and-debug-build/DOC.md new file mode 100644 index 00000000..e3453b5a --- /dev/null +++ b/content/cuda/docs/error-handling-and-debug-build/DOC.md @@ -0,0 +1,75 @@ +--- +name: error-handling-and-debug-build +description: "CUDA error-handling and debug-build essentials: launch checks, sync checks, debug flags, and diagnosis workflow." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,error-handling,cudaGetLastError,cudaPeekAtLastError,cudaDeviceSynchronize,debug-build,nvcc,-G,lineinfo" +--- + +# CUDA Error Handling And Debug Build (C++) + +Use this page for practical correctness diagnostics in CUDA applications. + +## Two-Step Error Check Pattern + +Always separate: + +1. launch configuration/API errors +2. runtime execution errors + +Typical pattern: + +```cpp +kernel<<<grid, block>>>(...); +cudaError_t e1 = cudaGetLastError(); // launch/config error +cudaError_t e2 = cudaDeviceSynchronize(); // execution error +``` + +Use stream-specific synchronization when possible instead of global device sync. 
+ +## Why This Matters + +- some errors are detected at launch +- others appear only when kernel execution actually runs + +Checking only one side can hide failures. + +## Debug Build Basics + +For debugging kernels, common compile choices include: + +- device debug info (`-G`) for heavy debug sessions +- line info (`-lineinfo`) for profiling-friendly symbol mapping + +Debug builds can change optimization and performance; do not compare debug and release timings directly. + +## Runtime Diagnostics + +- use descriptive error strings with `cudaGetErrorString` +- include kernel name / input shape in logs +- fail fast in development paths to avoid cascading corruption + +## Practical Workflow + +1. reproduce with smallest failing input. +2. enable strict launch+sync checks. +3. switch to debug-oriented build flags if needed. +4. profile or inspect only after correctness is stable. + +## Related Topics + +- Runtime overview: `../runtime/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- NVTX workflow: `../nvtx-and-profiling-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Runtime API, error handling APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html +- CUDA C++ Best Practices Guide, correctness and debugging guidance: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- NVCC documentation (debug flags): https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/execution-model/DOC.md b/content/cuda/docs/execution-model/DOC.md new file mode 100644 index 00000000..bbe48a0c --- /dev/null +++ b/content/cuda/docs/execution-model/DOC.md @@ -0,0 +1,93 @@ +--- +name: execution-model +description: "CUDA execution model essentials: warps, SM scheduling, divergence, and how ordinary arithmetic paths differ from Tensor Core paths." 
+metadata: + languages: "cpp" + versions: "13.1" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,execution-model,simt,warp,sm,scheduler,divergence,cuda-core,tensor-core" +--- + +# CUDA Execution Model (C++) + +Use this page to understand how CUDA threads are grouped and scheduled, and how ordinary arithmetic execution differs from Tensor Core execution. + +## SIMT Basics + +CUDA executes threads in groups of 32 called warps. + +- a warp is the main scheduling unit inside an SM +- threads in a warp conceptually execute the same kernel code in SIMT style +- divergence inside a warp reduces efficiency because different branch paths are executed separately + +This is why block sizes are usually chosen as multiples of 32. + +## SM-Level Scheduling + +An SM manages many resident warps and switches among them to hide latency. + +- when one warp stalls on memory or dependencies, the SM can issue instructions from another ready warp +- latency hiding depends on both occupancy and instruction-level parallelism +- exact scheduler and execution-unit details vary by architecture + +## What Developers Mean By "CUDA Core" + +NVIDIA documentation usually talks about instruction throughput, FP32/INT32/FP64 units, and SM execution resources rather than a CUDA C++ API called "CUDA Core". + +In practice, developers use "CUDA Core path" to mean: + +- ordinary arithmetic instructions such as FP32 / INT32 math +- standard SIMT execution on the SM's general arithmetic pipelines +- kernels that do not explicitly target Tensor Core matrix instructions + +This is an interpretation of the hardware execution model, not a separate CUDA C++ programming interface. + +## Tensor Core Path + +Tensor Cores are specialized matrix-multiply-accumulate units. 
+ +- they are exposed in CUDA C++ through warp-level matrix APIs such as `nvcuda::wmma` +- they are exposed in PTX through matrix instructions such as `wgmma` +- they are most relevant when the computation naturally maps to small matrix tiles and supported types/layouts + +If a kernel is written using ordinary scalar or vector arithmetic, it is usually on the ordinary SM arithmetic path rather than the Tensor Core path. + +## Divergence And Utilization + +Ordinary arithmetic kernels often lose efficiency because of: + +- warp divergence +- uncoalesced memory access +- bank conflicts +- low occupancy or long dependency chains + +Tensor Core kernels add extra constraints: + +- warp-wide participation +- shape / layout / alignment restrictions +- staging and synchronization overhead around fragments or async pipelines + +## Rule Of Thumb + +- generic elementwise, reduction, indexing-heavy, and control-heavy kernels usually live on the ordinary arithmetic path +- dense matrix-multiply-like kernels are the main candidates for Tensor Core acceleration + +## Related Topics + +- CUDA Core path: `../cuda-core/DOC.md` +- Compute throughput model: `../compute-throughput/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Tensor Core API usage: `../tensor-cores/DOC.md` +- WMMA kernel patterns: `../wmma-kernel-patterns/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, programming model and warps: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/01-introduction/programming-model.html +- CUDA Programming Guide, SIMT execution model: https://docs.nvidia.com/cuda/cuda-programming-guide/03-advanced/advanced-kernel-programming.html +- Turing Tuning Guide, SM scheduling and execution resources: https://docs.nvidia.com/cuda/archive/12.4.0/turing-tuning-guide/index.html + +Last cross-check 
date: 2026-03-20 diff --git a/content/cuda/docs/fallback-strategies-and-capability-detection/DOC.md b/content/cuda/docs/fallback-strategies-and-capability-detection/DOC.md new file mode 100644 index 00000000..fa0fe900 --- /dev/null +++ b/content/cuda/docs/fallback-strategies-and-capability-detection/DOC.md @@ -0,0 +1,63 @@ +--- +name: fallback-strategies-and-capability-detection +description: "CUDA capability detection and fallback essentials: feature probes, architecture guards, and safe runtime degradation paths." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,capability-detection,fallback,feature-probe,sm-version,graceful-degradation,runtime-guards" +--- + +# Fallback Strategies And Capability Detection (C++) + +Use this page when kernels depend on architecture-specific features (Tensor Cores, clusters, async paths, etc.). + +## Capability Detection + +Query device properties at runtime and gate features explicitly. + +Typical inputs: + +- compute capability (SM version) +- shared-memory limits +- cooperative/cluster support +- peer access/topology capabilities + +Do not infer support from GPU name strings. + +## Fallback Hierarchy + +Define ordered execution paths: + +1. preferred fast path (feature-rich) +2. compatible optimized fallback +3. conservative correctness fallback + +All paths should be tested; fallback code is production code. 
+ +## Guardrail Principles + +- fail fast for unsupported required features +- degrade gracefully for optional accelerations +- log selected path for observability and debugging + +## Common Mistakes + +- fallback exists but is untested +- path selection logic diverges from documented requirements +- silent fallback causes unnoticed performance regressions + +## Related Topics + +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Multi-GPU and peer access: `../multi-gpu-and-peer-access/DOC.md` +- Production readiness checklist: `../production-readiness-checklist/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Runtime API, device property query interfaces: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html +- CUDA C++ Programming Guide, architecture/capability-dependent feature context: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/fused-kernel-design-patterns/DOC.md b/content/cuda/docs/fused-kernel-design-patterns/DOC.md new file mode 100644 index 00000000..ec01fb56 --- /dev/null +++ b/content/cuda/docs/fused-kernel-design-patterns/DOC.md @@ -0,0 +1,75 @@ +--- +name: fused-kernel-design-patterns +description: "CUDA fused-kernel design essentials: when fusion helps, when it hurts, and practical patterns for memory-traffic reduction." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,fusion,fused-kernel,memory-traffic,register-pressure,launch-overhead,epilogue-fusion" +--- + +# CUDA Fused-Kernel Design Patterns (C++) + +Use this page when deciding whether to combine multiple operations into one kernel. 
+ +## Why Fusion Helps + +Fusion can improve performance by: + +- reducing global-memory round trips +- reducing kernel-launch overhead +- keeping intermediate values in registers/shared memory + +## Why Fusion Can Hurt + +Over-fusion can degrade performance due to: + +- register pressure and spills +- lower occupancy +- larger instruction footprint +- harder scheduling and poorer maintainability + +Fusion is beneficial only when memory/launch savings outweigh these costs. + +## Common Fusion Patterns + +- elementwise chain fusion (A->B->C) +- reduction + lightweight post-processing +- GEMM epilogue fusion (bias/add/activation) +- load-transform-store pipelines with shared-memory staging + +## Practical Decision Rule + +Fuse when: + +- intermediate tensors are large +- extra kernel boundaries dominate runtime +- the fused kernel remains resource-balanced + +Do not fuse when: + +- each op is already compute-heavy and well-optimized +- fusion introduces high register pressure or complex control divergence + +## Validation Workflow + +1. benchmark unfused baseline. +2. fuse one boundary at a time. +3. profile register usage, spills, occupancy, and bandwidth. +4. keep fusion only where end-to-end latency improves. 
+ +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Occupancy: `../occupancy/DOC.md` +- Launch bounds and registers: `../launch-bounds-and-registers/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory and launch optimization context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, execution and memory behavior background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/incident-response-and-rollback-playbook/DOC.md b/content/cuda/docs/incident-response-and-rollback-playbook/DOC.md new file mode 100644 index 00000000..adcfc788 --- /dev/null +++ b/content/cuda/docs/incident-response-and-rollback-playbook/DOC.md @@ -0,0 +1,68 @@ +--- +name: incident-response-and-rollback-playbook +description: "CUDA incident-response essentials: triage, rollback criteria, mitigation levers, and post-incident hardening steps." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,incident,response,rollback,mitigation,triage,oncall,postmortem" +--- + +# Incident Response And Rollback Playbook (C++) + +Use this page when a CUDA optimization regresses latency, correctness, or stability in production. + +## Fast Triage Checklist + +1. identify blast radius (which models/tasks/hardware). +2. classify failure mode (correctness, latency, crash, OOM, timeout). +3. isolate recent kernel/config/toolchain changes. +4. determine safe rollback target. + +## Rollback Criteria + +Rollback immediately when: + +- correctness deviations exceed policy +- crash rate or timeout rate breaches SLO +- latency regression is severe and sustained + +Do not wait for perfect root-cause certainty before restoring service. 
+ +## Mitigation Levers + +- disable risky fast paths via feature flags +- switch to known-safe kernel variant +- reduce batch size or concurrency temporarily +- force conservative precision/mode where necessary + +## Evidence Collection + +- capture failing inputs and minimal repro shapes +- record selected kernel path/capability info +- collect timeline + kernel profiles for before/after comparison + +## Post-Incident Hardening + +- add regression tests for the triggering pattern +- add rollout guardrails (canary, staged enablement) +- improve observability for path-selection and error counters +- document lessons and owner actions + +## Related Topics + +- Production readiness checklist: `../production-readiness-checklist/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- NVTX profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Fallback strategies: `../fallback-strategies-and-capability-detection/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide (verification + optimization workflow context): https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- Nsight Systems / Nsight Compute docs for triage instrumentation: + - https://docs.nvidia.com/nsight-systems/UserGuide/index.html + - https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/input-shape-specialization-and-autotuning/DOC.md b/content/cuda/docs/input-shape-specialization-and-autotuning/DOC.md new file mode 100644 index 00000000..575e5ce2 --- /dev/null +++ b/content/cuda/docs/input-shape-specialization-and-autotuning/DOC.md @@ -0,0 +1,60 @@ +--- +name: input-shape-specialization-and-autotuning +description: "CUDA shape specialization and autotuning essentials: variant spaces, compile/runtime dispatch, and robust tuning workflows." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,autotuning,shape-specialization,dispatch,variant-selection,tile-size,benchmarking" +--- + +# Input Shape Specialization And Autotuning (C++) + +Use this page when one kernel configuration cannot serve all input shapes efficiently. + +## Why Specialization Is Needed + +Kernel performance often depends on: + +- shape geometry +- stride/layout +- precision mode +- architecture/resource limits + +A single static launch/config choice is usually suboptimal across broad workloads. + +## Specialization Strategies + +- compile-time variants for known shape classes +- runtime dispatch by shape buckets +- autotuned parameter sets (tile sizes, block sizes, staging depth) + +Keep variant count bounded to control maintenance overhead. + +## Autotuning Workflow + +1. define search space (block/tile/stage variants). +2. benchmark representative shape corpus. +3. store winning config per shape bucket and hardware class. +4. validate correctness and stability of selected variants. 
+ +## Robustness Rules + +- never tune on one micro-benchmark only +- include tail shapes and borderline sizes +- preserve safe fallback when no tuned profile matches + +## Related Topics + +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Fused kernel patterns: `../fused-kernel-design-patterns/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, empirical optimization guidance: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, launch/resource model background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/kernel-api-design-guidelines/DOC.md b/content/cuda/docs/kernel-api-design-guidelines/DOC.md new file mode 100644 index 00000000..02e795d1 --- /dev/null +++ b/content/cuda/docs/kernel-api-design-guidelines/DOC.md @@ -0,0 +1,67 @@ +--- +name: kernel-api-design-guidelines +description: "CUDA kernel API design essentials: parameter contracts, shape/stride conventions, launch invariants, and forward-compatible interface choices." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,api-design,shape,stride,contracts,launch-invariants,interface,maintainability" +--- + +# CUDA Kernel API Design Guidelines (C++) + +Use this page when defining or refactoring kernel-facing interfaces for long-term maintainability. + +## Interface Contracts First + +Document and enforce: + +- tensor shape expectations +- stride/layout assumptions +- alignment requirements +- supported dtype/precision combinations + +Unstated assumptions become production bugs. + +## Parameter Design + +Prefer explicit parameters over hidden globals: + +- dimensions (`n`, `h`, `w`, etc.) 
+- leading dimensions/strides +- flags that affect algorithmic paths + +Keep argument ordering stable and predictable across related kernels. + +## Launch Invariants + +Define launch invariants close to API: + +- valid block size range +- shared-memory requirements +- grid coverage model + +Validate invariants early in host code where possible. + +## Versioning Mindset + +If a kernel API is reused across modules: + +- avoid breaking parameter semantics silently +- add new fields/options in backward-compatible ways +- keep deprecation path explicit + +## Related Topics + +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, kernel launch and execution model background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, software design and optimization workflow context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/kernel-bottleneck-diagnosis-workflow/DOC.md b/content/cuda/docs/kernel-bottleneck-diagnosis-workflow/DOC.md new file mode 100644 index 00000000..9417333f --- /dev/null +++ b/content/cuda/docs/kernel-bottleneck-diagnosis-workflow/DOC.md @@ -0,0 +1,83 @@ +--- +name: kernel-bottleneck-diagnosis-workflow +description: "Kernel bottleneck diagnosis workflow: classify memory-bound vs compute-bound vs launch-bound, then choose targeted optimization paths." 
+metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,bottleneck,diagnosis,workflow,memory-bound,compute-bound,launch-bound,profiling,nsight" +--- + +# Kernel Bottleneck Diagnosis Workflow (C++) + +Use this page when you need a repeatable way to decide which optimization direction is actually relevant. + +## Classification First + +Classify each hot kernel into one of three primary classes: + +- memory-bound +- compute-bound +- launch/orchestration-bound + +Do this with profiling evidence, not intuition. + +## Evidence Signals + +Memory-bound indicators: + +- high memory-pipeline utilization with low arithmetic utilization +- strong sensitivity to coalescing/layout changes + +Compute-bound indicators: + +- arithmetic pipeline pressure dominates +- throughput improves mainly with instruction-mix or scheduling improvements + +Launch-bound indicators: + +- many short kernels +- significant CPU/launch overhead and weak overlap + +## Optimization Routing + +If memory-bound: + +- prioritize coalescing, reuse, layout, and staging fixes. + +If compute-bound: + +- optimize instruction mix, occupancy/ILP balance, and path selection (CUDA Core vs Tensor Core). + +If launch-bound: + +- reduce launch count, fuse kernels where valid, and evaluate CUDA Graphs. + +## Guardrails + +- Reclassify after each major optimization; bottleneck class can change. +- Keep correctness and numerical checks active during performance iteration. +- Record profiler snapshots per step to avoid regression ambiguity. 
+ +## Related Topics + +- Performance debugging: `../performance-debugging/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Memory-bound optimization playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound optimization playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound optimization playbook: `../launch-bound-optimization-playbook/DOC.md` +- Nsight metrics interpretation cheatsheet: `../nsight-metrics-interpretation-cheatsheet/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/launch-bound-optimization-playbook/DOC.md b/content/cuda/docs/launch-bound-optimization-playbook/DOC.md new file mode 100644 index 00000000..fbc3a7ee --- /dev/null +++ b/content/cuda/docs/launch-bound-optimization-playbook/DOC.md @@ -0,0 +1,64 @@ +--- +name: launch-bound-optimization-playbook +description: "Launch-bound optimization playbook: reducing launch overhead, improving overlap, and deciding when to use fusion or CUDA Graphs." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,launch-bound,optimization,launch-overhead,cuda-graphs,fusion,stream-overlap,orchestration" +--- + +# Launch-Bound Optimization Playbook (C++) + +Use this page when many short kernels or orchestration overhead dominate runtime. + +## Primary Objectives + +- Reduce launch overhead. +- Increase useful overlap between copy and compute. +- Simplify scheduling structure for repeated execution patterns. + +## High-Impact Levers + +- Reduce kernel launch count where semantically safe. +- Apply kernel fusion when it improves end-to-end cost. +- Evaluate CUDA Graphs for repetitive execution DAGs. +- Improve stream/event structure to avoid accidental serialization. + +## Triage Sequence + +1. Confirm launch/orchestration bottleneck in timeline profiling. +2. Identify high-frequency short kernels and synchronization hotspots. +3. Test fusion and graph capture candidates. +4. Reprofile overlap and CPU-side launch cost. + +## Common Failure Modes + +- Fusion increases register pressure and hurts throughput. +- Graph capture applied to highly dynamic control flow without clear gain. +- Stream dependencies unintentionally serialize work. + +## Verification Checklist + +- CPU launch overhead decreases. +- Timeline overlap improves. +- Overall runtime drops on production traces, not just micro-tests. 
+ +## Related Topics + +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- NVTX and profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Graphs programming guidance: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/launch-bounds-and-registers/DOC.md b/content/cuda/docs/launch-bounds-and-registers/DOC.md new file mode 100644 index 00000000..cfa16467 --- /dev/null +++ b/content/cuda/docs/launch-bounds-and-registers/DOC.md @@ -0,0 +1,69 @@ +--- +name: launch-bounds-and-registers +description: "CUDA launch bounds and register-pressure essentials: __launch_bounds__, occupancy tradeoffs, and spill-aware tuning." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,launch-bounds,__launch_bounds__,register-pressure,spills,occupancy,maxrregcount" +--- + +# CUDA Launch Bounds And Registers (C++) + +Use this page when kernel performance depends on register pressure and block residency. + +## What `__launch_bounds__` Does + +`__launch_bounds__(maxThreadsPerBlock, minBlocksPerMultiprocessor)` gives the compiler launch-time assumptions. + +- `maxThreadsPerBlock` constrains the intended block size +- optional `minBlocksPerMultiprocessor` asks the compiler to keep enough resources for a target block residency + +This can change register allocation decisions and instruction scheduling. + +## Why It Matters + +Register pressure directly affects occupancy. 
+ +- too many registers per thread can reduce active blocks/warps +- too few registers can cause spills to local memory + +So tuning is a balance: occupancy gain versus spill cost. + +## Practical Tuning Pattern + +1. Start from correctness and baseline performance. +2. Inspect occupancy and local-memory traffic in Nsight Compute. +3. Try `__launch_bounds__` with realistic block sizes. +4. Re-measure runtime, spills, and achieved occupancy. +5. Keep the setting only if end-to-end time improves. + +## `-maxrregcount` Caution + +Compiler flag `-maxrregcount` can cap registers globally, but it is blunt. + +- it may improve occupancy +- it can also increase spills and hurt performance + +Prefer targeted kernel-level tuning (`__launch_bounds__`) before applying global caps. + +## Common Mistakes + +- optimizing for occupancy percentage alone +- forcing low register count without checking spill metrics +- setting launch bounds that do not match actual launch configuration + +## Related Topics + +- Occupancy tuning: `../occupancy/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, launch bounds: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, occupancy and execution model discussion: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/memory-bound-kernel-optimization-playbook/DOC.md b/content/cuda/docs/memory-bound-kernel-optimization-playbook/DOC.md new file mode 100644 index 00000000..944dd57e --- /dev/null +++ b/content/cuda/docs/memory-bound-kernel-optimization-playbook/DOC.md @@ -0,0 +1,64 @@ +--- +name: memory-bound-kernel-optimization-playbook +description: "Memory-bound kernel optimization playbook: coalescing, cache locality, shared-memory staging, and 
bandwidth-focused validation." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,memory-bound,optimization,coalescing,cache,shared-memory,bandwidth,staging,latency" +--- + +# Memory-Bound Kernel Optimization Playbook (C++) + +Use this page after profiling confirms the kernel is limited by memory movement instead of arithmetic throughput. + +## Primary Objectives + +- Increase effective bandwidth. +- Reduce wasted traffic. +- Improve locality and access regularity. + +## High-Impact Levers + +- Coalesced global-memory access. +- Reuse through registers/shared memory. +- Shared-memory layouts that avoid severe bank conflicts. +- Data-layout changes that reduce strided/scattered loads. + +## Triage Sequence + +1. Validate coalescing quality for major tensors. +2. Check L1/L2 reuse opportunity and cache-policy behavior. +3. Add or improve shared-memory staging for high-reuse tiles. +4. Recheck occupancy/register pressure after staging changes. + +## Common Failure Modes + +- Correct staging logic but poor layout (bank conflicts dominate). +- More shared memory with no reuse gain (occupancy drops, throughput worsens). +- Overly complex index math adds latency and defeats memory gains. + +## Verification Checklist + +- Achieved bandwidth increases in profiler metrics. +- Memory-related warp stalls decrease in hot sections. +- Total runtime improves on representative production shapes. 
+ +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Cache behavior: `../cache-behavior-and-access-policy/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory optimizations: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ +- CUDA C++ Programming Guide, memory hierarchy and access behavior: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/memory-fences-and-ordering/DOC.md b/content/cuda/docs/memory-fences-and-ordering/DOC.md new file mode 100644 index 00000000..7897149d --- /dev/null +++ b/content/cuda/docs/memory-fences-and-ordering/DOC.md @@ -0,0 +1,86 @@ +--- +name: memory-fences-and-ordering +description: "CUDA memory-ordering essentials: weak ordering, __threadfence* scopes, visibility vs ordering, and fence-based handoff patterns." +metadata: + languages: "cpp" + versions: "12.6" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,memory-ordering,memory-fence,__threadfence,__threadfence_block,__threadfence_system,visibility,volatile" +--- + +# CUDA Memory Fences And Ordering (C++) + +Use this page when kernels communicate through memory and correctness depends on ordering rather than just synchronization. + +## Weak Ordering + +CUDA uses a weakly ordered memory model. 
+ +- two unsynchronized threads reading and writing the same location create a data race +- memory fences enforce ordering of a thread's memory operations +- fences do not automatically provide block-wide participation like `__syncthreads()` + +## Fence Scope Variants + +CUDA provides three common fence scopes: + +- `__threadfence_block()` +- `__threadfence()` +- `__threadfence_system()` + +Roughly: + +- block scope: ordering relevant to the calling block +- device scope: ordering relevant across the device +- system scope: ordering visible to host threads and peer devices as well + +## Ordering vs Visibility + +This distinction matters: + +- fences order memory operations by the calling thread +- barriers coordinate participating threads +- visibility to observers may still require the right memory access path and synchronization pattern + +In other words, a fence is not a replacement for `__syncthreads()`. + +## Typical Pattern + +Producer-consumer handoff across blocks often looks like: + +1. producer writes data +2. producer executes `__threadfence()` +3. producer updates a flag or counter atomically +4. consumer observes the flag and then reads the data + +Without the fence, the flag can become visible before the data it is meant to publish. + +## Choosing The Scope + +- same block only: usually `__threadfence_block()` or a block barrier pattern +- different blocks on the same device: typically `__threadfence()` +- host or peer-device observers: `__threadfence_system()` + +Choose the narrowest scope that matches the communication pattern. 
+ +## Common Mistakes + +- assuming atomics alone solve all ordering problems +- using `__threadfence()` when a block-local barrier is the real need +- forgetting that fences do not synchronize other threads +- using device-wide or system-wide fences more broadly than necessary + +## Related Topics + +- Synchronization rules: `../synchronization/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, memory fence functions: https://docs.nvidia.com/cuda/archive/12.6.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, historical fence examples and ordering discussion: https://docs.nvidia.com/cuda/archive/11.5.0/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/memory-hierarchy/DOC.md b/content/cuda/docs/memory-hierarchy/DOC.md new file mode 100644 index 00000000..d10803be --- /dev/null +++ b/content/cuda/docs/memory-hierarchy/DOC.md @@ -0,0 +1,115 @@ +--- +name: memory-hierarchy +description: "CUDA memory hierarchy essentials: registers, local, shared, global, constant, and texture/read-only paths." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,memory-hierarchy,registers,local-memory,shared-memory,global-memory,constant-memory,texture-memory" +--- + +# CUDA Memory Hierarchy (C++) + +Use this page to decide which CUDA memory space fits a kernel access pattern. 
+ +## The Main Spaces + +- registers: fastest per-thread storage, but limited +- local memory: per-thread memory in device memory, commonly used for spills or large automatic objects +- shared memory: on-chip storage shared by threads in a block +- global memory: large device memory visible across kernels and blocks +- constant memory: cached read-only storage, especially effective when many threads read the same location +- texture/read-only path: cached read-only access path that can help some spatial access patterns + +## Registers + +Registers are the first-choice storage for hot per-thread values. + +- lowest-latency storage for thread-private temporaries +- high register pressure can reduce occupancy +- if the compiler runs out of registers, values may spill to local memory + +## Local Memory + +Despite the name, local memory is not on-chip shared scratchpad memory. + +- it is private to one thread +- it resides in device memory +- it often appears when large automatic arrays are used or when register pressure causes spills + +If a kernel unexpectedly slows down, local-memory traffic is often a sign that register use is too high. + +## Shared Memory + +Shared memory is the standard block-level scratchpad. + +- shared by threads in one block +- useful for data reuse, tiling, transpose, and reduction +- requires explicit synchronization when threads communicate through it +- performance depends on avoiding bank conflicts + +See `../shared-memory/DOC.md` for the detailed usage rules. + +## Global Memory + +Global memory is the default large device memory space. + +- visible to all threads and across kernel launches +- highest capacity among device spaces +- much slower than on-chip storage +- performance depends heavily on coalesced access patterns + +See `../coalescing/DOC.md` for access-pattern guidance. + +## Constant Memory + +Constant memory is read-only from device code and is cached. 
+ +- best when many threads read the same address +- not a substitute for shared memory +- useful for broadcast-like parameters or small read-only tables + +## Texture / Read-Only Path + +Texture and read-only cached access paths can help when: + +- access is read-only +- accesses exhibit spatial locality, even when the access order is irregular +- the pattern is not ideal for standard coalesced global loads + +Do not default to texture memory for ordinary linear arrays; it is a pattern-specific tool. + +## Selection Heuristics + +- value reused only by one thread: registers first +- value reused by many threads in one block: shared memory +- large tensor or array visible across blocks: global memory +- small read-only broadcast table: constant memory +- read-only data with spatial locality but irregular access order: texture/read-only path + +## Practical Warnings + +- local memory is usually a warning sign, not a target optimization space +- shared memory helps only when reuse or reordering outweighs its setup and sync cost +- high occupancy alone does not guarantee fast memory behavior +- coalescing and bank conflicts often matter more than raw memory-space choice + +## Related Topics + +- Shared memory details: `../shared-memory/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Coalesced access patterns: `../coalescing/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- PTX state spaces: `../ptx/references/state-spaces-and-types.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, programming model and memory overview: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, device memory space specifiers: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#device-memory-space-specifiers +- CUDA C++ Programming Guide, local memory discussion: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- 
CUDA C++ Programming Guide, shared memory: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#shared + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/multi-gpu-and-peer-access/DOC.md b/content/cuda/docs/multi-gpu-and-peer-access/DOC.md new file mode 100644 index 00000000..3cd3a878 --- /dev/null +++ b/content/cuda/docs/multi-gpu-and-peer-access/DOC.md @@ -0,0 +1,74 @@ +--- +name: multi-gpu-and-peer-access +description: "CUDA multi-GPU essentials: device selection, peer access (P2P), topology constraints, and cross-device synchronization basics." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,multi-gpu,peer-access,p2p,cudaDeviceEnablePeerAccess,cudaMemcpyPeerAsync,topology,nvlink" +--- + +# CUDA Multi-GPU And Peer Access (C++) + +Use this page for process-level multi-GPU programming and direct device-to-device data movement. + +## Device Selection Basics + +Multi-GPU programs typically: + +1. query device count and capabilities +2. assign work partitions per device +3. set active device with `cudaSetDevice` +4. create per-device streams/resources + +Avoid frequent device switching in tight host loops unless necessary. + +## Peer Access (P2P) + +Peer access allows one GPU to access memory on another GPU directly when topology and capability permit it. + +Core APIs: + +- `cudaDeviceCanAccessPeer` +- `cudaDeviceEnablePeerAccess` +- `cudaMemcpyPeerAsync` + +Always check capability before enabling peer access. + +## Why P2P Matters + +When supported, P2P can reduce host staging overhead for inter-GPU exchange. + +Performance depends on topology: + +- NVLink-connected peers often outperform PCIe-only paths +- some GPU pairs may not support peer access at all + +## Synchronization Notes + +Cross-device workflows still need explicit ordering and synchronization. 
+ +- use stream/event patterns per device +- avoid global sync unless required +- ensure destination-side readiness before kernel consumption + +## Common Mistakes + +- assuming all GPU pairs support P2P +- forgetting to set the correct active device before API calls +- building one global stream strategy across devices without per-device ownership + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, multi-device and peer access: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA Runtime API, peer-device memory access APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/nsight-metrics-interpretation-cheatsheet/DOC.md b/content/cuda/docs/nsight-metrics-interpretation-cheatsheet/DOC.md new file mode 100644 index 00000000..a562ba37 --- /dev/null +++ b/content/cuda/docs/nsight-metrics-interpretation-cheatsheet/DOC.md @@ -0,0 +1,53 @@ +--- +name: nsight-metrics-interpretation-cheatsheet +description: "Nsight metrics interpretation cheatsheet: practical mapping from common metric patterns to likely bottleneck classes and next actions." +metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,nsight,metrics,profiling,interpretation,warp-stalls,occupancy,bandwidth,bottleneck" +--- + +# Nsight Metrics Interpretation Cheatsheet (C++) + +Use this page for fast mapping from profiler symptoms to likely root causes and next steps. + +## Symptom To Action Map + +- High memory pressure + low arithmetic utilization: + likely memory-bound, prioritize coalescing/layout/reuse. +- Low issue efficiency + dependency-heavy stalls: + likely compute-bound scheduling/dependency bottleneck. 
+- Many short kernels + high CPU orchestration share: + likely launch-bound, evaluate fusion/graphs/overlap changes. + +## Warp Stall Reading Rules + +- Treat stall reasons as supporting evidence, not standalone truth. +- Interpret stall categories together with achieved throughput and occupancy. +- Re-check after each optimization stage because dominant stalls can shift. + +## Minimal Workflow + +1. Timeline classify (Nsight Systems). +2. Kernel-level metrics drilldown (Nsight Compute). +3. Route to memory/compute/launch playbook. +4. Reprofile and confirm bottleneck shift. + +## Related Topics + +- Performance debugging: `../performance-debugging/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Memory-bound playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound playbook: `../launch-bound-optimization-playbook/DOC.md` + +## Official Source Links (Fact Check) + +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/numerics-and-precision/DOC.md b/content/cuda/docs/numerics-and-precision/DOC.md new file mode 100644 index 00000000..671db38f --- /dev/null +++ b/content/cuda/docs/numerics-and-precision/DOC.md @@ -0,0 +1,74 @@ +--- +name: numerics-and-precision +description: "CUDA numerics and precision essentials: FP16/BF16/TF32 behavior, accumulation choices, and stability-aware kernel design." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,numerics,precision,fp16,bf16,tf32,accumulation,rounding,tensor-cores" +--- + +# CUDA Numerics And Precision (C++) + +Use this page when correctness and performance depend on precision mode choices. + +## Precision Choices Matter + +CUDA kernels often trade off: + +- throughput +- memory footprint +- numeric stability + +Common formats include FP32, FP16, BF16, and TF32 (Tensor Core-oriented math mode). + +## Storage Type vs Accumulation Type + +A robust pattern is mixed precision: + +- store inputs in lower precision (for bandwidth / throughput) +- accumulate in higher precision (for stability) + +Example direction: + +- FP16/BF16 inputs with FP32 accumulation for reductions and GEMM-like operations. + +## Tensor Core Precision Modes + +Tensor Core paths can use type-specific behavior (for example TF32/FP16/BF16 combinations depending on architecture and library mode). + +When enabling Tensor Core math modes: + +- verify expected numeric tolerance +- compare against a high-precision baseline +- record configuration to keep benchmark results reproducible + +## Common Instability Patterns + +- long reductions in low precision +- subtractive cancellation with similar-magnitude values +- iterative algorithms without periodic re-normalization + +## Practical Guardrails + +1. define accuracy targets first (absolute/relative tolerance). +2. choose accumulation precision before micro-optimizing. +3. test on representative dynamic ranges, not only random unit-scale inputs. +4. keep a reference path (often FP32 accumulation) for regression checks. 
+ +## Related Topics + +- Tensor Core usage: `../tensor-cores/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, floating-point and mixed precision behavior: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, WMMA/Tensor Core precision context: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/nvtx-and-profiling-workflow/DOC.md b/content/cuda/docs/nvtx-and-profiling-workflow/DOC.md new file mode 100644 index 00000000..ab838296 --- /dev/null +++ b/content/cuda/docs/nvtx-and-profiling-workflow/DOC.md @@ -0,0 +1,58 @@ +--- +name: nvtx-and-profiling-workflow +description: "CUDA NVTX and profiling workflow essentials: annotation strategy, Nsight Systems correlation, and handoff to Nsight Compute." +metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,nvtx,profiling,nsight-systems,nsight-compute,annotation,timeline" +--- + +# NVTX And Profiling Workflow (C++) + +Use this page for a repeatable profiling workflow across host code and CUDA kernels. + +## Why NVTX First + +NVTX markers make timeline analysis actionable. + +- they label logical phases in host code +- Nsight Systems can correlate those ranges with stream activity and kernel launches +- this reduces guesswork before deep kernel-level profiling + +## Recommended Workflow + +1. add NVTX ranges around pipeline phases. +2. run Nsight Systems to identify timeline bottlenecks. +3. select top kernels from the timeline. +4. run Nsight Compute for per-kernel microanalysis. 
+ +This avoids premature micro-optimization of non-critical kernels. + +## Annotation Guidelines + +- annotate coarse phases first (data load, preprocess, compute, postprocess) +- add finer ranges only where needed +- keep naming stable across runs for easy diffing + +## Common Mistakes + +- profiling kernels without timeline context +- over-annotating every tiny function +- changing workload shape between profiling runs + +## Related Topics + +- Performance debugging: `../performance-debugging/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- NVTX documentation: https://nvidia.github.io/NVTX/ +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/occupancy/DOC.md b/content/cuda/docs/occupancy/DOC.md new file mode 100644 index 00000000..c5f1b841 --- /dev/null +++ b/content/cuda/docs/occupancy/DOC.md @@ -0,0 +1,103 @@ +--- +name: occupancy +description: "CUDA occupancy essentials: active warps, launch configuration APIs, and the tradeoff with registers and shared memory." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,occupancy,launch-configuration,block-size,register-pressure,shared-memory,cudaOccupancyMaxPotentialBlockSize" +--- + +# CUDA Occupancy (C++) + +Use this page when tuning block size, shared memory size, or register usage and you need to reason about how many warps and blocks can stay active on an SM. + +## What Occupancy Means + +Occupancy is the ratio of active warps on an SM to the maximum supported warps on that SM. 
+ +In practice, occupancy is constrained by: + +- threads per block +- registers used per thread +- shared memory used per block +- architectural limits on blocks and warps per SM + +## Important Caveat + +Higher occupancy is not automatically better. + +- low occupancy can hurt latency hiding +- very high occupancy can be unnecessary if the kernel is already bandwidth-limited or instruction-efficient +- reducing registers just to raise occupancy can backfire if it causes spills to local memory + +Treat occupancy as a constraint and diagnostic, not a standalone optimization target. + +## Runtime APIs + +CUDA provides helper APIs for launch configuration: + +- `cudaOccupancyMaxActiveBlocksPerMultiprocessor` +- `cudaOccupancyMaxPotentialBlockSize` +- `cudaOccupancyMaxPotentialBlockSizeVariableSMem` + +Use them to estimate a reasonable starting block size based on register and shared-memory usage. + +Minimal pattern: + +```cpp +int minGridSize = 0; +int blockSize = 0; +cudaOccupancyMaxPotentialBlockSize( + &minGridSize, + &blockSize, + my_kernel, + 0, + 0); +``` + +This gives a good starting point, not a final answer. + +## What Usually Lowers Occupancy + +- large dynamic shared memory allocations +- high register pressure +- overly large block sizes +- cluster or architecture-specific launch constraints on newer GPUs + +## Practical Tuning Rules + +- start in the 128 to 256 threads-per-block range unless you have a strong reason otherwise +- prefer a multiple of warp size +- if a kernel frequently calls `__syncthreads()`, several smaller blocks can outperform one very large block +- if reducing block size barely changes runtime, the kernel may not be occupancy-limited + +## Common Misread + +If performance is poor, ask these in order: + +1. Is memory access coalesced? +2. Are there bank conflicts? +3. Is there divergence? +4. Is occupancy actually the limiting factor? + +Very often, memory behavior matters more than squeezing out a few more active warps. 
+ +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- Compute throughput model: `../compute-throughput/DOC.md` +- Shared memory constraints: `../shared-memory/DOC.md` +- Memory hierarchy overview: `../memory-hierarchy/DOC.md` +- Synchronization behavior: `../synchronization/DOC.md` +- Coalesced global memory access: `../coalescing/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, occupancy calculator APIs: https://docs.nvidia.com/cuda/archive/11.8.0/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, thread/block sizing guidance: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Driver API occupancy reference: https://docs.nvidia.com/cuda/archive/11.4.4/cuda-driver-api/group__CUDA__OCCUPANCY.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/performance-debugging/DOC.md b/content/cuda/docs/performance-debugging/DOC.md new file mode 100644 index 00000000..7c1d1624 --- /dev/null +++ b/content/cuda/docs/performance-debugging/DOC.md @@ -0,0 +1,100 @@ +--- +name: performance-debugging +description: "CUDA performance debugging essentials: when to use Nsight Systems vs Nsight Compute, key metrics, and how to read warp stalls." +metadata: + languages: "cpp" + versions: "2024.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,performance-debugging,nsight-compute,nsight-systems,warp-stalls,occupancy,bandwidth,profiling" +--- + +# CUDA Performance Debugging (C++) + +Use this page when a kernel is correct but slow and you need to decide what to profile first. 
+ +## First Tool Choice + +Use the tools for different questions: + +- Nsight Systems: timeline, host/device orchestration, overlap, streams, events, graph behavior +- Nsight Compute: per-kernel metrics, throughput, occupancy, warp stalls, memory behavior + +If you do not yet know whether the problem is on the host side or inside the kernel, start with Nsight Systems. + +## Nsight Systems + +Use Nsight Systems when you need to answer: + +- are streams actually overlapping? +- are copies blocking kernels? +- is the CPU launch path the bottleneck? +- are events or graphs introducing serialization? + +NVTX ranges are useful here for relating CPU regions to CUDA activity. + +## Nsight Compute + +Use Nsight Compute when you need to answer: + +- is the kernel memory-bound or compute-bound? +- is occupancy too low? +- are schedulers issuing efficiently? +- what are the top warp stall reasons? + +Useful report sections include: + +- SpeedOfLight +- Occupancy +- SchedulerStats +- WarpStateStats + +## Reading Stall Reasons Carefully + +NVIDIA's profiling guide explicitly warns not to over-focus on stalls unless schedulers are failing to issue well. + +Examples: + +- high short-scoreboard stalls often point to shared-memory operations or similar MIO dependencies +- high barrier-related stalls often mean uneven work before synchronization +- high not-selected can simply indicate there are enough eligible warps + +So stall interpretation should follow, not replace, a top-level throughput diagnosis. + +## Practical Triage Order + +1. check total runtime structure with Nsight Systems +2. identify the expensive kernel(s) +3. inspect throughput, occupancy, and warp states in Nsight Compute +4. 
map the dominant issue back to code: + coalescing, bank conflicts, divergence, occupancy, or launch overhead + +## Related Topics + +- Occupancy tuning: `../occupancy/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Nsight metrics interpretation cheatsheet: `../nsight-metrics-interpretation-cheatsheet/DOC.md` +- Memory-bound optimization playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound optimization playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound optimization playbook: `../launch-bound-optimization-playbook/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- NVTX workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Cache behavior and access policy: `../cache-behavior-and-access-policy/DOC.md` + +## Official Source Links (Fact Check) + +- Nsight Systems User Guide: https://docs.nvidia.com/nsight-systems/UserGuide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html +- Older Nsight Compute profiling guide with stall explanations: https://docs.nvidia.com/nsight-compute/2022.4/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/persistent-kernels-and-work-queues/DOC.md b/content/cuda/docs/persistent-kernels-and-work-queues/DOC.md new file mode 
100644 index 00000000..1fbd90db --- /dev/null +++ b/content/cuda/docs/persistent-kernels-and-work-queues/DOC.md @@ -0,0 +1,62 @@ +--- +name: persistent-kernels-and-work-queues +description: "CUDA persistent-kernel essentials: resident worker model, device work queues, load balancing, and synchronization hazards." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,persistent-kernel,work-queue,load-balancing,atomics,producer-consumer,latency" +--- + +# CUDA Persistent Kernels And Work Queues (C++) + +Use this page for latency-sensitive or irregular workloads where one long-lived kernel processes dynamic work. + +## Persistent Kernel Model + +A persistent kernel keeps a fixed set of resident blocks/warps alive and repeatedly pulls tasks from a queue. + +This can reduce launch overhead and improve responsiveness for fine-grained dynamic work. + +## Typical Components + +- global/device work queue +- atomic enqueue/dequeue indices +- worker loop with termination protocol +- backoff or batching strategy for queue contention + +## Where It Helps + +- irregular task sizes +- real-time/low-latency pipelines +- workloads where kernel launch overhead is a large fraction of runtime + +## Where It Hurts + +- queue contention hotspots +- heavy atomic traffic +- poor fairness or starvation in naive dequeue policies +- over-occupying resources and blocking other kernels + +## Design Guardrails + +1. define clear producer/consumer ordering rules. +2. minimize global atomics per task (batch when possible). +3. bound queue contention with per-block or per-warp staging. +4. profile fairness and tail latency, not only average throughput. 
+ +## Related Topics + +- Sparse and irregular kernels: `../sparse-and-irregular-kernels/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` +- Streams/events and graphs: `../streams-and-events/DOC.md`, `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, synchronization/order primitives used by queue-based designs: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, launch overhead and memory/atomic considerations: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/pinned-memory-and-transfers/DOC.md b/content/cuda/docs/pinned-memory-and-transfers/DOC.md new file mode 100644 index 00000000..b08a1793 --- /dev/null +++ b/content/cuda/docs/pinned-memory-and-transfers/DOC.md @@ -0,0 +1,66 @@ +--- +name: pinned-memory-and-transfers +description: "CUDA pinned-memory and transfer essentials: page-locked host memory, async memcpy overlap, and transfer-path tuning." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,pinned-memory,page-locked,cudaHostAlloc,cudaMemcpyAsync,transfer-overlap,pcie" +--- + +# CUDA Pinned Memory And Transfers (C++) + +Use this page when host-device transfer performance or overlap is a bottleneck. + +## What Pinned Memory Is + +Pinned (page-locked) host memory is allocated with APIs such as: + +- `cudaHostAlloc` +- `cudaMallocHost` + +Because it is page-locked, the runtime can perform faster and more predictable DMA transfers. + +## Why It Matters + +`cudaMemcpyAsync` overlap with kernel execution generally requires: + +- non-default stream usage +- pinned host buffers for transfer endpoints + +Without pinned memory, many async-copy scenarios degrade to serialized behavior. 
+ +## Basic Pattern + +1. allocate pinned host buffers +2. launch `cudaMemcpyAsync(..., stream)` +3. launch kernels in suitable streams +4. synchronize with stream/event primitives, not global device sync + +## Tradeoffs + +- pinned memory improves transfer behavior +- but excessive pinning can hurt overall system memory behavior on the host + +Pin only hot buffers and reuse them. + +## Common Mistakes + +- assuming `cudaMemcpyAsync` always overlaps without checking buffer type +- mixing default-stream semantics and expecting full concurrency +- over-allocating pinned memory globally + +## Related Topics + +- Streams and events: `../streams-and-events/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, host-device transfer optimization: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Runtime API, host-memory management and async memcpy: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/production-readiness-checklist/DOC.md b/content/cuda/docs/production-readiness-checklist/DOC.md new file mode 100644 index 00000000..f155304f --- /dev/null +++ b/content/cuda/docs/production-readiness-checklist/DOC.md @@ -0,0 +1,72 @@ +--- +name: production-readiness-checklist +description: "CUDA production-readiness checklist: correctness gates, performance stability, observability, compatibility, and rollout safeguards." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,production,readiness,checklist,observability,compatibility,rollback,release-gates" +--- + +# CUDA Production Readiness Checklist (C++) + +Use this page before shipping optimized CUDA kernels to production environments. 
+ +## 1) Correctness Gates + +- reference-baseline comparison on representative datasets +- tolerance policy per precision mode +- stress tests for boundary sizes and adversarial shapes +- deterministic/reproducibility expectations documented + +## 2) Performance Gates + +- benchmark methodology fixed and repeatable +- p50/p95 latency and throughput baselines recorded +- regression thresholds defined and enforced in CI/perf jobs +- cold-start versus steady-state behavior measured + +## 3) Observability + +- NVTX ranges present for major pipeline phases +- key metrics exported (latency, error rates, fallback rate) +- profiler workflows documented for oncall debugging + +## 4) Compatibility + +- target `-gencode` matrix matches deployment fleet +- driver/toolkit compatibility validated +- fallback path behavior tested when preferred kernels are unavailable + +## 5) Operational Safety + +- feature flag or staged rollout strategy +- fast rollback path +- runtime guardrails for unexpected shapes/resource exhaustion + +## 6) Documentation Hygiene + +- kernel assumptions and constraints documented +- precision and determinism modes documented +- known limitations and troubleshooting notes linked + +## Related Topics + +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- NVTX profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Fallback strategies and capability detection: `../fallback-strategies-and-capability-detection/DOC.md` +- Incident response and rollback playbook: `../incident-response-and-rollback-playbook/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Compatibility documentation: https://docs.nvidia.com/deploy/cuda-compatibility/index.html +- Nsight Systems / Compute docs 
for observability workflows: + - https://docs.nvidia.com/nsight-systems/UserGuide/index.html + - https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/ptx-atomic-and-reduction-patterns/DOC.md b/content/cuda/docs/ptx-atomic-and-reduction-patterns/DOC.md new file mode 100644 index 00000000..9bbcd56f --- /dev/null +++ b/content/cuda/docs/ptx-atomic-and-reduction-patterns/DOC.md @@ -0,0 +1,68 @@ +--- +name: ptx-atomic-and-reduction-patterns +description: "PTX atomic and reduction patterns: atom/cas/red/redux usage, scope/semantic choices, and lock-free update templates." +metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,atomics,reduction,atom,atom.cas,compare-and-swap,cas,cas-loop,red,redux,scope,acquire,release,lock-free,lockfree" +--- + +# PTX Atomic and Reduction Patterns + +Use this page when designing concurrent PTX update paths with explicit scope and memory semantics. + +## Instruction Families + +- Atomic RMW: `atom.*` +- Compare-and-swap: `atom.cas` +- Reduction-update: `red.*` +- Warp/group reduction helper: `redux.sync` + +## Scope and Semantics First + +Correctness depends on selecting: + +- target state space (shared/global/cluster forms as supported) +- scope (`cta`, `cluster`, `gpu`, `sys` as applicable) +- semantics (relaxed/acquire/release/acq_rel where available) + +A wrong scope can appear correct in tests but fail under real concurrency. + +## Canonical Patterns + +- Lock-free queue/head update: + CAS loop with explicit acquire/release semantics. +- Aggregation path: + `red.*` for one-way accumulation where return value is not required. +- Predicate-guided lane aggregation: + warp-level reduction then fewer global atomics. + +## Failure Modes + +- Missing acquire/release pairing between producer and consumer. +- Overly wide scope adds contention and latency. 
+- Excessive global atomics with no local aggregation stage. + +## Verification Checklist + +- Stress under high contention and varied scheduling. +- Validate determinism policy (if required) separately from correctness. +- Profile contention hotspots and retry-loop pressure. + +## Related Topics + +- PTX synchronization instructions: `../ptx/instructions/sync-comm/DOC.md` +- PTX memory consistency model: `../ptx/references/memory-consistency-model.md` +- PTX warp synchronization patterns: `../ptx-warp-synchronization-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- PTX atom instruction family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom +- PTX red instruction family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-red +- PTX redux.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync +- PTX Memory Consistency Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/ptx-integer-bit-manipulation-patterns/DOC.md b/content/cuda/docs/ptx-integer-bit-manipulation-patterns/DOC.md new file mode 100644 index 00000000..29a0d417 --- /dev/null +++ b/content/cuda/docs/ptx-integer-bit-manipulation-patterns/DOC.md @@ -0,0 +1,57 @@ +--- +name: ptx-integer-bit-manipulation-patterns +description: "PTX integer and bit-manipulation patterns: logic/shift/select primitives, packing/unpacking strategies, and common correctness traps." 
+metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,integer,bit-manipulation,logic,shift,selp,lop3,bfe,bfi,popc,brev,prmt" +--- + +# PTX Integer and Bit-Manipulation Patterns + +Use this page for practical composition of PTX integer/logic instructions in performance-sensitive kernels. + +## Core Primitive Groups + +- Logic: `and`, `or`, `xor`, `not`, `lop3` +- Shift and funnel-shift: `shl`, `shr`, `shf` +- Bitfield extraction/insert: `bfe`, `bfi` +- Bit counting/permutation: `clz`, `popc`, `brev`, `prmt` +- Predicate-style selection: `selp`, `setp` + +## Common Composition Patterns + +- Use `setp + selp` for branchless integer clamps and conditional assignment. +- Use `bfe/bfi` for packed-field decode/encode instead of long mask chains. +- Use `lop3` to fuse multi-step boolean logic into fewer instructions. +- Use `popc` and `clz` for bitset analytics and index derivation. + +## Correctness Traps + +- Signed vs unsigned shift semantics (`shr.s*` vs `shr.u*`) change high-bit fill behavior. +- Type width mismatches silently change mask and overflow behavior. +- Packing/unpacking code must define bit positions and endianness assumptions explicitly. + +## Performance Heuristics + +- Prefer fewer dependent bit-ops in hot loops to reduce scoreboard pressure. +- Validate whether `lop3` or `prmt` reduces instruction count on target architecture. +- Recheck register pressure after replacing arithmetic with heavy bit-manipulation sequences. 
+ +## Related Topics + +- PTX integer instruction index: `../ptx/instructions/integer/DOC.md` +- PTX control flow: `../ptx/instructions/control-flow/DOC.md` +- PTX synchronization and communication: `../ptx/instructions/sync-comm/DOC.md` + +## Official Source Links (Fact Check) + +- PTX Integer Arithmetic Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions +- PTX Logic and Shift Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions +- PTX Comparison and Selection Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/ptx-mbarrier-protocol-patterns/DOC.md b/content/cuda/docs/ptx-mbarrier-protocol-patterns/DOC.md new file mode 100644 index 00000000..9eda6556 --- /dev/null +++ b/content/cuda/docs/ptx-mbarrier-protocol-patterns/DOC.md @@ -0,0 +1,57 @@ +--- +name: ptx-mbarrier-protocol-patterns +description: "PTX mbarrier protocol patterns: arrive/test_wait/arrive_drop flows, async-copy integration, and phase-safety rules." +metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,mbarrier,arrive,test_wait,arrive_drop,cp.async,cp-async-mbarrier-arrive,cp.async.wait_group,cp.async.wait_all,async-proxy,phase,completion-protocol,producer-consumer" +--- + +# PTX mbarrier Protocol Patterns + +Use this page for robust phase-based synchronization protocols around async copy/compute pipelines. + +## Core Operations + +- Producer-side phase signal: `mbarrier.arrive` +- Participant drop from future phases: `mbarrier.arrive_drop` +- Consumer-side wait/poll: `mbarrier.test_wait` / `mbarrier.try_wait` +- Async-copy completion bridge: `cp.async.mbarrier.arrive` + +## Protocol Template + +1. Initialize barrier state and participant expectations. +2. Issue producer operations (for example async copy). +3. 
Signal completion with appropriate arrive semantics. +4. Wait on consumer side before data use. +5. Advance phases safely and apply `arrive_drop` when participation changes. + +## Phase Safety Rules + +- Keep producer and consumer on the same phase contract. +- Respect no-complete restrictions for `.noComplete` variants. +- Use sink `_` rules correctly for remote cluster-only flows. +- Avoid mixing unrelated work into the same mbarrier protocol. + +## Common Failure Modes + +- Deadlock from mismatched participant counts. +- Premature consumer reads due to missing wait checks. +- Undefined behavior by allowing `.noComplete` variant to complete a phase. + +## Related Topics + +- PTX data-movement async references: `../ptx/instructions/data-movement/references/cp-async.md` +- PTX TMA instructions: `../ptx/instructions/tma/DOC.md` +- PTX synchronization instructions: `../ptx/instructions/sync-comm/DOC.md` + +## Official Source Links (Fact Check) + +- PTX mbarrier instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- PTX cp.async.mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +- PTX Asynchronous Operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/ptx-warp-synchronization-patterns/DOC.md b/content/cuda/docs/ptx-warp-synchronization-patterns/DOC.md new file mode 100644 index 00000000..8be6eb2d --- /dev/null +++ b/content/cuda/docs/ptx-warp-synchronization-patterns/DOC.md @@ -0,0 +1,65 @@ +--- +name: ptx-warp-synchronization-patterns +description: "PTX warp synchronization patterns: vote/shfl/match/elect/bar.warp.sync composition for warp-cooperative algorithms." 
+metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,ptx,warp,synchronization,shfl.sync,vote.sync,match.sync,elect.sync,bar.warp.sync,membermask" +--- + +# PTX Warp Synchronization Patterns + +Use this page for warp-cooperative control/data exchange patterns without escalating to CTA-wide barriers. + +## Key Warp-Level Primitives + +- `bar.warp.sync` +- `vote.sync` +- `shfl.sync` +- `match.sync` +- `elect.sync` + +## Practical Compositions + +- Warp reduction: + `shfl.sync` plus lane-conditional accumulation. +- Warp agreement checks: + `vote.sync` for any/all consensus. +- Key-based grouping: + `match.sync` for same-value subgrouping. +- Single-lane leadership: + `elect.sync` for representative-thread control logic. + +## Membermask Discipline + +Correctness depends on accurate `membermask` usage: + +- mask must match actual participating lanes on that control path +- mismatched masks can cause undefined or misleading results +- keep mask derivation stable across phases of the same protocol + +## Common Failure Modes + +- Divergent lanes use different masks for the same warp primitive. +- Lane index assumptions are invalid after control-flow divergence. +- Warp-level protocol accidentally used for cross-warp coordination. 
+ +## Related Topics + +- PTX synchronization instructions: `../ptx/instructions/sync-comm/DOC.md` +- PTX control flow: `../ptx/instructions/control-flow/DOC.md` +- CUDA warp primitives (C++ view): `../warp-primitives/DOC.md` + +## Official Source Links (Fact Check) + +- PTX shfl.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-shfl-sync +- PTX vote.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync +- PTX match.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-match-sync +- PTX elect.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-elect-sync +- PTX bar.warp.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-bar-warp-sync + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/ptx/DOC.md b/content/cuda/docs/ptx/DOC.md new file mode 100644 index 00000000..a44c7533 --- /dev/null +++ b/content/cuda/docs/ptx/DOC.md @@ -0,0 +1,118 @@ +--- +name: ptx +description: "NVIDIA PTX ISA 9.2 guide: instruction model, constraints, and architecture mapping." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,isa,gpu,assembly,nvidia,wmma,tensor-core,tensorcore,matrix-multiply,matrix-multiply-accumulate,shared-memory,cp.async,mbarrier,bank-conflict,swizzling" +--- + +# PTX ISA 9.2 Navigation + +This directory follows the PTX ISA 9.2 official documentation and provides executable, constrained, and traceable instruction semantics for agents. 
+ +## Coverage + +- Program model: thread hierarchy, state spaces, data types, functions, and ABI +- Instruction format: predicates, opcodes, type suffixes, modifiers, and operands +- Memory model: scope + semantics (relaxed/acquire/release) +- Instruction families: integer, floating point, data movement, control flow, synchronization, WGMMA, and TMA +- Special registers: `%tid`, `%ctaid`, `%smid`, and related registers + +## Recommended Reading Path + +1. `references/programming-model.md` +2. `references/state-spaces-and-types.md` +3. `references/instruction-format-and-operands.md` +4. `references/memory-consistency-model.md` +5. `references/abi-and-calling-convention.md` +6. `instructions/*/DOC.md` + +## Shared Memory Related Entry Points + +- CUDA C++ shared memory base entry: `../shared-memory/DOC.md` +- CUDA C++ Tensor Core entry: `../tensor-cores/DOC.md` +- CUDA execution model entry: `../execution-model/DOC.md` +- CUDA throughput model entry: `../compute-throughput/DOC.md` +- CUDA Core path entry: `../cuda-core/DOC.md` +- CUDA path-selection entry: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- CUDA WMMA patterns entry: `../wmma-kernel-patterns/DOC.md` +- CUDA Tensor Core pipeline entry: `../tensor-core-pipeline-patterns/DOC.md` +- CUDA async copy entry: `../async-copy/DOC.md` +- CUDA Cooperative Groups entry: `../cooperative-groups/DOC.md` +- CUDA Cluster / DSM entry: `../thread-block-clusters/DOC.md` +- CUDA stream/event entry: `../streams-and-events/DOC.md` +- CUDA fence/ordering entry: `../memory-fences-and-ordering/DOC.md` +- CUDA Graphs entry: `../cuda-graphs/DOC.md` +- CUDA performance diagnostics entry: `../performance-debugging/DOC.md` +- CUDA launch bounds/registers entry: `../launch-bounds-and-registers/DOC.md` +- CUDA Unified Memory entry: `../unified-memory/DOC.md` +- CUDA pinned transfer entry: `../pinned-memory-and-transfers/DOC.md` +- CUDA multi-GPU/P2P entry: `../multi-gpu-and-peer-access/DOC.md` +- CUDA Dynamic Parallelism entry: 
`../dynamic-parallelism/DOC.md` +- CUDA debug-build/error-handling entry: `../error-handling-and-debug-build/DOC.md` +- CUDA cuBLAS/cuDNN integration entry: `../cublas-cudnn-integration-patterns/DOC.md` +- CUDA NVTX profiling entry: `../nvtx-and-profiling-workflow/DOC.md` +- CUDA numerics/precision entry: `../numerics-and-precision/DOC.md` +- CUDA reproducibility entry: `../randomness-and-reproducibility/DOC.md` +- CUDA fused-kernel design entry: `../fused-kernel-design-patterns/DOC.md` +- CUDA build/ABI compatibility entry: `../build-and-abi-compatibility/DOC.md` +- CUDA sparse/irregular kernels entry: `../sparse-and-irregular-kernels/DOC.md` +- CUDA collective communication patterns entry: `../collective-communication-patterns/DOC.md` +- CUDA benchmarking methodology entry: `../benchmarking-methodology/DOC.md` +- CUDA regression testing/CI entry: `../regression-testing-and-ci/DOC.md` +- CUDA data-layout/alignment entry: `../data-layout-and-alignment/DOC.md` +- CUDA cache behavior entry: `../cache-behavior-and-access-policy/DOC.md` +- CUDA persistent-kernel/work-queue entry: `../persistent-kernels-and-work-queues/DOC.md` +- CUDA production readiness checklist entry: `../production-readiness-checklist/DOC.md` +- CUDA kernel API design entry: `../kernel-api-design-guidelines/DOC.md` +- CUDA shape-specialization/autotuning entry: `../input-shape-specialization-and-autotuning/DOC.md` +- CUDA capability-detection/fallback entry: `../fallback-strategies-and-capability-detection/DOC.md` +- CUDA incident-response/rollback entry: `../incident-response-and-rollback-playbook/DOC.md` +- `.shared` state-space reference: `references/state-spaces-and-types.md` +- `cp.async` reference: `instructions/data-movement/references/cp-async.md` +- `mbarrier` reference: `instructions/sync-comm/DOC.md` +- TMA/shared-memory layout reference: `instructions/tma/DOC.md` + +## PTX Pattern Playbooks + +- Integer and bit-manipulation patterns: `../ptx-integer-bit-manipulation-patterns/DOC.md` +- 
Atomic and reduction patterns: `../ptx-atomic-and-reduction-patterns/DOC.md` +- mbarrier protocol patterns: `../ptx-mbarrier-protocol-patterns/DOC.md` +- Warp synchronization patterns: `../ptx-warp-synchronization-patterns/DOC.md` + +## Instruction Category Entry Points + +- Integer Arithmetic: `instructions/integer/DOC.md` +- Floating-Point: `instructions/floating-point/DOC.md` +- Data Movement: `instructions/data-movement/DOC.md` +- Control Flow: `instructions/control-flow/DOC.md` +- Synchronization and Communication: `instructions/sync-comm/DOC.md` +- Warpgroup MMA: `instructions/wgmma/DOC.md` +- Tensor Memory Accelerator: `instructions/tma/DOC.md` +- Special Registers: `instructions/special-registers/DOC.md` + +## Documentation Reliability Notes + +- Syntax and semantic claims in this directory map to NVIDIA PTX ISA sections. +- Each document includes section-level anchors for direct verification. +- If newer PTX versions are released, prioritize release-notes deltas. + +## Official Source Links (fact check) + +- PTX main documentation: https://docs.nvidia.com/cuda/parallel-thread-execution/ +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- Release Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes + +Last verified date: 2026-03-19 + +## B-series Special Entry Points + +- H-series special instruction summary: `references/h-series-special-instructions.md` +- Architecture capability matrix: `references/b-series-arch-matrix.md` +- Delta vs Hopper: `references/b-series-delta-from-hopper.md` +- tcgen05 special topic: `instructions/tcgen05/DOC.md` diff --git a/content/cuda/docs/ptx/instructions/control-flow/DOC.md b/content/cuda/docs/ptx/instructions/control-flow/DOC.md new file mode 100644 index 00000000..217217ea --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/DOC.md @@ -0,0 +1,55 @@ 
+--- +name: ptx-control-flow-instructions +description: "PTX control-flow instructions and divergence-related behaviors in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,control-flow,branch,call" +--- + +# PTX Control Flow + +Control-flow instructions determine branching, calling, and exit behavior, while also affecting warp divergence and execution efficiency. + +## Common Instructions + +- `bra` conditional/unconditional branch +- `call` device function call +- `ret` function return +- `exit` thread exit +- `brx.idx` indirect branch + +## Syntax Example (PTX style) + +```ptx +@p bra L_done; +call.uni (_), my_func, (arg0); +ret; +``` + +## Constraints and Pitfalls + +- Predicate-controlled branches can introduce warp divergence. +- `call` paths must satisfy parameter and ABI rules. +- Some branch variants have version or target ISA requirements. + +## Official Source Links (fact check) + +- Control Flow Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions +- bra: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-bra +- call: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-call +- ret: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-ret + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/bra.md` +- `references/call.md` +- `references/ret.md` +- `references/brx-idx.md` +- `references/exit.md` +- `references/trap.md` diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/bra.md b/content/cuda/docs/ptx/instructions/control-flow/references/bra.md new file mode 100644 index 00000000..c9076202 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/bra.md @@ -0,0 +1,34 @@ +# PTX Instruction Topic: bra + +`bra` is a fundamental PTX branch instruction that supports 
predicate-controlled conditional branching. + +## Official Description +- Documentation section: Control Flow Instructions: `bra` +- Commonly used together with predicates generated by `setp` + +## Key Constraints +- Conditional branching depends on the result of a predicate register. +- Branch divergence can affect warp execution efficiency. +- The target label must be within a valid control-flow range. + +## Usage Notes +- Prefer `setp + bra` patterns that keep divergent regions short. +- Keep branch targets structurally simple so join behavior is easy to audit. + +## Common Failure Modes +- Predicate values are stale because producer instructions were reordered or conditionally skipped. +- Divergent branch regions grow too large and create avoidable warp-serial execution. + +## Example (PTX Style) + +```ptx +@p bra L_true; +bra L_end; +``` + +## Official Source Links (Fact Check) + +- bra: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-bra +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/brx-idx.md b/content/cuda/docs/ptx/instructions/control-flow/references/brx-idx.md new file mode 100644 index 00000000..7266efe1 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/brx-idx.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: brx.idx + +`brx.idx` is an index-based branch control flow instruction, commonly used for jump-table-style dispatch. + +## Official Description + +- Documentation section: Control Flow Instructions: `brx.idx` +- Used to select the branch target based on an index value + +## Key Constraints + +- The index range must match the number of valid table entries. +- The strategy for handling invalid indices should be clearly defined in higher-level logic. 
+- Conditional paths must keep warp-level control-flow consistency manageable. + +## Usage Notes + +- Use `brx.idx` for dense dispatch tables where branch targets are static and well-audited. +- Include a default-safe path for out-of-range indices before deployment on variable inputs. + +## Example (PTX Style, Illustrative) + +```ptx +brx.idx idx, table; +``` + +## Official Source Links (Fact Check) + +- brx.idx: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-brx-idx +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/call.md b/content/cuda/docs/ptx/instructions/control-flow/references/call.md new file mode 100644 index 00000000..2a6e9dbf --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/call.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: call + +`call` is used for device function calls, involving parameter passing and calling conventions. + +## Official Description + +- Documentation section: Control Flow Instructions: `call` +- Related to `.func` declarations, the `.param` parameter space, and ABI constraints + +## Key Constraints + +- The parameter list must match the callee function signature. +- Register/return value semantics along the calling path must be consistent. +- Under conditional execution, avoid control-flow inconsistencies that could lead to undefined behavior. + +## Usage Notes + +- Prefer `call.uni` only when uniform control-flow assumptions are guaranteed. +- Re-evaluate register pressure and inlining tradeoffs when introducing helper calls in hot kernels. 
+ +## Example (PTX Style) + +```ptx +call.uni (retval), my_func, (arg0, arg1); +``` + +## Official Source Links (Fact Check) + +- call: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-call +- Function declarations and definitions: https://docs.nvidia.com/cuda/parallel-thread-execution/#function-declarations-and-definitions +- Abstracting the ABI: https://docs.nvidia.com/cuda/parallel-thread-execution/#abstracting-the-abi + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/exit.md b/content/cuda/docs/ptx/instructions/control-flow/references/exit.md new file mode 100644 index 00000000..78ec56ef --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/exit.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: exit + +`exit` terminates the current thread’s execution and is a thread-level exit primitive inside a kernel. + +## Official Description + +- Documentation section: Control Flow Instructions: `exit` +- Commonly used for early-exit paths and boundary-condition handling + +## Key Constraints + +- Before exiting, ensure that shared-state updates and synchronization requirements are satisfied. +- Avoid issuing `exit` early at points that require all participants in a synchronization, otherwise the protocol may be mismatched. + +## Usage Notes + +- Use early `exit` only on paths that do not participate in later collective synchronization. +- Prefer predicate-guarded compute skip when protocol consistency is more important than early termination. 
+ +## Example (PTX Style) + +```ptx +@p exit; +``` + +## Official Source Links (Fact Check) + +- exit: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-exit +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/ret.md b/content/cuda/docs/ptx/instructions/control-flow/references/ret.md new file mode 100644 index 00000000..d1af344e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/ret.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: ret + +`ret` is used for function return, ending the current function call path. + +## Official Description + +- Documentation section: Control Flow Instructions: `ret` +- Matches the call boundary of `call` + +## Key Constraints + +- The return path must be consistent with the function definition and calling convention. +- In complex control flow, ensure that all paths can reach a valid return point. + +## Usage Notes + +- Keep return conventions explicit when mixing `.func` helpers and inlined call sites. +- Validate that predicate-driven paths still preserve a legal return sequence. 
+ +## Example (PTX Style) + +```ptx +ret; +``` + +## Official Source Links (Fact Check) + +- ret: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-ret +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/control-flow/references/trap.md b/content/cuda/docs/ptx/instructions/control-flow/references/trap.md new file mode 100644 index 00000000..ca6916a1 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/control-flow/references/trap.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: trap + +`trap` is used to trigger exceptions/debug traps and is commonly used on error paths or as a debugging breakpoint. + +## Official Description + +- Documentation section: Control Flow Instructions: `trap` +- Mainly used for diagnostics and fail-fast scenarios + +## Usage Notes + +- Trigger it only under clearly defined error conditions. +- Use it cautiously on production paths to avoid impacting throughput. +- Pair `trap` with a clear diagnostics policy so failures are reproducible. +- Avoid embedding `trap` in speculative fast paths that may execute under benign edge conditions. + +## Common Failure Modes + +- Leaving debug-only `trap` paths enabled in production builds. +- Emitting `trap` without enough context to diagnose the triggering condition. 
+ +## Example (PTX Style) + +```ptx +@p trap; +``` + +## Official Source Links (Fact Check) + +- trap: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions-trap +- Control flow instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#control-flow-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/DOC.md b/content/cuda/docs/ptx/instructions/data-movement/DOC.md new file mode 100644 index 00000000..3ca7242c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/DOC.md @@ -0,0 +1,65 @@ +--- +name: ptx-data-movement-instructions +description: "PTX data movement instructions in ISA 9.2, including ld/st/ldu, cvt/cvt.pack/cvta, cp.async paths, and prefetch hints." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,load,store,memory,cp.async,cp.async.bulk,ld,ldu,ld.global.nc,st,st.async,st.bulk,cvt,cvt.pack,cvta,mov,prefetch,prefetchu,data-movement" +--- + +# PTX Data Movement + +This page covers PTX load/store, conversion, and async movement patterns that dominate memory-side kernel behavior. + +## Representative Syntax + +```ptx +cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size} [dst], [src], cp-size{, src-size}{, cache-policy}; +cp.async.commit_group; +cp.async.wait_group N; +cp.async.wait_all; +``` + +## Minimal Async Copy Pattern + +```ptx +cp.async.ca.shared.global [smem_ptr], [gmem_ptr], 16; +cp.async.commit_group; +cp.async.wait_group 0; +``` + +## Constraints and Pitfalls + +- Source/destination state spaces must match the instruction form. +- Async copy completion must be explicitly synchronized before consumer access. +- Conversion/load/store variants have operand width and alignment constraints. 
+ +## Official Source Links (fact check) + +- Data Movement and Conversion Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- cp.async.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-commit-group +- cp.async.wait_group/wait_all: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/cp-async.md` +- `references/cp-async-bulk.md` +- `references/ld.md` +- `references/st.md` +- `references/cp-async-wait-group.md` +- `references/prefetch.md` +- `references/cvta.md` +- `references/mov.md` +- `references/cvt.md` +- `references/ld-global-nc.md` +- `references/st-async.md` +- `references/st-bulk.md` +- `references/cvt-pack.md` +- `references/ldu.md` diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-bulk.md b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-bulk.md new file mode 100644 index 00000000..dee0888a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-bulk.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: cp.async.bulk + +`cp.async.bulk` is a bulk async copy instruction with mbarrier-based completion, suitable for larger transfers. 
+ 
+## Official Syntax (Representative Form) 
+ 
+```ptx 
+cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar]; 
+cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [dstMem], [srcMem], size, [mbar]; 
+``` 
+ 
+## Key Semantics 
+ 
+- The instruction executes on the async proxy and is a weak memory operation. 
+- Completion can be configured via `.mbarrier::complete_tx::bytes`. 
+- complete-tx carries `completeCount=bytes` on the mbarrier. 
+- The documentation states completion is followed by an implicit generic-async proxy fence. 
+- You still need async-group or mbarrier waits before consuming the data. 
+ 
+## Key Constraints 
+ 
+- Source/destination state spaces must match the selected bulk variant form. 
+- `size` and operand alignment must satisfy ISA requirements for the target architecture. 
+- Completion tracking must be explicit before downstream consumers read results. 
+ 
+## Official Source Links (fact check) 
+ 
+- cp.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk 
+- Asynchronous data movement: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-data-movement-and-conversion-instructions 
+- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model 
+ 
+Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-wait-group.md b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-wait-group.md new file mode 100644 index 00000000..205f33aa --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async-wait-group.md @@ -0,0 +1,35 @@ +# PTX Instruction Note: cp.async.wait_group / cp.async.wait_all 
+ 
+`cp.async.wait_group` / `cp.async.wait_all` are used to wait for `cp.async` groups to complete. 
+ +## Official Syntax + +```ptx +cp.async.wait_group N; +cp.async.wait_all; +``` + +## Key Semantics + +- `wait_group N`: waits until at most N recent pending groups remain, and all earlier groups complete. +- When `N=0`, waits for all prior `cp.async` groups to complete. +- This wait only applies to `cp.async` completion; it does not provide ordering/visibility for other memory operations. + +## Usage Recommendations + +- Execute the wait before consuming destination shared-memory data. +- Do not treat this as a general fence; it only applies to `cp.async` completion semantics. + +## Common Failure Modes + +- Waiting on the wrong stage depth (`N`) and reading tiles that are not yet complete. +- Mixing unrelated async pipelines into one wait protocol and causing phase confusion. +- Assuming `wait_group` replaces other synchronization steps needed by the overall algorithm. + +## Official Source Links (fact check) + +- cp.async.wait_group / wait_all: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cp-async.md b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async.md new file mode 100644 index 00000000..f9dec9eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cp-async.md @@ -0,0 +1,41 @@ +# PTX Instruction Note: cp.async + +`cp.async` is a non-blocking async copy instruction from `.global` to `.shared`, and requires explicit waiting via group or mbarrier mechanisms. 
+ +## Official Syntax (Excerpt) + +```ptx +cp.async.ca.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size} [dst], [src], cp-size{, src-size}{, cache-policy}; +cp.async.cg.shared{::cta}.global{.level::cache_hint}{.level::prefetch_size} [dst], [src], cp-size{, src-size}{, cache-policy}; +``` + +## Key Semantics + +- The instruction is non-blocking; the issuing thread continues execution. +- `src` must be in global memory and `dst` must be in shared memory. +- If optional `src-size` is smaller than `cp-size`, remaining `dst` bytes are zero-filled. +- `src-size > cp-size` is undefined behavior. + +## Completion and Visibility + +- Without explicit synchronization, ordering between `cp.async` operations is not guaranteed. +- Completion can be tracked through: + - `cp.async.commit_group` + `cp.async.wait_group` / `cp.async.wait_all` + - `cp.async.mbarrier.arrive` + `mbarrier.test_wait/try_wait` + +## Minimal Pattern + +```ptx +cp.async.ca.shared.global [smem_ptr], [gmem_ptr], 16; +cp.async.commit_group; +cp.async.wait_group 0; +``` + +## Official Source Links (fact check) + +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- cp.async.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-commit-group +- cp.async.wait_group / wait_all: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-wait-group-cp-async-wait-all +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cvt-pack.md b/content/cuda/docs/ptx/instructions/data-movement/references/cvt-pack.md new file mode 100644 index 00000000..e856dd3a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cvt-pack.md @@ -0,0 +1,31 @@ +# PTX 
Instruction Note: cvt.pack + +`cvt.pack` converts and packs multiple source elements into a compact destination representation. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `cvt.pack` + +## Key Constraints + +- Source element types, destination packed type, and rounding/saturation modifiers must form a legal variant. +- Packing order and lane composition follow ISA-defined operand ordering. +- Use saturation/rounding modifiers explicitly when narrowing precision. + +## Usage Notes + +- Use `cvt.pack` to reduce instruction count when packing quantized outputs. +- Validate lane ordering assumptions before integrating with vectorized unpack paths. + +## Example (PTX style) + +```ptx +cvt.pack.sat.u8.s32.b32 d, a, b, c; +``` + +## Official Source Links (fact check) + +- cvt.pack: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt-pack +- cvt: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cvt.md b/content/cuda/docs/ptx/instructions/data-movement/references/cvt.md new file mode 100644 index 00000000..c1c226d5 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cvt.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: cvt + +`cvt` is used for numeric type conversion (integer/float/bit-width changes), a key instruction for mixed precision and interface adaptation. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `cvt` +- Related extension section: `cvt.pack` + +## Key Constraints + +- Target type suffix determines rounding/truncation behavior. +- Float-to-integer conversion requires overflow and rounding handling. +- Packed variants must satisfy element-type and packing-format requirements. 
+ +## Usage Notes + +- Use explicit rounding modes (`rn`, `rz`, `rm`, `rp`) to make conversion policy reviewable. +- Validate saturation and overflow handling before deploying quantization paths. + +## Example (PTX style) + +```ptx +cvt.rn.f32.f16 f1, h1; +cvt.rzi.s32.f32 r1, f1; +``` + +## Official Source Links (fact check) + +- cvt: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt +- cvt.pack: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt-pack + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/cvta.md b/content/cuda/docs/ptx/instructions/data-movement/references/cvta.md new file mode 100644 index 00000000..1ab09d29 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/cvta.md @@ -0,0 +1,35 @@ +# PTX Instruction Note: cvta + +`cvta` is used for address conversion/normalization (`convert address`) and is critical for cross-address-space pointer handling. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `cvta` + +## Key Constraints + +- Target state space and input address must match an allowed conversion direction. +- Result register bit width must accommodate the target address representation. + +## Usage Notes + +- Use `cvta` at ABI boundaries where generic pointers must be normalized to explicit state spaces. +- Keep pointer width explicit (`u32` vs `u64`) to avoid truncation on mixed-address workflows. + +## Common Failure Modes + +- Converting to an incorrect target state space and then reusing the pointer in unrelated load/store paths. +- Address-width truncation when 64-bit addresses are forced into 32-bit intermediates. 
+ +## Example (PTX style) + +```ptx +cvta.to.global.u64 rd, ra; +``` + +## Official Source Links (fact check) + +- cvta: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvta +- State spaces: https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/ld-global-nc.md b/content/cuda/docs/ptx/instructions/data-movement/references/ld-global-nc.md new file mode 100644 index 00000000..1e0cd4eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/ld-global-nc.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: ld.global.nc + +`ld.global.nc` performs non-coherent global-memory loads with cache-policy controls defined by ISA variants. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `ld.global.nc` + +## Key Constraints + +- Applicable only to legal global-memory address forms and supported type variants. +- Cache/modifier combinations must match the documented variant constraints. +- Ordering/visibility guarantees differ from coherent paths; combine with appropriate synchronization when required. + +## Usage Notes + +- Use `ld.global.nc` for read-mostly streams where non-coherent cache behavior is intentional. +- Validate cache-policy choices with profiler counters instead of assuming lower latency. 
+ +## Example (PTX style) + +```ptx +ld.global.nc.u32 r1, [addr]; +``` + +## Official Source Links (fact check) + +- ld.global.nc: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/ld.md b/content/cuda/docs/ptx/instructions/data-movement/references/ld.md new file mode 100644 index 00000000..2ae24796 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/ld.md @@ -0,0 +1,34 @@ +# PTX Instruction Note: ld + +`ld` is the base PTX load instruction family across global/shared/local/constant state spaces. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `ld` + +## Key Constraints + +- Address state space and instruction variant must match. +- Destination register type/width must match the loaded element format. +- Variant modifiers (cache, scope, vector width) must satisfy ISA-specific constraints. + +## Usage Notes + +- Use coherent `ld` forms by default; switch to specialized variants only with measured justification. +- Align load width and vectorization with producer layout to preserve coalescing efficiency. +- Keep cache modifiers consistent across hot paths to reduce unpredictable locality behavior. 
+ +## Example (PTX style) + +```ptx +ld.global.u32 r1, [addr]; +ld.shared.f32 f1, [saddr]; +``` + +## Official Source Links (fact check) + +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld +- ld.global.nc: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc +- ldu: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ldu + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/ldu.md b/content/cuda/docs/ptx/instructions/data-movement/references/ldu.md new file mode 100644 index 00000000..cd7f9fe9 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/ldu.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: ldu + +`ldu` provides a uniform load path for addresses that are expected to be uniform across threads. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `ldu` + +## Key Constraints + +- Use only with legal `ldu` state-space/type combinations documented by ISA. +- Intended uniform-access assumptions should match actual access behavior for best results. +- Do not treat `ldu` as a generic replacement for all `ld` forms. + +## Usage Recommendations + +- Prefer `ldu` when operand addresses are naturally uniform within the execution group. +- Validate performance impact with profiling because benefit is pattern-dependent. 
+ +## Example (PTX style) + +```ptx +ldu.global.u32 r1, [addr]; +``` + +## Official Source Links (fact check) + +- ldu: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ldu +- ld: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld +- ld.global.nc: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/mov.md b/content/cuda/docs/ptx/instructions/data-movement/references/mov.md new file mode 100644 index 00000000..5f2876ac --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/mov.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: mov + +`mov` transfers values between registers and selected special-register/constant forms. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `mov` + +## Key Constraints + +- Source and destination operand classes must match a legal `mov` variant. +- Width/type suffixes must preserve valid bit-width semantics. +- Special-register movement forms require supported register names and target ISA. + +## Usage Notes + +- Use `mov` for explicit register/value handoff when clarity is more important than implicit compiler rewrites. +- Keep special-register reads localized to reduce accidental architectural coupling. 
+ +## Example (PTX style) + +```ptx +mov.u32 r1, r2; +mov.u32 r_tid, %tid.x; +``` + +## Official Source Links (fact check) + +- mov: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-mov +- Special registers: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/prefetch.md b/content/cuda/docs/ptx/instructions/data-movement/references/prefetch.md new file mode 100644 index 00000000..945e8526 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/prefetch.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: prefetch / prefetchu + +`prefetch` and `prefetchu` provide advisory cache prefetch behavior for eligible memory access patterns. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `prefetch, prefetchu` + +## Key Constraints + +- Prefetch instructions are hints; they do not guarantee residency or strict ordering semantics. +- Address form and state-space usage must match legal variants. +- Overuse can add overhead without gain when locality is weak. + +## Usage Recommendations + +- Use for predictable forward-access streams where cache warmup is beneficial. +- Confirm benefit with profiler metrics rather than assuming speedup. +- Combine with coalesced access patterns; prefetch does not fix poor memory layout. 
+ +## Example (PTX style, Illustrative) + +```ptx +prefetch.global.L2 [addr]; +``` + +## Official Source Links (fact check) + +- prefetch, prefetchu: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu +- Data movement instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/st-async.md b/content/cuda/docs/ptx/instructions/data-movement/references/st-async.md new file mode 100644 index 00000000..f6964d37 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/st-async.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: st.async + +`st.async` issues asynchronous store operations with completion signaling in supported variants. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `st.async` + +## Key Semantics + +- Operation is asynchronous; consumer visibility must follow explicit completion/synchronization rules. +- mbarrier-based completion variants publish transfer completion through documented mechanisms. +- Ordering and visibility follow PTX async-operation and memory-consistency semantics. + +## Usage Notes + +- Keep each asynchronous store pipeline tied to a clear barrier/phase protocol. +- Avoid mixing unrelated producer paths into the same completion channel. 
+ +## Example (PTX style) + +```ptx +st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [addr], b, [mbar_addr]; +``` + +## Official Source Links (fact check) + +- st.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-async +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/st-bulk.md b/content/cuda/docs/ptx/instructions/data-movement/references/st-bulk.md new file mode 100644 index 00000000..167c8a69 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/st-bulk.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: st.bulk + +`st.bulk` is the bulk-store instruction family for larger transfer paths in supported state-space combinations. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `st.bulk` + +## Key Constraints + +- Source data type, destination state space, and size operands must match legal forms. +- Bulk-store usage should respect architecture-specific restrictions and completion semantics. +- Use explicit synchronization where subsequent consumers depend on completion. + +## Usage Recommendations + +- Prefer `st.bulk` for structured large transfers where the ISA form is supported. +- Validate that store granularity and alignment match your buffer layout. 
+ +## Example (PTX style, Illustrative) + +```ptx +st.bulk.shared::cluster.u32 [addr], r1, bytes; +``` + +## Official Source Links (fact check) + +- st.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk +- st: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/data-movement/references/st.md b/content/cuda/docs/ptx/instructions/data-movement/references/st.md new file mode 100644 index 00000000..dd66e624 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/data-movement/references/st.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: st + +`st` stores register values to memory in the specified state space and type form. + +## Official Positioning + +- Documentation section: Data Movement and Conversion Instructions: `st` + +## Key Constraints + +- Destination address state space must match the selected `st` variant. +- Source register type must match stored element type. +- For concurrent shared-data read/write, establish ordering with fence/atom/barrier. + +## Usage Notes + +- Keep alignment and element-size choices consistent with consumer load patterns. +- Use the narrowest valid state-space form and pair with explicit synchronization when required. 
+ +## Example (PTX style) + +```ptx +st.global.u32 [addr], r1; +st.shared.f32 [saddr], f1; +``` + +## Official Source Links (fact check) + +- st: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st +- st.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-async +- st.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st-bulk + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/DOC.md b/content/cuda/docs/ptx/instructions/floating-point/DOC.md new file mode 100644 index 00000000..92899cda --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/DOC.md @@ -0,0 +1,67 @@ +--- +name: ptx-floating-point-instructions +description: "PTX floating-point instructions, rounding behavior, and type constraints in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,floating-point,math" +--- + +# PTX Floating-Point + +This page focuses on PTX floating-point paths, rounding semantics, and common pitfalls. + +## Common Instructions + +- `add` / `sub` / `mul` +- `fma` +- `div` +- `sqrt` + +## Syntax Example (PTX style) + +```ptx +fma.rn.f32 d, a, b, c; +sqrt.rn.f32 d, a; +``` + +## Constraints and Pitfalls + +- Rounding suffixes and type suffixes must match legal ISA forms. +- Approximate transcendental forms can differ from high-precision library references. +- NaN/Inf and exceptional-value behavior should be treated according to ISA semantics. + +## Usage Recommendations + +- Validate precision-sensitive kernels against a reference implementation. +- Distinguish approximate and exact variants when setting numerical tolerances. +- Keep mixed-precision policies explicit (input type, compute type, accumulation type). 
+ +## Official Source Links (fact check) + +- Floating Point Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions +- fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-fma +- sqrt: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sqrt +- Half Precision instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/add.md` +- `references/sub.md` +- `references/mul.md` +- `references/fma.md` +- `references/sqrt.md` +- `references/rcp.md` +- `references/rsqrt.md` +- `references/sin.md` +- `references/cos.md` +- `references/lg2.md` +- `references/ex2.md` +- `references/tanh.md` +- `references/copysign.md` +- `references/testp.md` diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/add.md b/content/cuda/docs/ptx/instructions/floating-point/references/add.md new file mode 100644 index 00000000..dca1f2c9 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/add.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: add (floating-point) + +`add` performs floating-point addition with PTX-defined type and rounding variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `add` +- Related sections: Half precision and mixed precision `add` variants + +## Key Constraints + +- Use a type/rounding suffix combination that is valid for the selected variant. +- Source and destination operand types must match the instruction form. +- NaN/Inf and exceptional cases follow ISA-defined floating-point semantics. + +## Usage Notes + +- Use explicit rounding suffixes in numerically audited kernels to avoid implicit behavior drift. +- Validate mixed-precision accumulation paths when `add` consumes converted inputs. 
+ +## Example (PTX style) + +```ptx +add.rn.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- add: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-add +- Half precision add: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-add +- Mixed precision add: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-add + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/copysign.md b/content/cuda/docs/ptx/instructions/floating-point/references/copysign.md new file mode 100644 index 00000000..28ed41b8 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/copysign.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: copysign + +`copysign` returns the magnitude of the first operand with the sign bit of the second operand. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `copysign` + +## Key Constraints + +- Operand and destination types must match the selected variant. +- This is a sign-bit transform, not a fused arithmetic operation. +- Special-value behavior follows ISA-defined floating-point semantics. + +## Usage Notes + +- Use `copysign` for branchless sign injection while preserving magnitude. +- Keep NaN and signed-zero behavior aligned with your numerical policy. 
+ +## Example (PTX style) + +```ptx +copysign.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- copysign: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-copysign +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/cos.md b/content/cuda/docs/ptx/instructions/floating-point/references/cos.md new file mode 100644 index 00000000..4858c88a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/cos.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: cos + +`cos` computes cosine using PTX-defined floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `cos` + +## Key Constraints + +- Common forms are approximate variants; check precision requirements before use. +- Input domain handling and internal range behavior are ISA-defined. +- Use reference checks for numerically sensitive kernels. + +## Usage Notes + +- Use transcendental intrinsics selectively in hot loops because throughput is typically lower than basic arithmetic. +- Pre-normalize input range where possible to improve numerical stability of approximate forms. 
+ +## Example (PTX style) + +```ptx +cos.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- cos: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-cos +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/ex2.md b/content/cuda/docs/ptx/instructions/floating-point/references/ex2.md new file mode 100644 index 00000000..74c47466 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/ex2.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: ex2 + +`ex2` computes `2^x` for PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `ex2` +- Related extension: Half precision `ex2` + +## Key Constraints + +- Common forms are approximate and may differ from high-precision library output. +- Select type suffixes that match downstream numeric requirements. +- Validate error behavior on representative production ranges. + +## Usage Notes + +- Use `ex2` for base-2 exponentiation paths to avoid extra base conversion overhead. +- Recheck stability when `ex2` output is immediately fed into normalization or softmax-like pipelines. 
+ +## Example (PTX style) + +```ptx +ex2.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- ex2: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-ex2 +- Half precision ex2: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-ex2 + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/fma.md b/content/cuda/docs/ptx/instructions/floating-point/references/fma.md new file mode 100644 index 00000000..47de47da --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/fma.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: fma (floating-point) + +`fma` performs fused multiply-add with single-rounding semantics for the selected variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `fma` +- Related extensions: Half precision and mixed precision `fma` + +## Key Constraints + +- `fma` is not equivalent to separate `mul` then `add` in rounding behavior. +- Type and rounding suffixes must match variant requirements. +- Validate precision-sensitive kernels when switching between fused and split forms. + +## Usage Notes + +- Prefer `fma` in compute-bound loops to reduce instruction count and intermediate rounding error. +- Compare against non-fused baselines when strict bitwise reproducibility is required. 
+ +## Example (PTX style) + +```ptx +fma.rn.f32 d, a, b, c; +``` + +## Official Source Links (fact check) + +- fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-fma +- Half precision fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-fma +- Mixed precision fma: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-fma + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/lg2.md b/content/cuda/docs/ptx/instructions/floating-point/references/lg2.md new file mode 100644 index 00000000..9570e088 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/lg2.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: lg2 + +`lg2` computes logarithm base 2 for PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `lg2` + +## Key Constraints + +- Approximate forms are common; accuracy depends on the selected variant. +- Domain handling for zero, negative, and exceptional inputs follows ISA rules. +- Use reference validation when numerical stability is critical. + +## Usage Notes + +- Use `lg2` when your algorithm is naturally base-2 (for example entropy-like or bit-scale transforms). +- Check behavior near zero and denormal ranges when downstream code assumes finite outputs. 
+ +## Example (PTX style) + +```ptx +lg2.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- lg2: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-lg2 +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/mul.md b/content/cuda/docs/ptx/instructions/floating-point/references/mul.md new file mode 100644 index 00000000..88fbf572 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/mul.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: mul (floating-point) + +`mul` performs floating-point multiplication with PTX-defined type and rounding variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `mul` +- Related extension: Half precision `mul` + +## Key Constraints + +- Use valid type/rounding suffix combinations for the target variant. +- Operand types must match the chosen instruction form. +- Verify precision behavior when combining with mixed-precision accumulation. + +## Usage Notes + +- Prefer fused forms (`fma`) when multiply-add is immediately chained and numerical policy allows it. +- Track denormal and FTZ behavior when reproducing CPU reference results. 
+ +## Example (PTX style) + +```ptx +mul.rn.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- mul: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-mul +- Half precision mul: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-mul + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/rcp.md b/content/cuda/docs/ptx/instructions/floating-point/references/rcp.md new file mode 100644 index 00000000..2d0b1737 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/rcp.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: rcp + +`rcp` computes reciprocal for the selected floating-point variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `rcp` +- Related extension: `rcp.approx.ftz.f64` + +## Key Constraints + +- Distinguish exact/rounded vs approximate variants based on requirements. +- Zero and exceptional input behavior follows ISA-defined floating-point semantics. +- Validate error tolerance before using approximate forms in iterative kernels. + +## Usage Notes + +- Use `rcp` to replace scalar division hot paths when reciprocal error is acceptable. +- Reassess convergence/stability if approximate reciprocals feed iterative updates. 
+ +## Example (PTX style) + +```ptx +rcp.rn.f32 d, a; +``` + +## Official Source Links (fact check) + +- rcp: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp +- rcp.approx.ftz.f64: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64 + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/rsqrt.md b/content/cuda/docs/ptx/instructions/floating-point/references/rsqrt.md new file mode 100644 index 00000000..38982c58 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/rsqrt.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: rsqrt + +`rsqrt` computes reciprocal square root for the selected floating-point variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `rsqrt` +- Related extension: `rsqrt.approx.ftz.f64` + +## Key Constraints + +- Approximate forms are common and should be validated against error budgets. +- Negative and exceptional inputs follow ISA-defined semantics. +- Choose variant precision to match normalization or solver stability needs. + +## Usage Notes + +- Use `rsqrt` in normalization-heavy kernels to reduce divide and square-root pressure. +- Pair approximate forms with one refinement step when tighter relative error is required. 
+ +## Example (PTX style) + +```ptx +rsqrt.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- rsqrt: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rsqrt +- rsqrt.approx.ftz.f64: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rsqrt-approx-ftz-f64 + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/sin.md b/content/cuda/docs/ptx/instructions/floating-point/references/sin.md new file mode 100644 index 00000000..1eba36b0 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/sin.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: sin + +`sin` computes sine using PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `sin` + +## Key Constraints + +- Common forms are approximate; accuracy varies by variant and architecture. +- Exceptional-value handling follows ISA-defined semantics. +- Validate on production ranges for numerically sensitive workloads. + +## Usage Notes + +- Favor `sin` for moderate-accuracy signal paths; validate if gradients or phase error are sensitive. +- Benchmark with realistic input distributions, not only uniform synthetic ranges. 
+ +## Example (PTX style) + +```ptx +sin.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- sin: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sin +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/sqrt.md b/content/cuda/docs/ptx/instructions/floating-point/references/sqrt.md new file mode 100644 index 00000000..ccd850ea --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/sqrt.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: sqrt + +`sqrt` computes square root for the selected floating-point variant. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `sqrt` + +## Key Constraints + +- Use variant-specific rounding and type suffixes where required. +- Negative and exceptional input behavior follows ISA-defined semantics. +- Evaluate precision/performance tradeoffs between exact and approximate forms. + +## Usage Notes + +- Prefer `rsqrt` plus refinement when reciprocal-root throughput is the primary goal. +- Validate corner cases (very small, very large, and subnormal inputs) when switching variants. 
+ +## Example (PTX style) + +```ptx +sqrt.rn.f32 d, a; +``` + +## Official Source Links (fact check) + +- sqrt: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sqrt +- Floating point instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/sub.md b/content/cuda/docs/ptx/instructions/floating-point/references/sub.md new file mode 100644 index 00000000..98aa0d87 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/sub.md @@ -0,0 +1,33 @@ +# PTX Instruction Note: sub (floating-point) + +`sub` performs floating-point subtraction with PTX-defined type and rounding variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `sub` +- Related sections: Half precision and mixed precision `sub` variants + +## Key Constraints + +- Use valid type/rounding suffix combinations for the selected variant. +- Operand types must match the instruction form. +- Special-value behavior follows ISA-defined floating-point semantics. + +## Usage Notes + +- Keep subtract order explicit in refactors because `a - b` vs `b - a` can alter cancellation behavior. +- Re-evaluate tolerance thresholds when replacing `sub` with fused alternatives. 
+ +## Example (PTX style) + +```ptx +sub.rn.f32 d, a, b; +``` + +## Official Source Links (fact check) + +- sub: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-sub +- Half precision sub: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-sub +- Mixed precision sub: https://docs.nvidia.com/cuda/parallel-thread-execution/#mixed-precision-floating-point-instructions-sub + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/tanh.md b/content/cuda/docs/ptx/instructions/floating-point/references/tanh.md new file mode 100644 index 00000000..fbcac272 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/tanh.md @@ -0,0 +1,32 @@ +# PTX Instruction Note: tanh + +`tanh` computes hyperbolic tangent using PTX floating-point variants. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `tanh` +- Related extension: Half precision `tanh` + +## Key Constraints + +- Typical forms are approximate and should be validated for model-specific tolerances. +- Saturation and exceptional input behavior follow ISA-defined semantics. +- Use reference comparisons for numerically sensitive paths. + +## Usage Notes + +- Use `tanh` where bounded output is required and approximation error is acceptable. +- Check gradient-sensitive training/inference paths separately from forward-only tolerance checks. 
+ +## Example (PTX style) + +```ptx +tanh.approx.f32 d, a; +``` + +## Official Source Links (fact check) + +- tanh: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-tanh +- Half precision tanh: https://docs.nvidia.com/cuda/parallel-thread-execution/#half-precision-floating-point-instructions-tanh + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/floating-point/references/testp.md b/content/cuda/docs/ptx/instructions/floating-point/references/testp.md new file mode 100644 index 00000000..713d01c2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/floating-point/references/testp.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: testp + +`testp` evaluates floating-point class/property predicates and writes a predicate result. + +## Official Positioning + +- Documentation section: Floating Point Instructions: `testp` + +## Key Constraints + +- Predicate selector (`notanumber`, `finite`, and related forms such as `infinite`, `number`, `normal`, `subnormal`) controls the test semantics. +- Destination is a predicate register and is typically consumed by branch/selection instructions. +- Type suffix and selector must match a legal ISA form. + +## Usage Notes + +- Use `testp` to isolate exceptional-value handling into explicit predicate paths. +- Pair with `selp` for branchless fallback selection when divergence is undesirable. 
+ +## Example (PTX style) + +```ptx +testp.notanumber.f32 p, a; +``` + +## Official Source Links (fact check) + +- testp: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-testp +- Floating point instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/DOC.md b/content/cuda/docs/ptx/instructions/integer/DOC.md new file mode 100644 index 00000000..d74f05e0 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/DOC.md @@ -0,0 +1,74 @@ +--- +name: ptx-integer-instructions +description: "PTX integer arithmetic instructions and constraints for ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,integer,arithmetic" +--- + +# PTX Integer Arithmetic + +This page covers the core semantics and practical constraints of PTX integer arithmetic instruction families. + +## Common Instructions + +- `add` / `sub` / `mul` +- `mad` (multiply-add) +- `div` / `rem` +- `abs` / `neg` + +## Syntax Example (PTX Style) + +```ptx +add.s32 d, a, b; +mad.lo.s32 d, a, b, c; +``` + +## Constraints and Pitfalls + +- `.s*` / `.u*` must match both the register types and the operation semantics. +- Variants such as `mad` should be checked for high/low/wide-part selection and saturation behavior. +- Different bit-widths and variants may be restricted by PTX ISA / Target ISA requirements. + +## Usage Recommendations + +- Prefer keeping clearly defined signed/unsigned semantics within the same code region. +- When dealing with overflow semantics, do not rely on the compiler to automatically infer behavior. 
+ +## Official Source Links (Fact Check) + +- Integer Arithmetic Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions +- add: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-add +- mad: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mad +- mul: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mul + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics +- `references/setp.md` +- `references/selp.md` +- `references/brev.md` +- `references/bfind.md` +- `references/bfe.md` +- `references/bfi.md` +- `references/prmt.md` +- `references/lop3.md` +- `references/popc.md` +- `references/sad.md` +- `references/mul24.md` +- `references/mad24.md` +- `references/clz.md` +- `references/and.md` +- `references/xor.md` +- `references/shf.md` +- `references/or.md` +- `references/not.md` +- `references/shl.md` +- `references/shr.md` +- `references/min.md` +- `references/max.md` +- `references/div.md` diff --git a/content/cuda/docs/ptx/instructions/integer/references/and.md b/content/cuda/docs/ptx/instructions/integer/references/and.md new file mode 100644 index 00000000..66a9c280 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/and.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: and + +`and` performs a bitwise AND and is a fundamental operation in the Logic and Shift instruction family. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `and` + +## Key Constraints + +- Operand width/type suffixes must match (`.b16/.b32/.b64` forms as applicable). +- Inputs must already be normalized to the intended bit-width domain. +- Mask constants should use explicit width to avoid unintended sign/width propagation. + +## Usage Notes + +- Use `and` for deterministic mask extraction before shifts or comparisons. 
+- In packed-field code, pair with `shl/shr/bfe` to keep bit positions explicit. + +## Example (PTX Style) + +```ptx +and.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- and: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-and +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/bfe.md b/content/cuda/docs/ptx/instructions/integer/references/bfe.md new file mode 100644 index 00000000..68d0f2c4 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/bfe.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bfe + +`bfe` (bit-field extract) extracts a specified bit-field from a source value. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `bfe` +- Commonly used for packed data decoding and field extraction + +## Key Constraints + +- The start bit and length parameters must satisfy the bit-width range. +- The signed/unsigned extraction semantics are determined by the variant suffix. + +## Usage Notes + +- Prefer `bfe` over ad hoc mask/shift chains when decoding packed metadata fields. +- Keep `pos/len` constants explicit and centralized to avoid layout drift bugs. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bfe.u32 d, a, pos, len; +``` + +## Official Source Links (Fact Check) + +- bfe: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-bfe +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/bfi.md b/content/cuda/docs/ptx/instructions/integer/references/bfi.md new file mode 100644 index 00000000..52f4b919 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/bfi.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bfi + +`bfi` (bit-field insert) writes a field into a target bit range. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `bfi` +- Often used together with `bfe` for packed data encoding + +## Key Constraints + +- The insert-range parameters must be within the target bit-width range. +- The combination of source field width and position must satisfy the variant definition. + +## Usage Notes + +- Use `bfi` to update packed headers/flags without disturbing unaffected bit fields. +- Pair with `bfe` in encode/decode pipelines to keep bit-layout contracts symmetric. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bfi.b32 d, a, b, pos, len; +``` + +## Official Source Links (Fact Check) + +- bfi: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-bfi +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/bfind.md b/content/cuda/docs/ptx/instructions/integer/references/bfind.md new file mode 100644 index 00000000..ed58dcdf --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/bfind.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bfind + +`bfind` finds the position of a bit (the most/least significant bit position, depending on the variant semantics). + +## Official Description + +- Documentation section: Logic and Shift Instructions: `bfind` +- Suitable for bit scanning, normalization, and encoding optimization paths + +## Key Constraints + +- For empty input (e.g., all zeros), the result semantics follow the variant definition. +- The type/bit-width must match the suffix and the destination register. + +## Usage Notes + +- Use `bfind` for fast position lookup in sparse-bit masks and encoding routines. +- Guard zero-input handling explicitly when downstream logic assumes a valid bit index. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bfind.u32 d, a; +``` + +## Official Source Links (Fact Check) + +- bfind: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-bfind +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/brev.md b/content/cuda/docs/ptx/instructions/integer/references/brev.md new file mode 100644 index 00000000..c0626d47 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/brev.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: brev + +`brev` performs a bit reverse and is commonly used for bit-manipulation rearrangement and index transformations. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `brev` +- Commonly used in scenarios that require bit-level reversed mappings + +## Key Constraints + +- The input/output bit widths must match the instruction variant. +- It only changes bit ordering; it does not extend arithmetic semantics. + +## Usage Notes + +- Useful for bit-reversed indexing patterns and bitstream transformations. +- Keep post-transform masking explicit when only subsets of bits are semantically valid. 
+ +## Example (PTX Style) + +```ptx +brev.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- brev: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-brev +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/clz.md b/content/cuda/docs/ptx/instructions/integer/references/clz.md new file mode 100644 index 00000000..465c03eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/clz.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: clz + +`clz` (count leading zeros) counts the number of consecutive zero bits starting from the most significant bit. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `clz` + +## Key Constraints + +- When the input is 0, result is the operand bit width for the corresponding variant. +- The bit-width suffix must match the register type. + +## Usage Notes + +- Use `clz` as a primitive for normalization, integer `log2` approximations, and bit-scan helpers. +- Keep input width explicit (`.b32` vs `.b64`) when results are consumed by index arithmetic. + +## Common Failure Modes + +- Assuming zero input returns a sentinel other than operand bit width. +- Mixing 32-bit and 64-bit `clz` outputs in shared index math without conversion. 
+ +## Example (PTX Style) + +```ptx +clz.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- clz: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-clz +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/div.md b/content/cuda/docs/ptx/instructions/integer/references/div.md new file mode 100644 index 00000000..9b451848 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/div.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: div + +`div` performs division and supports different types and variant semantics. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `div` + +## Key Constraints + +- The behavior when the divisor is 0 is defined by the official specification; protect against it before use. +- Signed/unsigned division semantics differ. +- On performance-critical paths, evaluate `div` latency and consider alternative strategies. + +## Usage Notes + +- In tight loops, replace division by compile-time constants with multiply/shift transforms when valid. +- Keep explicit preconditions for divisor domain to avoid hidden exceptional-path costs. 
+ +## Example (PTX Style) + +```ptx +div.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- div: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-div +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/lop3.md b/content/cuda/docs/ptx/instructions/integer/references/lop3.md new file mode 100644 index 00000000..a5a5f8ea --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/lop3.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: lop3 + +`lop3` is a three-input lookup-table logical operation that can express any three-input boolean function. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `lop3` +- Commonly used to fuse multiple boolean instructions into a single logical operation + +## Key Constraints + +- The 8-bit immediate truth table defines the boolean function. +- The type suffix must match the input bit widths. + +## Usage Notes + +- Use `lop3` to compress multi-stage boolean logic into one instruction where possible. +- Keep LUT constants named and documented, because readability drops quickly with raw immediates. 
+ +## Example (PTX Style, Illustrative) + +```ptx +lop3.b32 d, a, b, c, immLut; +``` + +## Official Source Links (Fact Check) + +- lop3: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-lop3 +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/mad24.md b/content/cuda/docs/ptx/instructions/integer/references/mad24.md new file mode 100644 index 00000000..9af5e531 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/mad24.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: mad24 + +`mad24` adds a third operand on top of the `mul24` result and supports variants such as `.hi`/`.lo` and saturated modes. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `mad24` + +## Official Syntax (Excerpt) + +```ptx +mad24.mode.type d, a, b, c; +mad24.hi.sat.s32 d, a, b, c; +``` + +## Key Semantics + +- `.lo`: adds `c` to the low 32 bits of a 24x24 product. +- `.hi`: adds `c` to the high 32 bits of a 24x24 product. +- `.hi` may be slower when there is no dedicated 24-bit multiplication hardware. 
+ +## Example (PTX Style) + +```ptx +mad24.lo.s32 d, a, b, c; +``` + +## Official Source Links (Fact Check) + +- mad24: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mad24 +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/max.md b/content/cuda/docs/ptx/instructions/integer/references/max.md new file mode 100644 index 00000000..dbffde0b --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/max.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: max + +`max` returns the larger of two operands and is commonly used for threshold clamping and range constraints. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `max` + +## Key Constraints + +- Semantics depend on the data type and the variant suffix. +- For floating-point variants, refer to the official NaN semantics. + +## Usage Notes + +- Use signed variants for signed ranges and unsigned variants for raw bit-pattern ranges. +- For clamp logic, combine `max` and `min` in a fixed order to keep behavior predictable. + +## Common Failure Modes + +- Signed/unsigned variant mismatch causes incorrect ordering around high-bit values. +- Floating-point `max` behavior is assumed identical to host-language helper semantics without NaN checks. 
+ +## Example (PTX Style) + +```ptx +max.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- max: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-max +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/min.md b/content/cuda/docs/ptx/instructions/integer/references/min.md new file mode 100644 index 00000000..b4e82664 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/min.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: min + +`min` returns the smaller of two operands and supports integer/float variants (as defined in the official section). + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `min` + +## Key Constraints + +- Result semantics are determined by the type and suffix. +- For floating-point comparison paths, pay attention to NaN handling (see the corresponding section notes). + +## Usage Notes + +- Pick variant suffixes to match the intended numeric ordering (`.s*` vs `.u*`). +- Use together with `max` to build branchless bound enforcement. + +## Common Failure Modes + +- Bound-check logic reverses `min`/`max` order and silently changes clamp behavior. +- Integer and floating-point minima are mixed in shared helper paths without variant-specific handling. 
+ +## Example (PTX Style) + +```ptx +min.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- min: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-min +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/mul24.md b/content/cuda/docs/ptx/instructions/integer/references/mul24.md new file mode 100644 index 00000000..d3da559c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/mul24.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: mul24 + +`mul24` returns either the high 32 bits or the low 32 bits of a 48-bit result from a 24x24-bit multiplication (depending on the `.hi`/`.lo` mode). + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `mul24` + +## Official Syntax (Excerpt) + +```ptx +mul24.mode.type d, a, b; +``` + +## Key Semantics + +- `.lo`: returns the low 32 bits of the 48-bit product. +- `.hi`: returns the high 32 bits of the 48-bit product. +- The documentation notes that on some hardware, `.hi` may be less efficient. 
+ +## Example (PTX Style) + +```ptx +mul24.lo.s32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- mul24: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-mul24 +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/not.md b/content/cuda/docs/ptx/instructions/integer/references/not.md new file mode 100644 index 00000000..6e5b5723 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/not.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: not + +`not` performs a bitwise inversion and is a fundamental instruction for mask construction and logical complement operations. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `not` + +## Key Constraints + +- Destination width must match the intended inversion domain. +- Inversion of packed fields should be followed by masking when only partial bits are valid. +- Do not treat `not` as arithmetic negation; semantics are bitwise inversion. + +## Usage Notes + +- Use `not` for complement masks and branchless bit-condition rewrites. +- Pair with `and` to isolate relevant inverted ranges in packed representations. 
+ +## Example (PTX Style) + +```ptx +not.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- not: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-not +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/or.md b/content/cuda/docs/ptx/instructions/integer/references/or.md new file mode 100644 index 00000000..19b1d02f --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/or.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: or + +`or` is a bitwise OR instruction and belongs to the Logic and Shift instruction family. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `or` + +## Key Constraints + +- Operand widths and type suffixes must match the selected variant. +- Bit-layout assumptions should be documented before combining packed fields. +- Use explicit constants with matching width to avoid implicit truncation confusion. + +## Usage Notes + +- Use `or` to compose flags and packed-bit fields after proper masking/shift steps. +- Prefer readable staged composition over opaque one-line bit merges in critical code. 
+ +## Example (PTX Style) + +```ptx +or.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- or: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-or +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/popc.md b/content/cuda/docs/ptx/instructions/integer/references/popc.md new file mode 100644 index 00000000..cc54d3f3 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/popc.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: popc + +`popc` (population count) counts the number of set bits in a binary value. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `popc` +- Common uses include mask counting, bitset operations, and compact encoding + +## Key Constraints + +- The input bit width determines the counting range. +- The result type must be able to hold the maximum count. + +## Usage Notes + +- Common for bitset density metrics, mask compaction prepasses, and voting summaries. +- Validate accumulator width if multiple `popc` results are aggregated. 
+ +## Example (PTX Style) + +```ptx +popc.b32 d, a; +``` + +## Official Source Links (Fact Check) + +- popc: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-popc +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/prmt.md b/content/cuda/docs/ptx/instructions/integer/references/prmt.md new file mode 100644 index 00000000..5e974da2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/prmt.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: prmt + +`prmt` (permute) reorders bytes/nibbles under selection control and is suitable for bit-level data rearrangement. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `prmt` +- Common in encoding/decoding and data-layout adjustments + +## Key Constraints + +- The control mask determines the reorder sources and order. +- Ensure the permute mode matches the input data layout. + +## Usage Notes + +- Prefer `prmt` for byte-lane rearrangement when scalar mask/shift sequences become instruction-heavy. +- Keep test vectors for endianness-sensitive paths to catch layout mistakes early. 
+ +## Example (PTX Style, Illustrative) + +```ptx +prmt.b32 d, a, b, c; +``` + +## Official Source Links (Fact Check) + +- prmt: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-prmt +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/sad.md b/content/cuda/docs/ptx/instructions/integer/references/sad.md new file mode 100644 index 00000000..327d561b --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/sad.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: sad + +`sad` (sum of absolute differences) computes the sum of absolute differences and is commonly used in image processing and distance-related operators. + +## Official Description + +- Documentation section: Integer Arithmetic Instructions: `sad` + +## Key Constraints + +- Operand types and bit widths must match the variant suffix. +- The accumulation width must be able to hold the sum result across multiple elements. + +## Usage Notes + +- Use `sad` for low-overhead distance accumulation in matching and scoring loops. +- Validate accumulation range early when chaining multiple `sad` stages. + +## Common Failure Modes + +- Accumulation width is too narrow for multi-stage reductions and overflows silently. +- Input packing assumptions differ between producer and `sad` consumer paths. 
+ +## Example (PTX Style, Illustrative) + +```ptx +sad.u32 d, a, b, c; +``` + +## Official Source Links (Fact Check) + +- sad: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions-sad +- Integer arithmetic instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#integer-arithmetic-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/selp.md b/content/cuda/docs/ptx/instructions/integer/references/selp.md new file mode 100644 index 00000000..e3fd8db0 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/selp.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: selp + +`selp` selects between two operands based on a predicate and is commonly used for branchless conditional assignment. + +## Official Description + +- Documentation section: Comparison and Selection Instructions: `selp` +- Commonly used as an alternative to simple if/else to reduce branch divergence + +## Key Constraints + +- The predicate operand must be a valid predicate. +- The source/destination types must match the `selp` suffix. +- When strict numeric semantics are required, ensure that the value types are fully consistent. + +## Usage Notes + +- Use `selp` to remove short divergent branches when both candidate values are already available. +- Keep expensive side-effecting work outside `selp` paths because values are produced before selection. 
+ +## Example (PTX Style) + +```ptx +selp.s32 d, a, b, p; +``` + +## Official Source Links (Fact Check) + +- selp: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions-selp +- Comparison and Selection instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/setp.md b/content/cuda/docs/ptx/instructions/integer/references/setp.md new file mode 100644 index 00000000..a7cb1d21 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/setp.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: setp + +`setp` is a core instruction that compares and writes a predicate register, used to build conditional branching and predicated (masked) execution. + +## Official Description + +- Documentation section: Comparison and Selection Instructions: `setp` +- Generates a predicate result based on the comparison relation; commonly used with `@p bra` and `selp` + +## Key Constraints + +- The comparison operand types must match the variant suffix. +- The result is written to a predicate register and can be used later as a predication condition. +- For floating-point comparisons, pay attention to NaN-related comparison semantics. + +## Usage Notes + +- Use `setp + selp` for branchless value selection in divergence-sensitive paths. +- Keep predicate lifetimes short and explicit to avoid accidental predicate reuse bugs. 
+ +## Example (PTX Style) + +```ptx +setp.lt.s32 p, a, b; +@p bra L_true; +``` + +## Official Source Links (Fact Check) + +- setp: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions-setp +- Comparison and Selection instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#comparison-and-selection-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/shf.md b/content/cuda/docs/ptx/instructions/integer/references/shf.md new file mode 100644 index 00000000..ef268de5 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/shf.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: shf + +`shf` provides shift/concatenation semantics that combine left and right operands (see the specific variants in the official section). + +## Official Description + +- Documentation section: Logic and Shift Instructions: `shf` + +## Key Constraints + +- The shift amount and mode must follow the variant definition. +- Commonly used for wide-data rearrangement and efficient shift sequences. + +## Usage Notes + +- Use `wrap` forms for rotate-like behavior and `clamp` forms for bounded lane extraction behavior. +- Prefer `shf` over manual shift/or sequences when modeling cross-word shifts. + +## Common Failure Modes + +- `wrap` and `clamp` semantics are confused, causing incorrect bit propagation. +- Shift-count origin is not normalized and produces architecture-dependent behavior in edge cases. 
+ +## Example (PTX Style, Illustrative) + +```ptx +shf.l.wrap.b32 d, a, b, c; +``` + +## Official Source Links (Fact Check) + +- shf: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shf +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/shl.md b/content/cuda/docs/ptx/instructions/integer/references/shl.md new file mode 100644 index 00000000..8ee34659 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/shl.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: shl + +`shl` is a left-shift instruction, used for bit extension and constructing high-bit alignment. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `shl` + +## Key Constraints + +- The shift amount should be within the legal range for the bit width. +- For computations related to signed semantics, carefully verify overflow behavior. + +## Usage Notes + +- Treat `shl` as a bit operation, not a safe arithmetic multiply substitute under overflow-sensitive logic. +- Keep shift-count provenance explicit when inputs may exceed legal ranges. + +## Common Failure Modes + +- Shift counts exceed legal bit width assumptions and produce unexpected masked behavior. +- Arithmetic intent is encoded with `shl` where overflow handling is actually required. 
+ +## Example (PTX Style) + +```ptx +shl.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- shl: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shl +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/shr.md b/content/cuda/docs/ptx/instructions/integer/references/shr.md new file mode 100644 index 00000000..c754681e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/shr.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: shr + +`shr` is a right-shift instruction that supports logical/arithmetic right shifts (depending on the variant suffix). + +## Official Description + +- Documentation section: Logic and Shift Instructions: `shr` + +## Key Constraints + +- The signed/unsigned suffix affects the high-bit fill semantics. +- The shift amount must be within the allowed bit-width range. + +## Usage Notes + +- Use signed variants for arithmetic right shift and unsigned variants for logical right shift. +- Audit downstream mask/extract logic when switching between `.s*` and `.u*` variants. + +## Common Failure Modes + +- Logical right shift is expected but arithmetic variant is used under signed types. +- Post-shift masking is omitted when consumers assume zero-filled high bits. 
+ +## Example (PTX Style) + +```ptx +shr.u32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- shr: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-shr +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/integer/references/xor.md b/content/cuda/docs/ptx/instructions/integer/references/xor.md new file mode 100644 index 00000000..89408c33 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/integer/references/xor.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: xor + +`xor` performs a bitwise XOR and is commonly used for mask toggling and simple encryption/checksum paths. + +## Official Description + +- Documentation section: Logic and Shift Instructions: `xor` + +## Key Constraints + +- Operand width/type suffixes must match legal ISA variants. +- For parity/checksum style paths, define whether truncation at each stage is acceptable. +- Avoid mixing signed arithmetic assumptions with pure bitwise transformations. + +## Usage Notes + +- Use `xor` for parity checks, mask toggles, and cheap difference markers. +- In lock-free protocols, avoid overloading `xor` logic with unclear state encoding. 
+ +## Example (PTX Style) + +```ptx +xor.b32 d, a, b; +``` + +## Official Source Links (Fact Check) + +- xor: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions-xor +- Logic and Shift instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#logic-and-shift-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/special-registers/DOC.md b/content/cuda/docs/ptx/instructions/special-registers/DOC.md new file mode 100644 index 00000000..c0829c7f --- /dev/null +++ b/content/cuda/docs/ptx/instructions/special-registers/DOC.md @@ -0,0 +1,49 @@ +--- +name: ptx-special-registers +description: "PTX special registers reference for ISA 9.2 with common usage patterns." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,special-registers,tid,ctaid" +--- + +# PTX Special Registers + +Special registers provide execution context such as thread indices, grid information, and SM-related details. + +## Common Registers + +- `%tid`: thread index within the CTA +- `%ntid`: CTA dimensions +- `%ctaid`: CTA index within the thread grid +- `%nctaid`: total number of CTAs in the grid (per dimension) +- `%smid`: SM ID (target related) + +## Usage Notes + +- Rely on special registers directly only when low-level control is truly needed. +- When inferring scheduling/topology, first verify that the target ISA is supported and the semantics are stable. 
+ +## Example + +```ptx +mov.u32 r0, %tid.x; +mov.u32 r1, %ctaid.x; +``` + +## Official Source Links (Fact Check) + +- Special Registers: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers +- %tid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-tid +- %ctaid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-ctaid +- %smid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-smid + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics + +- `references/tid-ctaid.md` +- `references/activemask.md` diff --git a/content/cuda/docs/ptx/instructions/special-registers/references/activemask.md b/content/cuda/docs/ptx/instructions/special-registers/references/activemask.md new file mode 100644 index 00000000..bef3a82a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/special-registers/references/activemask.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: activemask + +`activemask` is used to retrieve the current active thread mask and is commonly used in warp-level cooperative algorithms. + +## Official Description + +- Documentation section: Miscellaneous Instructions: `activemask` +- Commonly used together with warp primitives such as `shfl.sync` and `vote.sync` + +## Key Constraints + +- The mask value reflects the set of active threads at the current execution point. +- If used after branch divergence, ensure the mask semantics are well understood. + +## Usage Notes + +- Read `activemask` as late as possible on the path that consumes it. +- Keep `membermask` derivation stable when chaining `shfl.sync` and `vote.sync`. +- Avoid reusing masks captured before divergence points. 
+ +## Example (PTX Style, Illustrative) + +```ptx +activemask.b32 r_mask; +``` + +## Official Source Links (Fact Check) + +- activemask: https://docs.nvidia.com/cuda/parallel-thread-execution/#miscellaneous-instructions-activemask +- Miscellaneous instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#miscellaneous-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/special-registers/references/tid-ctaid.md b/content/cuda/docs/ptx/instructions/special-registers/references/tid-ctaid.md new file mode 100644 index 00000000..9c7b5948 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/special-registers/references/tid-ctaid.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: %tid and %ctaid + +`%tid` and `%ctaid` are the most commonly used index registers. They represent a thread's position within a CTA and the CTA's position within the grid, respectively. + +## Typical Usage + +```ptx +mov.u32 r_tid, %tid.x; +mov.u32 r_cta, %ctaid.x; +``` + +## Usage Notes + +- `%tid` / `%ctaid` are read-only special registers. +- The dimension components (`.x/.y/.z`) must match how the kernel is organized. + +## Common Failure Modes + +- Assuming 1D launch indexing while kernels are configured as 2D/3D. +- Mixing CTA-level and global index formulas across helper functions. +- Recomputing indices with mismatched integer width when problem size exceeds 32-bit ranges. + +## Indexing Reminder + +- Build global index formulas with explicit dimension strides (`blockDim` and `gridDim`) to avoid shape-dependent bugs. 
+ +## Official Source Links (Fact Check) + +- Special Registers: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers +- %tid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-tid +- %ctaid: https://docs.nvidia.com/cuda/parallel-thread-execution/#special-registers-ctaid + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/DOC.md b/content/cuda/docs/ptx/instructions/sync-comm/DOC.md new file mode 100644 index 00000000..3bb0a928 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/DOC.md @@ -0,0 +1,66 @@ +--- +name: ptx-sync-comm-instructions +description: "PTX synchronization and communication instructions with scope-aware usage in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,synchronization,mbarrier,barrier" +--- + +# PTX Synchronization and Communication + +This page covers core synchronization and communication primitives such as `barrier`, `mbarrier`, `atom`, `red`, and `fence`. + +## Official Semantics Excerpts (Key Points) + +- PTX documentation notes: asynchronous copy completion can be tracked via async-group or mbarrier mechanisms. +- For `cp.async`, if you do not use `wait_group/wait_all` or an mbarrier, the synchronization relationship does not hold. +- `cp.async.bulk`-related `complete-tx` operations on mbarrier provide `.release` and `.cluster` semantics (see the section definitions). + +## Common Patterns + +```ptx +// Initiate the async transfer first, then observe completion via mbarrier. +cp.async.bulk.shared::cta.global.mbarrier::complete_tx::bytes [dst], [src], size, [mbar]; +// ... +mbarrier.test_wait.acquire.shared::cta.b64 p, [mbar], state; +``` + +## Usage Notes + +- First determine the scope, then apply semantic modifiers (acquire/release/relaxed). +- Explicitly connect the producer completion point to the consumer-visible point. 
+- When using `atom` together with async copies, carefully review ordering relationships. + +## Official Source Links (Fact Check) + +- Parallel Synchronization and Communication Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- barrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-barrier +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics + +- `references/mbarrier-test-wait.md` +- `references/barrier.md` +- `references/atom.md` +- `references/membar-fence.md` +- `references/red.md` +- `references/elect-sync.md` +- `references/bar-sync.md` +- `references/atom-cas.md` +- `references/vote-sync.md` +- `references/match-sync.md` +- `references/shfl-sync.md` +- `references/mbarrier-arrive.md` +- `references/redux-sync.md` +- `references/mbarrier-arrive-drop.md` +- `references/cp-async-mbarrier-arrive.md` +- `references/bar-warp-sync.md` +- `references/fence-proxy.md` +- `references/membar-proxy.md` diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/atom-cas.md b/content/cuda/docs/ptx/instructions/sync-comm/references/atom-cas.md new file mode 100644 index 00000000..ae5ee78b --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/atom-cas.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: atom.cas + +`atom.cas` provides compare-and-swap atomic semantics and is a commonly used foundation instruction for lock-free data structures. + +## Official Notes + +- As part of the `atom` family, it has variants distinguished by address space and type. 
+
- The documentation lists version and architecture requirements for some low-bit-width variants (e.g., `atom.cas.b16`).

## Usage Notes

- Build a lock-free update path by combining CAS with retry loops.
- Clearly specify scope and semantic modifiers to avoid cross-thread visibility issues.
- Ensure the target address is naturally aligned for the selected data width.
- Keep producer/consumer memory-order assumptions consistent with the selected atom semantics.

## Common Failure Modes

- CAS retry loops omit backoff under heavy contention and stall forward progress.
- `expected` value reuse is incorrect after failed CAS attempts.
- Scope/semantic modifiers do not match producer-consumer visibility requirements.

## Example (PTX style)

```ptx
atom.gpu.global.cas.b32 old, [addr], expected, desired;
```

## Official Source Links (Fact Check)

- atom: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom
- atom.cas notes in atom section: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom
- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model

Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/atom.md b/content/cuda/docs/ptx/instructions/sync-comm/references/atom.md new file mode 100644 index 00000000..84d3368a --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/atom.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: atom + +`atom` provides atomic read-modify-write operations for concurrently updating shared/global state. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `atom` +- Common operations include add/min/max/cas/exch, etc. 
(depending on the type and state space) + +## Key Constraints + +- The combination of operand type and state space must match the specified variant. +- The memory semantics (e.g., acquire/release/relaxed) and the scope must satisfy synchronization requirements. +- Choosing the wrong scope can lead to results that look correct but are concurrency-unstable. + +## Usage Notes + +- Use the narrowest valid scope (`cta` before `gpu`/`sys`) to reduce coherence traffic. +- Prefer warp/block local aggregation before global atomics under high contention. + +## Example (PTX Style) + +```ptx +atom.global.add.u32 r_old, [addr], r_val; +``` + +## Official Source Links (Fact Check) + +- atom: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-atom +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model +- Scope and applicability: https://docs.nvidia.com/cuda/parallel-thread-execution/#scope-and-applicability-of-the-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/bar-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-sync.md new file mode 100644 index 00000000..92a3bc57 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-sync.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: bar.sync + +`bar.sync` is a commonly used barrier synchronization form that waits for the participating threads to rendezvous before continuing. + +## Official Notes + +- Supports variants with no thread count as well as variants with a thread count (see the section examples). +- Commonly used for phase transitions within a CTA and as boundaries for shared-memory reads/writes. + +## Example (PTX Style) + +```ptx +bar.sync 0; +bar.sync 1, 64; +``` + +## Usage Notes + +- Synchronize only the set of threads that participate in the same barrier protocol. 
+- Cannot replace specialized completion-wait mechanisms for `cp.async` / `wgmma`. +- Keep barrier identifier usage deterministic across all participating threads. +- Prefer full-CTA barriers unless a subset barrier protocol is explicitly designed and verified. + +## Common Failure Modes + +- Barrier id is reused by overlapping protocols in the same kernel phase. +- Some participant threads bypass the barrier due to conditional control flow. +- Barrier placement is correct for compute but misses shared-memory producer-consumer boundaries. + +## Official Source Links (Fact Check) + +- barrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-barrier +- bar.sync examples context: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/bar-warp-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-warp-sync.md new file mode 100644 index 00000000..d78ac5eb --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/bar-warp-sync.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: bar.warp.sync + +`bar.warp.sync` provides a warp-level synchronization barrier and is used for phase synchronization within a warp. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `bar.warp.sync` +- Finer-grained than CTA-level barriers + +## Key Constraints + +- The participation mask must match the threads that actually participate. +- Should not be used as a substitute for synchronization primitives across warps/CTAs. + +## Usage Notes + +- Prefer `bar.warp.sync` for intra-warp phase boundaries with explicit member masks. +- Recompute/propagate `membermask` carefully after divergent control flow. 
+ +## Example (PTX Style, Illustrative) + +```ptx +bar.warp.sync membermask; +``` + +## Official Source Links (Fact Check) + +- bar.warp.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-bar-warp-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/barrier.md b/content/cuda/docs/ptx/instructions/sync-comm/references/barrier.md new file mode 100644 index 00000000..9968b501 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/barrier.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: barrier + +The `barrier` family is used for thread-cooperative synchronization and is commonly used for phase transitions at the CTA/cluster level. + +## Official Description + +- Use `barrier` when you need threads to rendezvous before continuing. +- When you need to track completion of an asynchronous transfer, prefer the async-group / mbarrier mechanism specified in the documentation; do not use `barrier` as a substitute. + +## Key Constraints + +- All intended participants must reach the same barrier protocol point. +- Do not mix barrier identifiers/protocols across incompatible control paths. +- Use warp-level primitives instead when only warp-scope coordination is required. + +## Usage Notes + +- Reserve one barrier id per protocol stage to keep code auditing straightforward. +- Keep barrier placement symmetric across control-flow paths for all participating threads. 
+ +## Example (PTX style) + +```ptx +barrier.sync 0; +``` + +## Official Source Links (Fact Check) + +- barrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-barrier +- Parallel synchronization instruction set: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/cp-async-mbarrier-arrive.md b/content/cuda/docs/ptx/instructions/sync-comm/references/cp-async-mbarrier-arrive.md new file mode 100644 index 00000000..ced7db53 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/cp-async-mbarrier-arrive.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: cp.async.mbarrier.arrive + +`cp.async.mbarrier.arrive` maps “completion of a prior `cp.async` operation” to an mbarrier arrive-on event. + +## Official Syntax + +```ptx +cp.async.mbarrier.arrive{.noinc}{.shared{::cta}}.b64 [addr]; +``` + +## Key Semantics + +- The system triggers the mbarrier arrive-on after the “`cp.async` completion that was initiated earlier by the current thread”. +- The arrive-on relative to the execution of `cp.async.mbarrier.arrive` itself is asynchronous. +- The documentation describes the ordering relationship with the prior `cp.async` and it is commonly used with `mbarrier.test_wait`. + +## Usage Notes + +- Use it to incorporate `cp.async` completion events into a unified mbarrier protocol. +- Keep it consistent with the participation count used by `mbarrier.init` to avoid count mismatches. +- Pair with explicit wait/check points before consumer loads from the staged region. +- Keep each async pipeline stage on a clear phase contract to avoid cross-stage completion confusion. 
+ +## Common Failure Modes + +- Completion events are wired to the wrong barrier instance in multi-stage pipelines. +- Stage counters are updated without matching `arrive` expectations. +- Consumer paths assume arrival implies full protocol completion without wait checks. + +## Official Source Links (Fact Check) + +- cp.async.mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +- cp.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async +- mbarrier.test_wait / try_wait: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/elect-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/elect-sync.md new file mode 100644 index 00000000..aedc7d24 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/elect-sync.md @@ -0,0 +1,35 @@ +# PTX Instruction Topic: elect.sync + +`elect.sync` elects a representative thread within a synchronization mask scope and is commonly used for role assignment within a warp. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `elect.sync` +- Produces a consistent “elected thread” result across the participating thread set + +## Usage Notes + +- Use when you need a single thread to execute management logic (e.g., writing shared metadata). +- Combine with synchronization primitives such as `bar` / `mbarrier` to ensure phase consistency. +- Ensure all participating threads execute with a consistent `membermask`. +- Pair leader-election paths with explicit broadcast or shared-memory publication when followers consume leader results. 
+

## Common Failure Modes

- Leader path writes metadata without synchronization before follower reads.
- Different `membermask` values are used across divergent paths in the same warp.
- Elected-lane assumptions are hard-coded and break under changed active-lane patterns.

## Example (PTX Style, Illustrative)

```ptx
elect.sync _|%p, membermask;
@%p // elected thread path
```

## Official Source Links (Fact Check)

- elect.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-elect-sync
- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions

Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/fence-proxy.md b/content/cuda/docs/ptx/instructions/sync-comm/references/fence-proxy.md new file mode 100644 index 00000000..905e9352 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/fence-proxy.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: fence.proxy + +`fence.proxy` establishes ordering relationships across proxies, especially for synchronization between proxies such as generic/async/tensormap. + +## Official Syntax (Excerpt) + +```ptx +fence.proxy.proxykind; +fence.proxy.to_proxykind::from_proxykind.release.scope; +fence.proxy.to_proxykind::from_proxykind.acquire.scope [addr], size; +``` + +## Key Semantics + +- Addresses ordering issues when the same memory location is accessed through different proxies. +- `fence.proxy.async` is used to synchronize between generic proxy and async proxy. +- The documentation provides the version and target-architecture requirements for `fence.proxy.async`. + +## Usage Notes + +- Apply `fence.proxy` only where cross-proxy visibility is a real requirement. +- Keep proxy-domain assumptions explicit in comments/protocol docs to avoid misuse. 
+ +## Common Failure Modes + +- Generic-proxy ordering is assumed to cover async/tensormap proxy access without explicit fence rules. +- Acquire/release direction is reversed for producer-consumer handoff. +- Fence scope is too narrow for the actual sharing domain. + +## Official Source Links (Fact Check) + +- membar / fence (including fence.proxy): https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- Asynchronous operations notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/match-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/match-sync.md new file mode 100644 index 00000000..d8bb154e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/match-sync.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: match.sync + +`match.sync` performs value matching within a synchronization mask scope and is used for warp-level grouping and consistency checks. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `match.sync` +- Can be used to build warp-level cooperative logic grouped by key + +## Key Constraints + +- The comparison value types must match the requirements of the specific variant. +- The participation mask must match the execution path to avoid distorted results. + +## Usage Notes + +- Use for warp-level key grouping before subgroup-local reductions or dispatch. +- Validate mask consistency in debug builds for paths with complex divergence. 
+ +## Example (PTX Style, Illustrative) + +```ptx +match.any.sync.b32 mask_out, value, membermask; +``` + +## Official Source Links (Fact Check) + +- match.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-match-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive-drop.md b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive-drop.md new file mode 100644 index 00000000..23b32dd1 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive-drop.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: mbarrier.arrive_drop + +`mbarrier.arrive_drop` removes the current thread from the set of subsequent participants while also performing the arrive-on action. + +## Official Syntax (Excerpt) + +```ptx +mbarrier.arrive_drop{.sem.scope}{.shared{::cta}}.b64 state, [addr]{, count}; +mbarrier.arrive_drop{.sem.scope}{.shared::cluster}.b64 _, [addr]{, count}; +mbarrier.arrive_drop.noComplete{.release.cta}{.shared{::cta}}.b64 state, [addr], count; +``` + +## Key Semantics + +- Used by threads that “exit/not participate anymore” in the mbarrier protocol. +- The `.release` variant forms a release pattern and can synchronize with an acquire side. +- If the `.noComplete` variant leads to the phase completing, the behavior is undefined. +- In scenarios that use only `.shared::cluster` (not the current CTA), the destination operand must be `_`. + +## Usage Notes + +- Use `arrive_drop` when thread participation shrinks across pipeline phases. +- Reconcile participant counts with `mbarrier.init` contract to avoid deadlocks. 
+ +## Official Source Links (Fact Check) + +- mbarrier.arrive_drop: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop +- mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-arrive +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive.md b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive.md new file mode 100644 index 00000000..cdff6f21 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-arrive.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: mbarrier.arrive + +`mbarrier.arrive` performs an arrive-on operation on a specified mbarrier and is a commonly used producer-side primitive for asynchronous workflows. + +## Official Syntax (Excerpt) + +```ptx +mbarrier.arrive{.sem.scope}{.shared{::cta}}.b64 state, [addr]{, count}; +mbarrier.arrive.expect_tx{.sem.scope}{.shared{::cta}}.b64 state, [addr], txCount; +mbarrier.arrive.noComplete{.release.cta}{.shared{::cta}}.b64 state, [addr], count; +``` + +## Key Semantics + +- For a `.shared::cta` mbarrier, an opaque `state` value can be returned to represent the phase. +- For scenarios that use only `.shared::cluster` (not the current CTA), the target operand must be the sink `_`. +- The `.noComplete` variant must not cause the current phase to complete; otherwise, behavior is undefined. +- The `.release` semantics can synchronize with a consumer-side acquire mode. + +## Usage Notes + +- Use `state` together with `mbarrier.test_wait/try_wait` to avoid phase-mixing confusion. +- For remote cluster barrier scenarios, strictly follow the sink rules. 
+ +## Official Source Links (Fact Check) + +- mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-arrive +- mbarrier.test_wait / try_wait: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-test-wait.md b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-test-wait.md new file mode 100644 index 00000000..279c53d7 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/mbarrier-test-wait.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: mbarrier.test_wait / mbarrier.try_wait + +`mbarrier.test_wait` / `mbarrier.try_wait` are used to test whether an mbarrier phase has completed and are commonly used wait primitives on the consumer side for asynchronous transfers. + +## Official Syntax (Excerpt) + +```ptx +mbarrier.test_wait{.sem.scope}{.shared{::cta}}.b64 waitComplete, [addr], state; +mbarrier.test_wait.parity{.sem.scope}{.shared{::cta}}.b64 waitComplete, [addr], phaseParity; +``` + +## Key Semantics + +- `test_wait` is a non-blocking test. +- When used with `.acquire` and returning `True`, it forms an acquire mode (see the memory model section). +- `.scope` defaults to `.cta` when not explicitly specified. + +## Version and Target + +- Documentation indicates `mbarrier.test_wait` was introduced in PTX ISA 7.0. +- Documentation indicates it requires `sm_80` or higher. 
+ +## Minimal Mode + +```ptx +mbarrier.test_wait.shared::cta.b64 p, [mbar_addr], state; +@!p bra retry; +``` + +## Official Source Links (Fact Check) + +- mbarrier.test_wait / try_wait: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +- mbarrier family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/membar-fence.md b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-fence.md new file mode 100644 index 00000000..0c66feaf --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-fence.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: membar / fence + +`membar`/`fence` establish ordering for memory accesses and are fundamental primitives for correctness in concurrent execution. + +## Official Syntax (Excerpt) + +```ptx +membar.gl; +membar.cta; +membar.sys; +fence.sc.cta; +fence.sc.cluster; +``` + +## Key Semantics + +- `membar` ensures that prior memory accesses in the current thread are observed before subsequent accesses at the specified level. +- The documentation explains that `fence.sc` can restore sequential consistency at sufficient locations, but with a higher cost. +- On `sm_70+`, the semantic relationship between `membar` and `fence.sc` is clearly documented as being compatible (see the section notes). 
+ +## Version and Target + +- `membar.{cta,gl}`: introduced in PTX ISA 1.4 +- `membar.sys`: introduced in PTX ISA 2.0, requires `sm_20+` +- `membar.proxy` / `fence.proxy`: introduced in PTX ISA 7.5 + +## Official Source Links (Fact Check) + +- membar / fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model +- Scope and applicability: https://docs.nvidia.com/cuda/parallel-thread-execution/#scope-and-applicability-of-the-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/membar-proxy.md b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-proxy.md new file mode 100644 index 00000000..cd5b0cf2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/membar-proxy.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: membar.proxy + +`membar.proxy` is a cross-proxy ordering primitive, historically tied to `fence.proxy` via semantic mapping. + +## Official Description + +- Defined in the `membar/fence` section as the relationship between `membar.proxy` and `fence.proxy`. +- The documentation notes that on `sm_70+`, `membar.proxy` and `fence.proxy` are synonymous. + +## Version and Target + +- `membar.proxy` / `fence.proxy`: introduced in PTX ISA 7.5 +- `membar.proxy`: requires `sm_60+` +- `fence.proxy`: requires `sm_70+` + +## Usage Notes + +- Use proxy fences only when data crosses proxy domains (for example, async-proxy to generic-proxy handoff). +- Do not substitute proxy fences for full protocol synchronization (`mbarrier`/barrier) when completion must also be tracked. 
+ +## Example (PTX style) + +```ptx +membar.proxy.alias; +``` + +## Official Source Links (Fact Check) + +- membar / fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- PTX ISA notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/red.md b/content/cuda/docs/ptx/instructions/sync-comm/references/red.md new file mode 100644 index 00000000..973ec0ef --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/red.md @@ -0,0 +1,33 @@ +# PTX Instruction Topic: red + +`red` is a parallel reduction-update instruction family: it performs an atomic reduction on a specified memory location and writes the result back to the same location (overwriting the original value). + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `red` +- Compared with `atom`/`atom.*`, the core semantic of `red` is reduction write-back; whether an additional destination register exists/ is used depends on the specific variant syntax (see the corresponding ISA subsection). + +## Key Constraints + +- The operation type and the target address space must match the specific `red` variant. +- Concurrency semantics depend on the specified memory semantics and scope. +- It must be used together with consumer-side synchronization primitives to ensure visibility. + +## Usage Notes + +- Use `red` when you only need in-place accumulation and do not require the previous value. +- Combine with hierarchical reduction (warp/block first) to reduce global contention. 
+ +## Example (PTX Style) + +```ptx +red.global.add.u32 [addr], r1; +``` + +## Official Source Links (Fact Check) + +- red: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-red +- red.async: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-red-async +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/redux-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/redux-sync.md new file mode 100644 index 00000000..492a8b09 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/redux-sync.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: redux.sync + +`redux.sync` provides a synchronized reduction operation used for mask-based reduction computations within a thread group. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `redux.sync` +- Applicable to reduction scenarios that require a synchronized participation set + +## Key Constraints + +- `membermask` must correctly cover participating threads. +- The data type and reduction operator must match the instruction variant. +- The overall synchronization protocol must still be satisfied with subsequent consumer paths. + +## Usage Notes + +- Use `redux.sync` for compact warp-scope reductions when shared-memory staging is unnecessary. +- Keep mask construction stable across control-flow paths to avoid partial participation bugs. 
+ +## Example (PTX Style, Illustrative) + +```ptx +redux.sync.add.s32 r_out, r_in, membermask; +``` + +## Official Source Links (Fact Check) + +- redux.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/shfl-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/shfl-sync.md new file mode 100644 index 00000000..2c5c79a9 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/shfl-sync.md @@ -0,0 +1,32 @@ +# PTX Instruction Topic: shfl.sync + +`shfl.sync` exchanges register data within a warp and is commonly used for warp-level communication and reductions. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `shfl.sync` +- Commonly used for warp-level broadcast, down-scan, up-scan, and cross-lane exchange + +## Key Constraints + +- `membermask` must correctly describe the participating threads. +- lane indices and width parameters must follow the variant definition. +- Confirm that the target architecture supports this synchronized shuffle semantic before use. + +## Usage Notes + +- Use `shfl.sync` for warp-local broadcast and tree reductions to reduce shared-memory traffic. +- Keep lane mapping logic explicit when mixing `bfly`, `up`, `down`, and indexed shuffle forms. 
+ +## Example (PTX Style, Illustrative) + +```ptx +shfl.sync.bfly.b32 r_out, r_in, laneMask, clamp, membermask; +``` + +## Official Source Links (Fact Check) + +- shfl.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-shfl-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/sync-comm/references/vote-sync.md b/content/cuda/docs/ptx/instructions/sync-comm/references/vote-sync.md new file mode 100644 index 00000000..828fb66e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/sync-comm/references/vote-sync.md @@ -0,0 +1,31 @@ +# PTX Instruction Topic: vote.sync + +`vote.sync` performs a boolean reduction vote within the participating threads mask and is commonly used for warp-level conditional aggregation. + +## Official Description + +- Documentation section: Parallel Synchronization and Communication Instructions: `vote.sync` +- Common uses: determine whether any thread/all threads satisfy a condition + +## Key Constraints + +- The participating mask must match the actual set of active threads. +- When the vote result drives control flow, avoid mask mismatches that can cause semantic deviations. + +## Usage Notes + +- Use `vote.sync.any/all` for fast warp agreement checks before expensive work. +- Keep mask derivation adjacent to the vote site for easier correctness auditing. 
+ +## Example (PTX Style, Illustrative) + +```ptx +vote.sync.any.pred p_out, p_in, membermask; +``` + +## Official Source Links (Fact Check) + +- vote.sync: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-vote-sync +- Parallel synchronization instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/DOC.md b/content/cuda/docs/ptx/instructions/tcgen05/DOC.md new file mode 100644 index 00000000..e5ccb682 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/DOC.md @@ -0,0 +1,46 @@ +--- +name: ptx-tcgen05-instructions +description: "PTX TensorCore 5th Generation (tcgen05) entry and B-series related constraints." +metadata: + languages: "cpp" + versions: "9.2" + revision: 1 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,tcgen05,tensorcore,b-series,sm100,sm120" +--- + +# PTX tcgen05 (TensorCore 5th Generation) + +This directory focuses on tcgen05 entry points and B-series architectural constraints, without duplicating the `wgmma` details already covered elsewhere. + +## Core Positioning + +- tcgen05 is the TensorCore 5th Generation entry point in the PTX documentation. +- It is tightly related to WGMMA, mixed/alternate precision, and new type qualifier constraints. +- Multiple capabilities in the documentation are bound to `sm_100`/`sm_120` family feature thresholds. 
+ +## Recommended Reading + +- `references/overview.md` +- `references/arch-gating.md` +- `references/wgmma-tcgen05-relationship.md` +- `references/b-series-checklist.md` + +## Further Reading + +- `references/tcgen05-mma-kinds.md` +- `references/tcgen05-block-scale.md` +- `references/tcgen05-sm120a-restrictions.md` +- `references/tcgen05-sm100-sm120-mapping.md` +- `references/tcgen05-alt-fp-types.md` +- `references/tcgen05-sparse-path.md` +- `references/tcgen05-migration-playbook.md` + +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- WGMMA: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/arch-gating.md b/content/cuda/docs/ptx/instructions/tcgen05/references/arch-gating.md new file mode 100644 index 00000000..c009c65f --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/arch-gating.md @@ -0,0 +1,34 @@ +# tcgen05 Architecture Gating (B-Series) + +tcgen05-related capabilities are highly coupled in the official documentation with `sm_100`/`sm_120` and the `a/f` family conditions. + +## Architecture Gating Recommendations + +- Abstract “availability” as capabilities (e.g., `has_tcgen05`, `has_alt_fp`, `has_cp_mask`). +- Filter instruction templates by capabilities before generating kernels. +- Explicitly avoid or degrade restricted types on `sm_120a` (especially sub-byte / alternate fp). + +## Usage Notes + +- Gate at kernel-generation time, not only at runtime dispatch, to avoid generating illegal templates. +- Keep capability probing and fallback policy versioned with PTX/CUDA upgrade milestones. + +## Common Failure Modes + +- Capability flags are defined but not enforced during code emission. 
+- `sm_120a` restrictions are checked for compute path but missed for data-movement path. +- Fallback kernels compile but violate numerical contract due to dtype drift. + +## Minimal Test Matrix + +- `sm_100*` dense path with baseline type combinations. +- `sm_120a` restricted-type path with fallback validation. +- `sm_120f` extended-support path with regression parity checks. + +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- cp.async.bulk.tensor restrictions context: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/b-series-checklist.md b/content/cuda/docs/ptx/instructions/tcgen05/references/b-series-checklist.md new file mode 100644 index 00000000..c37ba630 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/b-series-checklist.md @@ -0,0 +1,32 @@ +# B-Series Implementation Checklist (tcgen05-related) + +For quick verification before engineering rollout. + +## Checklist + +- [ ] `.target` matches the actual deployment architecture (`sm_100`/`sm_120`). +- [ ] All tcgen05/WGMMA variants have passed capability gating. +- [ ] Relevant async protocols (fence/commit/wait) are complete. +- [ ] `sm_120a` restricted types have been checked and have fallbacks. +- [ ] Linked scenarios with TMA paths have completed correctness regression testing. + +## Release Notes for Reviewers + +- Record the capability matrix used during generation and testing. +- Include sparse and alternate-FP coverage status explicitly in release notes. +- Document fallback behavior when tcgen05 constraints fail on target hardware. 
+ +## Minimum Evidence Package + +- One correctness report per architecture family (`sm_100*`, `sm_120*`) with capability-gated variants. +- One protocol trace confirming async fence/commit/wait ordering on representative kernels. +- One numerical report covering dense, sparse, and alternate-FP routes. + +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/overview.md b/content/cuda/docs/ptx/instructions/tcgen05/references/overview.md new file mode 100644 index 00000000..e465621c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/overview.md @@ -0,0 +1,33 @@ +# tcgen05 Overview + +tcgen05 is the entry point chapter for TensorCore 5th Generation in PTX 9.2, covering capabilities and constraints related to new-generation matrix computations. + +## Core Capability Axes + +- Which data types, MMA kinds, and qualifiers are legal on the target architecture. +- Which capability subsets are gated by `sm_100*` and `sm_120*` families. +- Which async protocols are mandatory when composed with WGMMA/TMA paths. + +## Usage Notes + +- Treat tcgen05 as a capability map, then bind concrete instruction templates after gating. +- Keep architecture, type, and protocol checks in one validation layer to avoid drift. + +## Common Failure Modes + +- Selecting a legal MMA shape with an illegal type/scale combination. 
+- Assuming support transfers across architecture variants without checking target notes. +- Reusing async synchronization recipes that are valid for WGMMA but incomplete for tcgen05 composition. + +## Quick Start Checklist + +- Confirm architecture capability before selecting instruction templates. +- Validate `kind`/type/scale combinations before code generation. +- Verify async protocol completion before accumulator consumption. + +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- WGMMA MMA Async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-alt-fp-types.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-alt-fp-types.md new file mode 100644 index 00000000..b1635146 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-alt-fp-types.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: Alternate Floating-Point Types + +This page focuses on usage constraints for common alternate FP types on the tcgen05 paths (e.g., `.e2m1/.e3m2/.e2m3`). + +## Official Notes + +- The documentation provides legal combinations of these types with `.kind`, shape, and `scale_vec_size`. +- Multiple entries explicitly tie support conditions to `sm_120a` / `sm_120f`. + +## Engineering Guidance + +- Build a separate numerical regression baseline for alternate FP paths. +- Bind the type-support matrix and architecture thresholds to the same configuration source. + +## Usage Notes + +- Keep alternate-FP enablement behind explicit feature flags in kernel selection logic. +- Store tolerance thresholds per type family instead of sharing one global tolerance. + +## Common Failure Modes + +- Alternate-FP kernels pass shape checks but fail hidden type-combination rules. 
+- Tolerances copied from FP16/BF16 baselines under-report alternate-FP drift. +- Architecture gating is checked for compute ops but missed for related async transfer paths. + +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- Warp-level MMA instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-block-scale.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-block-scale.md new file mode 100644 index 00000000..5abe2698 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-block-scale.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: block_scale and scale_vec_size + +This page covers `scale_vec_size` constraints in tcgen05-related `block_scale` paths. + +## Official Notes + +- `.block_scale` indicates that A/B matrices are scaled by `scale_A/scale_B` before multiply-add. +- `scale_vec_size` determines the shape of the scale matrix and how the selector is interpreted. +- Different `.kind` entries allow different values of `scale_vec_size` (the document tables define legal combinations). + +## B-Series Guidance + +- Do static validation using the triplet “kind + stype + scale_vec_size”. +- Check legal combinations before compilation to avoid runtime undefined behavior. + +## Usage Notes + +- Keep scale tensor layout and selector interpretation documented next to kernel templates. +- Validate block-scale metadata generation on host side before launching compute kernels. + +## Common Failure Modes + +- Correct compute opcode with invalid scale metadata layout. +- `scale_vec_size` chosen from template defaults without checking `.kind` constraints. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- MMA instructions (block scale context): https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-migration-playbook.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-migration-playbook.md new file mode 100644 index 00000000..af48cb45 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-migration-playbook.md @@ -0,0 +1,35 @@ +# tcgen05 Migration Playbook (From “Works” to “Stable”) + +This page provides a minimal process for taking tcgen05 from “compiles” to “ready for stable production.” + +## Four-Step Process + +1. Architecture gate check: first determine whether `sm_100*`/`sm_120*` are available. +2. Combination validity check: verify `kind + stype + scale_vec_size`. +3. Protocol correctness check: ensure fences/commit/wait on the async path and full mbarrier participation. +4. Numerical and performance regression: establish baselines separately for alternate FP and sparse paths. + +## Exit Criteria + +- All generated kernels pass architecture-gated validation without manual overrides. +- Async protocol traces show correct fence/commit/wait ordering under stress inputs. +- Numerical tolerance and performance deltas are recorded for dense and sparse variants. + +## Common Failure Modes + +- Migration stops at "compiles" without protocol or numerical regression coverage. +- Sparse and alternate-FP paths share the same baseline, hiding path-specific drift. +- Fallback policy is undocumented, leading to deployment-time behavior changes. + +## Rollback Readiness + +- Keep a tested fallback path for unsupported architecture/type combinations. +- Version migration decisions with reproducible benchmark and correctness artifacts. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-mma-kinds.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-mma-kinds.md new file mode 100644 index 00000000..8c591676 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-mma-kinds.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: MMA kind Family (f8f6f4 / mxf4 / mxf4nvf4 / mxf8f6f4) + +This page focuses on the `.kind` families for tcgen05-related MMA and their engineering meaning. + +## Official Notes + +- The documentation lists families such as `.kind::f8f6f4`, `.kind::mxf4`, `.kind::mxf4nvf4`, and `.kind::mxf8f6f4`. +- Different `.kind` entries impose different restrictions on data packing, optional `scale_vec_size`, and available type combinations. + +## B-Series Guidance + +- Treat `.kind` as a first-class capability parameter at the code-generation level. +- Enforce an explicit `scale_vec_size` for `mxf4nvf4` (per the official rules). + +## Usage Notes + +- Carry `.kind` through scheduling, metadata generation, and validation stages as one parameter. +- Keep fallback templates keyed by `.kind` to avoid silent conversion to unsupported combinations. + +## Common Failure Modes + +- Selecting `.kind` by benchmark speed only, without validating legal type combinations. +- Forgetting to propagate `.kind` choice into sparse/scale metadata generation. +- Using shared fallback code paths that silently change `.kind`-dependent numerics. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- MMA block-scale/kind context: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm100-sm120-mapping.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm100-sm120-mapping.md new file mode 100644 index 00000000..5fe365ff --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm100-sm120-mapping.md @@ -0,0 +1,31 @@ +# tcgen05 Topic: sm_100 to sm_120 Mapping + +This page describes a mapping approach for tcgen05-related capabilities across the `sm_100*` and `sm_120*` families. + +## Mapping Approach + +- `sm_100a/sm_120a`: typically indicates earlier availability or more strict gating paths. +- `sm_100f/sm_120f`: the documentation frequently notes “higher targets within the same family provide support.” +- Specific functionality should follow the Target ISA notes in the corresponding section; do not infer across sections. + +## Implementation Guidance + +- Encode architecture checks as a `supports(feature, sm)` function. +- Let the generator degrade along the feature dimension instead of scattering many `#if` in kernel source. + +## Usage Notes + +- Centralize mapping rules in one table consumed by codegen, runtime dispatch, and tests. +- Keep mapping updates synchronized with CUDA/PTX version bumps. + +## Common Failure Modes + +- Hard-coding architecture assumptions per kernel instead of using shared mapping logic. +- Conflating "supported on family" with "supported on every family variant." 
+ +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm120a-restrictions.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm120a-restrictions.md new file mode 100644 index 00000000..0413a1b2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sm120a-restrictions.md @@ -0,0 +1,34 @@ +# tcgen05 Topic: sm_120a Restrictions and Notes + +This page distills the restriction entries in the PTX documentation that are directly related to `sm_120a`. + +## Official Signals + +- Multiple sections explicitly state that `sm_120a` is the initial support point, while `sm_120f` provides later support within the same family. +- Some sub-byte / alternate floating-point types have restriction notes for `sm_120a`. +- Asynchronous tensor paths such as `cp.async.bulk.tensor` have dedicated restrictions entries for `sm_120a`. + +## Engineering Guidance + +- Maintain a separate “disabled types list” for `sm_120a`. +- First perform dedicated testing on `sm_120a` for a new kernel, then expand to `sm_120f`. + +## Common Failure Modes + +- Assuming `sm_120f` support implies `sm_120a` parity for all type paths. +- Missing fallback coverage for restricted alternate-FP and sub-byte routes. +- Validating only throughput and skipping correctness checks on restricted configurations. + +## Verification Checklist + +- Confirm restricted-type disables are active on `sm_120a`. +- Confirm fallback kernels preserve numerical contract and output layout. +- Re-run protocol validation for async tensor paths under restricted modes. 
+ +## Official Source Links (Fact Check) + +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- cp.async.bulk.tensor restrictions: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sparse-path.md b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sparse-path.md new file mode 100644 index 00000000..b110f7db --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/tcgen05-sparse-path.md @@ -0,0 +1,33 @@ +# tcgen05 Topic: Sparse MMA Paths + +This page covers key points for tcgen05-related sparse MMA (`mma.sp`) across the `kind` / `block-scale` dimensions. + +## Official Notes + +- The sparse path introduces additional metadata/selectors operands. +- When combined with block scale, you still must satisfy valid combinations of `kind`/`stype`/`scale_vec_size`. +- The documentation specifies architectural requirements for alternate FP sparse paths. + +## Engineering Guidance + +- Build separate templates for sparse and dense paths to avoid cross-contaminating parameters. +- Perform boundary and consistency checks for metadata and selector parameters on the host side. + +## Common Failure Modes + +- Reusing dense-path metadata assumptions in sparse kernels. +- Sparse selector tensors generated with mismatched shape/stride conventions. +- Shared regression suites miss sparse-only corner cases. + +## Verification Checklist + +- Validate sparse metadata shape/range before kernel launch. +- Compare sparse and dense numerical baselines under identical problem shapes. +- Confirm architecture gates for sparse alternate-FP combinations. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- Warp-level MMA sparse context: https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-matrix-instructions-mma-sp + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tcgen05/references/wgmma-tcgen05-relationship.md b/content/cuda/docs/ptx/instructions/tcgen05/references/wgmma-tcgen05-relationship.md new file mode 100644 index 00000000..ee158bfc --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tcgen05/references/wgmma-tcgen05-relationship.md @@ -0,0 +1,35 @@ +# Relationship Between WGMMA and tcgen05 + +In the current PTX structure, WGMMA is a high-frequency entry point at the implementation level, while tcgen05 provides the generational capability and constraint framework. + +## Practical Relationship + +- First check the capability boundaries of tcgen05, then choose the specific WGMMA variant. +- WGMMA depends on the `wgmma.fence` + `commit_group` + `wait_group` protocol. +- Async paths involve async proxies and require matching fence/wait semantics. + +## Usage Notes + +- Use tcgen05 gating to decide whether WGMMA templates are eligible before launch configuration tuning. +- Keep one protocol contract per pipeline stage to avoid mixing WGMMA and non-WGMMA completion logic. + +## Common Failure Modes + +- Choosing a WGMMA template first and discovering tcgen05 incompatibility late in the pipeline. +- Reusing wait-group thresholds across kernels with different stage depth and tile size. +- Assuming fence semantics are interchangeable across all async producer-consumer chains. + +## Integration Checklist + +- Gate tcgen05 capability before WGMMA template selection. +- Validate fence/commit/wait sequencing under representative stage depth. +- Confirm accumulator-read boundaries are protected by matching wait semantics. 
+ +## Official Source Links (Fact Check) + +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- wgmma.fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.wait_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tma/DOC.md b/content/cuda/docs/ptx/instructions/tma/DOC.md new file mode 100644 index 00000000..beda7b1e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/DOC.md @@ -0,0 +1,56 @@ +--- +name: ptx-tma-instructions +description: "PTX Tensor Memory Accelerator related instructions and usage constraints in ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,tma,async,memory" +--- + +# PTX TMA + +Tensor Memory Accelerator (TMA) instructions move tensor tiles asynchronously with explicit completion protocols. + +## Representative Syntax + +```ptx +cp.async.bulk.tensor.1d.shared::cta.global.mbarrier::complete_tx::bytes.tile [dstMem], [tensorMap, {tc}], [mbar]; +``` + +This is a representative form. Actual variants add dimension, source/destination state-space, completion mechanism, and multicast/reduction modifiers. + +## Key Semantics + +- TMA operations are asynchronous and require explicit completion handling before consumer use. +- Completion may use mbarrier-based `complete_tx` or bulk-group wait mechanisms depending on variant. +- Memory visibility and ordering follow PTX asynchronous-operation rules and proxy semantics. 
+ +## Common Constraints + +- `tensorMap` descriptors and coordinate operands must be valid for the selected dimension/layout form. +- Variant-specific modifiers (for example multicast/reduce forms) require matching operand lists. +- Alignment, shape, and state-space combinations must match ISA restrictions for the target architecture. + +## Usage Recommendations + +- First validate correctness with a single-stage movement/compute loop. +- Add staged pipelining only after synchronization boundaries are explicit and correct. +- Keep a fallback path for architectures or types that do not support your chosen TMA variant. + +## Official Source Links (fact check) + +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- Tensor Map: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last verified date: 2026-03-19 + +## Single-Instruction References + +- `references/cp-async-bulk-tensor.md` +- `references/cp-reduce-async-bulk.md` +- `references/multimem-cp-reduce-async-bulk.md` diff --git a/content/cuda/docs/ptx/instructions/tma/references/cp-async-bulk-tensor.md b/content/cuda/docs/ptx/instructions/tma/references/cp-async-bulk-tensor.md new file mode 100644 index 00000000..0849efbe --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/references/cp-async-bulk-tensor.md @@ -0,0 +1,48 @@ +# PTX Instruction Note: cp.async.bulk.tensor (TMA) + +`cp.async.bulk.tensor` is the core PTX TMA instruction family for asynchronous tensor movement between selected state spaces. 
+ +## Official Syntax (Excerpt) + +```ptx +cp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.cta_group}{.level::cache_hint} + [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colInfo}{, cache-policy} +``` + +```ptx +cp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.multicast}{.cta_group}{.level::cache_hint} + [dstMem], [tensorMap, tensorCoords], [mbar]{, im2colInfo}{, ctaMask}{, cache-policy} +``` + +```ptx +cp.async.bulk.tensor.dim.dst.src{.load_mode}.completion_mechanism{.level::cache_hint} + [tensorMap, tensorCoords], [srcMem]{, cache-policy} +``` + +## Key Semantics + +- The instruction is asynchronous and requires an explicit completion protocol before consumer use. +- Completion mechanism is variant-dependent (`.mbarrier::complete_tx::bytes` or `.bulk_group` in eligible forms). +- Source/destination state-space and modifier choices determine valid operand templates. +- Memory ordering and visibility follow PTX asynchronous-operation and proxy rules. + +## Common Constraints + +- `tensorMap` and coordinate operands must match dimension, load mode, and layout requirements. +- Multicast and CTA-group modifiers require correct target-mask or grouping operands. +- Architecture/type restrictions apply to specific variants; verify against the ISA restrictions section. 
+ +## Example (PTX style) + +```ptx +cp.async.bulk.tensor.1d.shared::cta.global.tile.mbarrier::complete_tx::bytes [sMem0], [tensorMap0, {tc0}], [mbar0]; +``` + +## Official Source Links (fact check) + +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- Tensor Map: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-map +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- mbarrier: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tma/references/cp-reduce-async-bulk.md b/content/cuda/docs/ptx/instructions/tma/references/cp-reduce-async-bulk.md new file mode 100644 index 00000000..3f1724b2 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/references/cp-reduce-async-bulk.md @@ -0,0 +1,31 @@ +# PTX Instruction Note: cp.reduce.async.bulk + +`cp.reduce.async.bulk` is an async bulk reduction-copy instruction that performs element-wise reduction during transfer. + +## Official Syntax (Excerpt) + +```ptx +cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type [dstMem], [srcMem], size, [mbar]; +cp.reduce.async.bulk.dst.src.completion_mechanism.add.noftz.type [dstMem], [srcMem], size, [mbar]; +``` + +## Key Semantics + +- The instruction is non-blocking and issues asynchronous reduction work. +- `.mbarrier::complete_tx::bytes`: executes complete-tx on mbarrier at completion. +- `.bulk_group`: uses bulk async-group completion. +- The docs classify this path as a weak memory operation; reduction has `.relaxed.gpu` semantics. + +## Detailed Constraints (Official Highlights) + +- `size` specifies equal source/destination array length. +- `add.f16/add.bf16` requires `.noftz`. +- Some sub-byte types are unsupported (see restrictions section). 
+ +## Official Source Links (fact check) + +- cp.reduce.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk +- Async data movement instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-data-movement-and-conversion-instructions +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/tma/references/multimem-cp-reduce-async-bulk.md b/content/cuda/docs/ptx/instructions/tma/references/multimem-cp-reduce-async-bulk.md new file mode 100644 index 00000000..cb99dc9e --- /dev/null +++ b/content/cuda/docs/ptx/instructions/tma/references/multimem-cp-reduce-async-bulk.md @@ -0,0 +1,39 @@ +# PTX Instruction Note: multimem.cp.reduce.async.bulk + +`multimem.cp.reduce.async.bulk` performs asynchronous bulk copy-reduction to multi-memory targets. + +## Official Syntax (Excerpt) + +```ptx +multimem.cp.reduce.async.bulk.dst.src.completion_mechanism.redOp.type [dstMem], [srcMem], size; +``` + +## Key Semantics + +- The operation is asynchronous and reduction-enabled across multi-memory destinations. +- Completion semantics follow the selected completion mechanism for this variant family. +- Memory ordering and visibility behavior follow PTX memory-consistency and async-operation rules. + +## Common Constraints + +- Reduction operator and data type must be a legal ISA combination. +- `size` and address ranges must match source/destination requirements. +- Architecture restrictions apply; verify the target ISA and restrictions sections. + +## Usage Notes + +- Use this path when multi-memory reduction transport is required by system-level sharding design. +- Validate completion mechanism selection against downstream consumer synchronization points. 
+ +## Common Failure Modes + +- Reduction operator is valid in isolation but illegal for the selected multimem variant. +- Completion mechanism is correct for copy but insufficient for consumer visibility requirements. + +## Official Source Links (fact check) + +- multimem.cp.reduce.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-multimem-cp-reduce-async-bulk +- cp.reduce.async.bulk: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-reduce-async-bulk +- Memory consistency model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model + +Last verified date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/DOC.md b/content/cuda/docs/ptx/instructions/wgmma/DOC.md new file mode 100644 index 00000000..2e4ed82c --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/DOC.md @@ -0,0 +1,48 @@ +--- +name: ptx-wgmma-instructions +description: "PTX warpgroup-level matrix multiply-accumulate instructions and constraints for ISA 9.2." +metadata: + languages: "cpp" + versions: "9.2" + revision: 2 + updated-on: "2026-03-19" + source: official + tags: "cuda,ptx,wgmma,mma,tensorcore,wmma,tensor-core,matrix-multiply,matrix-multiply-accumulate" +--- + +# PTX WGMMA + +WGMMA is used for warpgroup-level matrix multiply-accumulate and targets high-throughput Tensor Core paths. + +## Feature Positioning + +- Compared with traditional `mma`, WGMMA is designed for higher-level cooperative execution. +- It is commonly combined with asynchronous movement (e.g., TMA) to reduce data waiting. + +## Key Constraints + +- The combination of tile shape, layout, and dtype must fully match the specification. +- Instruction availability depends on the target architecture (see the Target ISA notes). +- Asynchronous compute paths require corresponding wait/synchronization mechanisms. 
+ +## Example (Structural Illustrative) + +```ptx +// Specific operand formats should follow the official section. +wgmma.mma_async.sync.aligned ...; +``` + +## Official Source Links (Fact Check) + +- Asynchronous Warpgroup Level Matrix Instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions + +Last cross-check date: 2026-03-19 + +## Single-instruction Topics + +- `references/wgmma-mma-async.md` +- `references/wgmma-commit-group.md` +- `references/wgmma-wait-group.md` +- `references/wgmma-fence.md` diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-commit-group.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-commit-group.md new file mode 100644 index 00000000..d73eee26 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-commit-group.md @@ -0,0 +1,34 @@ +# PTX Instruction Topic: wgmma.commit_group + +`wgmma.commit_group` commits the currently uncommitted batch of `wgmma.mma_async` as one wgmma-group. + +## Official Syntax + +```ptx +wgmma.commit_group.sync.aligned; +``` + +## Key Semantics + +- Each warpgroup creates a new wgmma-group and collects previously uncommitted `wgmma.mma_async`. +- If there are no uncommitted operations, it creates an empty group. +- `.sync` requires threads within the warp to rendezvous at the same instruction point. +- `.aligned` requires all threads in the warpgroup to execute the same `commit_group`; inconsistencies under conditional branches lead to undefined behavior. + +## Usage Notes + +- Commit once per pipeline stage after all stage-local `wgmma.mma_async` instructions are issued. +- Keep commit boundaries consistent across all participating warps in the warpgroup. 
+ +## Common Failure Modes + +- Multiple commits are issued for one logical stage due to divergent control paths. +- Commit is skipped on one warp path, causing wait-group protocol mismatch later. + +## Official Source Links (Fact Check) + +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- Async warpgroup matrix instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-fence.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-fence.md new file mode 100644 index 00000000..611fa1a3 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-fence.md @@ -0,0 +1,36 @@ +# PTX Instruction Topic: wgmma.fence + +`wgmma.fence` is used to constrain the ordering boundary of register accesses related to `wgmma.mma_async`. + +## Official Key Semantics + +- The documentation explicitly states that you must use `wgmma.fence` before `wgmma.mma_async` to isolate the related register accesses; otherwise behavior is undefined. +- It is typically combined with `wgmma.commit_group` / `wgmma.wait_group` to form a complete execution protocol. + +## Usage Notes + +- Insert `wgmma.fence` at stage boundaries where operand register ownership changes. +- Keep fence placement identical across participating warpgroup threads. + +## Common Failure Modes + +- Omitting fence on one path in a conditionally structured pipeline. +- Assuming `commit_group` alone is sufficient for register-handoff correctness. 
+ +## Usage Patterns (Illustrative) + +```ptx +wgmma.fence.sync.aligned; +wgmma.mma_async.sync.aligned ...; +wgmma.commit_group.sync.aligned; +wgmma.wait_group.sync.aligned 0; +``` + +## Official Source Links (Fact Check) + +- wgmma.fence: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-fence +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.wait_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-mma-async.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-mma-async.md new file mode 100644 index 00000000..7ddcb003 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-mma-async.md @@ -0,0 +1,41 @@ +# PTX Instruction Topic: wgmma.mma_async + +`wgmma.mma_async` is a warpgroup-level asynchronous matrix multiply-accumulate instruction that runs on the async proxy. + +## Official Syntax (Excerpt) + +```ptx +wgmma.mma_async.sync.aligned.shape.dtype.f16.f16 d, a-desc, b-desc, scale-d, imm-scale-a, imm-scale-b, imm-trans-a, imm-trans-b; +wgmma.mma_async.sync.aligned.shape.dtype.tf32.tf32 d, a-desc, b-desc, scale-d, imm-scale-a, imm-scale-b; +``` + +## Key Semantics + +- The instruction executes on the async proxy, and an implicit generic-async proxy fence occurs upon completion. +- You must use mechanisms such as `wgmma.commit_group` + `wgmma.wait_group` to wait for completion. +- The documentation emphasizes: `wgmma.fence` must be used to isolate the related register accesses; otherwise behavior is undefined. 
+ +## Parameter Constraints (High-Risk) + +- `imm-trans-a` / `imm-trans-b` only allow 0 or 1. +- For floating-point variants, `imm-scale-a` / `imm-scale-b` only allow -1 or 1. +- The `shape` / `dtype` / descriptor layout must match the official matrix fragment definitions. + +## Usage Notes + +- Keep descriptor generation and shape selection in one helper to avoid operand mismatch. +- Pair each issued async MMA stage with explicit commit and wait boundaries before accumulator reads. + +## Common Failure Modes + +- Descriptor layout matches shape but not selected dtype variant. +- `imm-scale-*` values are propagated from host config without variant validation. +- Register consumption starts before wait-group completion for the corresponding stage. + +## Official Source Links (Fact Check) + +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- Asynchronous warpgroup matrix instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- Async proxy notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-wait-group.md b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-wait-group.md new file mode 100644 index 00000000..dc6e83f3 --- /dev/null +++ b/content/cuda/docs/ptx/instructions/wgmma/references/wgmma-wait-group.md @@ -0,0 +1,34 @@ +# PTX Instruction Topic: wgmma.wait_group + +`wgmma.wait_group` waits for the wgmma-group to complete and is a necessary step before reading the results of `wgmma.mma_async`. + +## Official Syntax + +```ptx +wgmma.wait_group.sync.aligned N; +``` + +## Key Semantics + +- Wait until the number of the most recent pending groups does not exceed `N`, and earlier groups have completed. 
+- `N=0` means waiting for all previously submitted groups to complete. +- The documentation states that if you access the accumulator / related input registers without waiting for the group that contains the target `wgmma.mma_async`, the behavior is undefined. +- `.sync` and `.aligned` have the same execution-consistency requirements as `commit_group`. + +## Usage Notes + +- Tune `N` according to pipeline depth and register-pressure limits. +- Place `wait_group` immediately before accumulator consumption boundaries. + +## Common Failure Modes + +- Using `N` from a different kernel stage depth and reading incomplete accumulators. +- Waiting too early and collapsing overlap between async compute stages. + +## Official Source Links (Fact Check) + +- wgmma.wait_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-wait-group +- wgmma.commit_group: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-commit-group +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/abi-and-calling-convention.md b/content/cuda/docs/ptx/references/abi-and-calling-convention.md new file mode 100644 index 00000000..8066e9ed --- /dev/null +++ b/content/cuda/docs/ptx/references/abi-and-calling-convention.md @@ -0,0 +1,35 @@ +# PTX ABI and Calling Convention (9.2) + +PTX abstracts the ABI and calling convention at the `.entry` and `.func` levels; the parameter space and symbol declarations affect call correctness. + +## Key Points + +- `.entry`: kernel entry point, typically launched from the host side. +- `.func`: device function callable within PTX. +- Parameters are typically passed through the `.param` space. +- Function declarations and definitions must be consistent in symbols and parameters. 
+ +## Common Mistakes + +- Mixing `.entry` and `.func` parameter rules. +- Inline PTX that ignores calling conventions can violate register constraints. +- Inconsistent symbol definitions across multiple files during linking. + +## Usage Notes + +- Keep `.entry` and `.func` signatures in shared templates to prevent declaration drift. +- Validate parameter layout assumptions when integrating inline PTX with compiler-generated code. + +## Verification Checklist + +- Check symbol names and `.param` ordering across declarations and definitions. +- Confirm call sites use operand types consistent with callee parameter types. + +## Official Source Links (Fact Check) + +- Abstracting the ABI: https://docs.nvidia.com/cuda/parallel-thread-execution/#abstracting-the-abi +- Function Declarations and Definitions: https://docs.nvidia.com/cuda/parallel-thread-execution/#function-declarations-and-definitions +- Parameter State Space: https://docs.nvidia.com/cuda/parallel-thread-execution/#parameter-state-space +- Linking directives: https://docs.nvidia.com/cuda/parallel-thread-execution/#linking-directives + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/b-series-arch-matrix.md b/content/cuda/docs/ptx/references/b-series-arch-matrix.md new file mode 100644 index 00000000..c88064e6 --- /dev/null +++ b/content/cuda/docs/ptx/references/b-series-arch-matrix.md @@ -0,0 +1,35 @@ +# B-Series Architecture Capability Matrix (PTX 9.2) + +This page summarizes the target-architecture constraints in the PTX 9.2 documentation that are related to the B-Series, with a focus on `sm_100`/`sm_120` and their `a/f` family conditions. + +## Key Observations (from Official Sections) + +- Multiple instruction variants are explicitly marked as “requires `sm_100` or higher”. +- Some advanced variants use `sm_100a` / `sm_120a` as first-launch requirements, while also noting that `sm_100f` / `sm_120f` in the same family provide support in higher versions. 
+- `cp.async.bulk.tensor` and `cp.reduce.async.bulk.tensor` have type restriction entries for `sm_120a`. +- Certain modifiers related to `.multicast::cluster` and `.cp_mask` provide performance/availability notes on `sm_100+` paths. + +## Capability Matrix (Current Document View) + +| Capability Direction | Key Instructions/Features | Architecture Signals (Official Docs) | +|---|---|---| +| Asynchronous tensor movement | `cp.async.bulk.tensor` | `sm_100`/`sm_100a`/`sm_100f` and `sm_120a` restriction entries | +| Asynchronous reduction movement | `cp.reduce.async.bulk(.tensor)` | `sm_100+` paths + type restriction entries | +| Proxy synchronization enhancements | `fence.proxy.async` | Documented higher architecture thresholds (`sm_90`/`sm_100+` related) | +| Advanced MMA/TensorCore | `wgmma` + `tcgen05` family entry points | Documented new types and qualifier conditions on `sm_120`/`sm_120a` | + +## Usage Suggestions + +- For B-Series-specific paths, perform “target-architecture threshold checks” before generating code. +- Implement `a`/`f` family differences as explicit capability flags in the engineering codebase, rather than scattering them in kernel code. +- Validate all “new types/new qualifiers” via compilation and runtime checks on both `sm_100` and `sm_120` platforms. 
+ +## Official Source Links (Fact Check) + +- PTX main document: https://docs.nvidia.com/cuda/parallel-thread-execution/ +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/b-series-delta-from-hopper.md b/content/cuda/docs/ptx/references/b-series-delta-from-hopper.md new file mode 100644 index 00000000..594bcea8 --- /dev/null +++ b/content/cuda/docs/ptx/references/b-series-delta-from-hopper.md @@ -0,0 +1,32 @@ +# B-Series Delta Index vs. Hopper + +This page records the key PTX differences to focus on when migrating from Hopper (e.g., the `sm_90a` path) to the B-Series (`sm_100`/`sm_120`). + +## Observed Difference Directions + +- More instructions/modifiers are marked as available only under `sm_100+` in the documentation. +- `sm_120a` vs. `sm_120f` includes extra notes on certain types and qualifiers. +- TensorCore 5th Generation and related mixed/alternate-precision conditions are more common on the new-architecture paths. +- Asynchronous tensor movement and reduction paths (TMA / async bulk) include more architecture/type restriction entries. + +## Migration Checklist + +1. Check that `target` and compilation options match the intended target architecture. +2. Check whether any `sm_100+` threshold features are used (e.g., some cache/eviction/async proxy variants). +3. Check whether restricted types on `sm_120a` are triggered. +4. Perform minimal runnable regression tests for WGMMA / tcgen05 / TMA paths. 
+ +## Common Failure Modes + +- Porting instruction syntax while leaving Hopper-specific gating assumptions unchanged. +- Validating only dense compute paths and missing sparse/alternate-FP restrictions. +- Applying one fallback policy across `sm_100*` and `sm_120*` without feature-level checks. + +## Official Source Links (Fact Check) + +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- TensorCore 5th Generation: https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/h-series-special-instructions.md b/content/cuda/docs/ptx/references/h-series-special-instructions.md new file mode 100644 index 00000000..6a692622 --- /dev/null +++ b/content/cuda/docs/ptx/references/h-series-special-instructions.md @@ -0,0 +1,60 @@ +# H-Series (Hopper) Specialized Instructions and Mechanisms (Summary) + +This document is intended for engineering use and organizes capabilities in the PTX documentation that can be classified as first introduced in the H-Series (`sm_90/sm_90a`) or strongly related to it. + +## Key Takeaways + +- Hopper introduced and systematized the **cluster + async proxy + mbarrier + TMA + WGMMA** capability combination. +- Many of these capabilities were later extended in the B-Series, so they should be understood as “**H debuted, later inherited**.” +- For code generation, prefer capability gating over only looking at architecture codenames. + +## A. 
Core Capabilities Debuted in H and Inherited Later + +### 1) WGMMA Asynchronous Matrix Multiply-Accumulate Path + +- Representative instructions: `wgmma.mma_async`, `wgmma.fence`, `wgmma.commit_group`, `wgmma.wait_group` +- Typical meaning: a warpgroup-level asynchronous MMA protocol (initiate / commit / wait) + +### 2) TMA / Tensor Asynchronous Copy Path + +- Representative instruction: `cp.async.bulk.tensor` +- Related objects: `tensormap`, `prefetch.*.tensormap` +- Typical meaning: high-throughput tensor movement + dedicated completion mechanisms + +### 3) mbarrier Completion-Tracking System + +- Representative instructions: `mbarrier.arrive`, `mbarrier.arrive_drop`, `mbarrier.test_wait`, `mbarrier.try_wait` +- Related instruction: `cp.async.mbarrier.arrive` +- Typical meaning: explicitly ties asynchronous completion to visibility synchronization + +### 4) Cluster and Cross-Proxy Synchronization Mechanisms + +- Representative capabilities: `.cluster` scope, `fence.proxy.async` +- Typical meaning: ordering guarantees across paths for generic/async proxies + +## B. Common “requires sm_90+” Signals on the H Path (Examples) + +- Multiple `.cluster` scope instructions/modifiers are marked `requires sm_90 or higher` +- `.tensormap`-related paths are marked `requires sm_90 or higher` +- `fence.proxy.async` is marked `requires sm_90 or higher` +- Some bf16/bf16x2 and mixed-precision variants have explicit thresholds on the H path + +## Engineering Implementation Suggestions + +1. Break capabilities into feature flags (e.g., `has_wgmma`, `has_tma`, `has_mbarrier_cluster`, `has_proxy_async_fence`). +2. Do “capability detection -> instruction template selection” first, then perform kernel generation. +3. Reuse the same semantic checks for H and B, but apply different fallbacks based on `sm`. 
+ +## Official Source Links (Fact Check) + +- PTX main document: https://docs.nvidia.com/cuda/parallel-thread-execution/ +- WGMMA: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions +- wgmma.mma_async: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-instructions-wgmma-mma-async +- cp.async.bulk.tensor: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-tensor +- mbarrier family: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier +- cp.async.mbarrier.arrive: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive +- membar/fence (including proxy semantics): https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar-fence +- Asynchronous operations: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations +- Target ISA notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/instruction-format-and-operands.md b/content/cuda/docs/ptx/references/instruction-format-and-operands.md new file mode 100644 index 00000000..f40e953f --- /dev/null +++ b/content/cuda/docs/ptx/references/instruction-format-and-operands.md @@ -0,0 +1,36 @@ +# Instruction Format and Operands (9.2) +PTX instructions typically consist of a predicate, opcode, suffix, modifiers, and an operand array. Operand rules are one of the most common sources of errors. 
+ +## Instruction Components + +- Optional predicate: `@p` / `@!p` +- Opcode: e.g., `add`, `ld`, `cp.async` +- Type suffix: e.g., `.s32`, `.f32` +- Semantic modifiers: e.g., `.acquire`, `.release`, `.relaxed` +- Scope modifiers: e.g., `.cta`, `.cluster`, `.gpu`, `.sys` + +## Operand Check List + +- Whether the immediate ranges satisfy the definitions in the section +- Whether address operands come from valid state spaces +- Whether source/destination types permit implicit or explicit conversions +- Whether additional synchronization is required (e.g., waiting for an async copy) + +## Key Facts Related to Asynchronous Instructions + +The PTX documentation clearly states that `cp.async` operations do not provide completion-order guarantees by default; explicit synchronization is required using `cp.async.wait_all` / `cp.async.wait_group` or mbarrier. + +## Common Failure Modes + +- Modifier ordering is syntactically valid but semantically wrong for the intended memory model. +- Operand width and state-space assumptions diverge between template and instantiated code. +- Async instructions are emitted without matching wait or barrier completion paths. 
+ +## Official Source Links (Fact Check) + +- Instruction Statements: https://docs.nvidia.com/cuda/parallel-thread-execution/#instruction-statements +- Instruction Operands: https://docs.nvidia.com/cuda/parallel-thread-execution/#instruction-operands +- Operand Costs: https://docs.nvidia.com/cuda/parallel-thread-execution/#operand-costs +- Asynchronous Data Movement semantics: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-data-movement-and-conversion-instructions + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/memory-consistency-model.md b/content/cuda/docs/ptx/references/memory-consistency-model.md new file mode 100644 index 00000000..07a0ea8e --- /dev/null +++ b/content/cuda/docs/ptx/references/memory-consistency-model.md @@ -0,0 +1,35 @@ +# Memory Consistency Model (9.2) +PTX consistency model is defined by “semantics + scope + proxy”. Asynchronous instructions are typically modeled as weak memory operations. + +## Core Concepts + +- Semantics: `relaxed`, `acquire`, `release`, etc. +- Scope: `cta`, `cluster`, `gpu`, `sys` +- Proxies: generic proxy / async proxy, etc. + +## Focus for Asynchronous Paths + +- `cp.async` and `cp.async.bulk` belong to asynchronous copy paths. +- The documentation states that there is no ordering guarantee between `cp.async` operations unless you explicitly synchronize. +- After `cp.async.bulk` / `cp.reduce.async.bulk` completes, an implicit generic-async proxy fence is applied (see the section notes). +- `mbarrier complete-tx` has `.release` at `.cluster` scope semantics in the corresponding description. + +## Practical Recommendations + +- Establish the relationship between “transfer completion” and “visibility to consumers” using the specified mechanisms. +- When mixing `atom`/`fence`/`mbarrier`, draw the happens-before relationships before writing code. + +## Common Failure Modes + +- Using correct scope with wrong semantics (`relaxed` where release/acquire is required). 
+- Assuming async-copy completion implies ordering for unrelated memory operations. +- Combining proxy domains without explicit proxy fence rules. + +## Official Source Links (Fact Check) + +- Memory Consistency Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#memory-consistency-model +- Scope and applicability: https://docs.nvidia.com/cuda/parallel-thread-execution/#scope-and-applicability-of-the-model +- Parallel sync instructions: https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions +- Async operations and ordering notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/programming-model.md b/content/cuda/docs/ptx/references/programming-model.md new file mode 100644 index 00000000..cad10b6a --- /dev/null +++ b/content/cuda/docs/ptx/references/programming-model.md @@ -0,0 +1,31 @@ +# PTX Programming Model (9.2) + +The PTX programming model describes thread organization, execution hierarchy, state spaces, and function boundaries, and it is a prerequisite for understanding instruction semantics and synchronization scopes. + +## Structured Takeaways + +- Thread execution is organized at the CTA / cluster / grid hierarchy levels. +- Synchronization and visibility depend on the scope; you cannot assume visibility across scopes. +- Kernels (`.entry`) and functions (`.func`) differ in parameter and call boundaries. +- Asynchronous instructions (e.g., `cp.async`, `cp.async.bulk`, `wgmma.mma_async`) do not fully follow ordinary program order. + +## Practical Interpretation + +- Before choosing synchronization primitives, first determine which scope the data is shared within. +- When writing asynchronous copies or asynchronous MMA, you must explicitly wait for completion mechanisms (group or mbarrier). 
+- Do not infer cross-thread visibility from the apparent sequential execution behavior in a single thread. + +## Common Failure Modes + +- Selecting block-wide synchronization where cluster or system scope is required. +- Assuming async producer completion implies consumer visibility without explicit protocol completion. +- Mixing scope assumptions across helper kernels in the same pipeline. + +## Official Source Links (Fact Check) + +- Programming Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#programming-model +- Machine Model: https://docs.nvidia.com/cuda/parallel-thread-execution/#machine-model +- State Spaces: https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces +- Asynchronous operations notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-operations + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/release-notes-ptx-9.2.md b/content/cuda/docs/ptx/references/release-notes-ptx-9.2.md new file mode 100644 index 00000000..e1944d77 --- /dev/null +++ b/content/cuda/docs/ptx/references/release-notes-ptx-9.2.md @@ -0,0 +1,35 @@ +# PTX 9.2 Release Notes Index + +This page tracks PTX 9.2 newly added features, behavior changes, compatibility limitations, and target-architecture requirements. + +## Suggested Review Process + +1. First review the release notes to identify newly added/changed instructions. +2. Then consult the corresponding instruction section’s PTX ISA notes. +3. Finally review the Target ISA notes to determine availability under `.target sm_xx`. + +## Change Categories to Track + +- New instruction families and qualifiers. +- Semantic changes that alter ordering, completion, or undefined-behavior boundaries. +- Target gating updates that change legal deployment architectures. + +## Common Failure Modes + +- Applying syntax updates while missing semantic changes in the same release. +- Updating PTX templates without synchronizing architecture-gating logic. 
+- Treating release notes as optional and relying on historical behavior assumptions. + +## Verification Checklist + +- Re-run architecture-gating tests after release-note-driven template updates. +- Re-run numerical and protocol regression on kernels touched by updated instruction families. +- Revalidate fallback behavior on the oldest supported architecture target. + +## Official Source Links (Fact Check) + +- Release Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#release-notes +- PTX ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#ptx-isa-notes +- Target ISA Notes: https://docs.nvidia.com/cuda/parallel-thread-execution/#target-isa-notes + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/ptx/references/state-spaces-and-types.md b/content/cuda/docs/ptx/references/state-spaces-and-types.md new file mode 100644 index 00000000..3e366b62 --- /dev/null +++ b/content/cuda/docs/ptx/references/state-spaces-and-types.md @@ -0,0 +1,34 @@ +# PTX State Spaces and Types (9.2) + +PTX validity is jointly constrained by “state spaces + the type system”. Being syntactically correct alone is not sufficient to guarantee semantic correctness. + +## Common State Spaces + +- `.reg`: registers +- `.local`: thread-private memory +- `.shared`: CTA/cluster shared memory (depending on modifiers) +- `.global`: global memory +- `.const`: constant memory +- `.param`: parameter space + +## Common Type Families + +- Bit types: `.b8/.b16/.b32/.b64` +- Integer types: `.s*` / `.u*` +- Floating-point types: `.f16/.bf16/.tf32/.f32/.f64` +- Vector and packed types: commonly used in load/store, mma, and tensor operations + +## Practical Constraints + +- The address space for `ld/st/cp` must match the instruction variant. +- Arithmetic type suffixes must be compatible with the register declarations. +- Mixed-precision and tensor instructions often have stricter type/tile combination constraints. 
+ +## Official Source Links (Fact Check) + +- State Spaces: https://docs.nvidia.com/cuda/parallel-thread-execution/#state-spaces +- Types: https://docs.nvidia.com/cuda/parallel-thread-execution/#types +- Variables: https://docs.nvidia.com/cuda/parallel-thread-execution/#variables +- Parameter State Space: https://docs.nvidia.com/cuda/parallel-thread-execution/#parameter-state-space + +Last cross-check date: 2026-03-19 diff --git a/content/cuda/docs/randomness-and-reproducibility/DOC.md b/content/cuda/docs/randomness-and-reproducibility/DOC.md new file mode 100644 index 00000000..5bf83bfd --- /dev/null +++ b/content/cuda/docs/randomness-and-reproducibility/DOC.md @@ -0,0 +1,66 @@ +--- +name: randomness-and-reproducibility +description: "CUDA randomness and reproducibility essentials: RNG strategy, seed control, deterministic settings, and cross-run consistency pitfalls." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,reproducibility,determinism,randomness,seed,curand,atomic-order,numerical-variance" +--- + +# CUDA Randomness And Reproducibility (C++) + +Use this page when you need stable results across runs, devices, or software versions. + +## Reproducibility Scope + +Define what you need: + +- same run, same machine +- same machine across runs +- across GPUs/driver/toolkit versions + +The stricter the target, the more constraints you must apply. + +## RNG Strategy + +For random-number generation in CUDA pipelines: + +- use explicit seed management +- separate per-thread/sequence state deterministically +- avoid implicit global RNG side effects in hot kernels + +cuRAND is common for production-grade GPU RNG workflows. 
+ +## Determinism Pitfalls + +Even without RNG, floating-point results can vary due to: + +- reduction order changes +- atomic update ordering +- parallel scheduling differences +- precision/mode differences (for example Tensor Core math paths) + +Bitwise reproducibility is usually harder than statistical reproducibility. + +## Practical Checklist + +1. fix seeds and log them. +2. pin algorithm/mode choices that affect operation order. +3. define tolerance-based correctness checks when bitwise identity is unrealistic. +4. isolate non-deterministic kernels and test them separately. + +## Related Topics + +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` + +## Official Source Links (Fact Check) + +- cuRAND documentation: https://docs.nvidia.com/cuda/curand/index.html +- CUDA C++ Programming Guide, floating-point and parallel execution caveats: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/regression-testing-and-ci/DOC.md b/content/cuda/docs/regression-testing-and-ci/DOC.md new file mode 100644 index 00000000..8e95425e --- /dev/null +++ b/content/cuda/docs/regression-testing-and-ci/DOC.md @@ -0,0 +1,67 @@ +--- +name: regression-testing-and-ci +description: "CUDA regression testing and CI essentials: correctness baselines, tolerance strategy, perf guardrails, and multi-arch validation." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,testing,regression,ci,correctness,tolerance,performance-guardrail,multi-arch" +--- + +# CUDA Regression Testing And CI (C++) + +Use this page to keep CUDA kernels stable across optimizations and toolchain updates. 
+ +## Test Layers + +Keep separate layers: + +- functional correctness tests +- numerical tolerance tests +- performance regression tests + +Blending all three into one pass makes failures hard to diagnose. + +## Correctness Baselines + +- keep a trusted reference path (CPU or high-precision GPU) +- compare output shapes, boundary behavior, and representative edge cases +- include deterministic seeds for stochastic paths + +## Tolerance Policy + +Define tolerance per operator class and precision mode. + +- tighter for stable FP32 math +- looser but explicit for FP16/BF16/TF32 or nondeterministic orderings + +Store tolerance policy in code/config, not ad-hoc comments. + +## Performance Guardrails + +- track key benchmarks in CI (or scheduled perf jobs) +- compare against a baseline window, not a single run +- alert on sustained regression beyond threshold + +## Multi-Arch Validation + +When possible, validate across representative GPU classes. + +- architecture differences can expose hidden assumptions +- build matrices should reflect deployment reality + +## Related Topics + +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Randomness and reproducibility: `../randomness-and-reproducibility/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, verification and optimization workflow context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA Programming Guide, numerical/ordering considerations: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/runtime/DOC.md b/content/cuda/docs/runtime/DOC.md new file mode 100644 index 00000000..0ebf57e0 --- /dev/null +++ b/content/cuda/docs/runtime/DOC.md @@ -0,0 +1,151 @@ +--- +name: runtime +description: "CUDA 
Runtime API essentials for allocating memory, launching kernels, and managing streams." +metadata: + languages: "cpp" + versions: "12.4" + revision: 1 + updated-on: "2026-03-18" + source: community + tags: "cuda,gpu,kernel,runtime,api" +--- + +# CUDA Runtime API (C++) + +Use the CUDA Runtime API for most application-level kernel development. It provides a simpler model than the Driver API while still exposing streams, events, and device management. + +## Minimal End-to-End Example + +```cpp +#include <cuda_runtime.h> +#include <cstdlib> + +__global__ void saxpy(const float* x, const float* y, float* out, float a, int n) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) out[i] = a * x[i] + y[i]; +} + +int main() { + const int n = 1 << 20; + const size_t bytes = n * sizeof(float); + float *h_x = (float*)malloc(bytes); + float *h_y = (float*)malloc(bytes); + float *h_out = (float*)malloc(bytes); + + float *d_x = nullptr, *d_y = nullptr, *d_out = nullptr; + cudaMalloc(&d_x, bytes); + cudaMalloc(&d_y, bytes); + cudaMalloc(&d_out, bytes); + + cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice); + + const int threads = 256; + const int blocks = (n + threads - 1) / threads; + saxpy<<<blocks, threads>>>(d_x, d_y, d_out, 2.0f, n); + + cudaDeviceSynchronize(); + cudaMemcpy(h_out, d_out, bytes, cudaMemcpyDeviceToHost); + + cudaFree(d_x); + cudaFree(d_y); + cudaFree(d_out); + free(h_x); + free(h_y); + free(h_out); + return 0; +} +``` + +## Core Runtime APIs + +Use these first when building kernels: + +- `cudaMalloc`, `cudaFree` for device memory +- `cudaMemcpy`, `cudaMemcpyAsync` for transfers +- `cudaMemset` for initialization +- `cudaGetLastError`, `cudaDeviceSynchronize` for error detection +- `cudaStreamCreate`, `cudaStreamDestroy` for async execution +- `cudaEventCreate`, `cudaEventRecord`, `cudaEventElapsedTime` for timing + +## Error Handling Pattern + +Always check errors for: + +- the kernel launch (use `cudaGetLastError`) +- the execution (use 
`cudaDeviceSynchronize` or stream sync) + +See `references/error-handling.md` for a macro-based pattern. + +## Common Pitfalls + +- Forgetting to synchronize before reading results on the host +- Miscomputing grid size (off-by-one on tail elements) +- Assuming host memory is page-locked (use `cudaHostAlloc` if needed) +- Launching with too few blocks to cover all elements + +## When to Use Streams + +Use streams when: + +- You need overlap of copy and compute (`cudaMemcpyAsync`) +- You want concurrent kernels +- You want explicit ordering without global device sync + +## Related Topics + +- Error handling macro and diagnostics: `references/error-handling.md` +- Memory hierarchy overview: `../memory-hierarchy/DOC.md` +- Shared memory overview: `../shared-memory/DOC.md` +- Synchronization overview: `../synchronization/DOC.md` +- Coalescing overview: `../coalescing/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Execution model: `../execution-model/DOC.md` +- Compute throughput: `../compute-throughput/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- CUDA Core optimization checklist: `../cuda-core-optimization-checklist/DOC.md` +- Tensor Core usage: `../tensor-cores/DOC.md` +- WMMA kernel patterns: `../wmma-kernel-patterns/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- Kernel bottleneck diagnosis workflow: `../kernel-bottleneck-diagnosis-workflow/DOC.md` +- Memory-bound optimization playbook: `../memory-bound-kernel-optimization-playbook/DOC.md` +- Compute-bound optimization playbook: `../compute-bound-kernel-optimization-playbook/DOC.md` +- Launch-bound optimization playbook: `../launch-bound-optimization-playbook/DOC.md` +- 
Nsight metrics interpretation cheatsheet: `../nsight-metrics-interpretation-cheatsheet/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Cooperative Groups: `../cooperative-groups/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Thread Block Clusters: `../thread-block-clusters/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- Launch bounds and registers: `../launch-bounds-and-registers/DOC.md` +- Unified Memory: `../unified-memory/DOC.md` +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- Multi-GPU and peer access: `../multi-gpu-and-peer-access/DOC.md` +- Dynamic Parallelism: `../dynamic-parallelism/DOC.md` +- Error handling and debug build: `../error-handling-and-debug-build/DOC.md` +- cuBLAS/cuDNN integration patterns: `../cublas-cudnn-integration-patterns/DOC.md` +- NVTX and profiling workflow: `../nvtx-and-profiling-workflow/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Randomness and reproducibility: `../randomness-and-reproducibility/DOC.md` +- Fused kernel design patterns: `../fused-kernel-design-patterns/DOC.md` +- Build and ABI compatibility: `../build-and-abi-compatibility/DOC.md` +- Sparse and irregular kernels: `../sparse-and-irregular-kernels/DOC.md` +- Collective communication patterns: `../collective-communication-patterns/DOC.md` +- Benchmarking methodology: `../benchmarking-methodology/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` +- Data layout and alignment: `../data-layout-and-alignment/DOC.md` +- Cache behavior and access policy: `../cache-behavior-and-access-policy/DOC.md` +- Persistent kernels and work queues: `../persistent-kernels-and-work-queues/DOC.md` +- Production readiness checklist: `../production-readiness-checklist/DOC.md` +- Kernel API design 
guidelines: `../kernel-api-design-guidelines/DOC.md` +- Shape specialization and autotuning: `../input-shape-specialization-and-autotuning/DOC.md` +- Fallback strategies and capability detection: `../fallback-strategies-and-capability-detection/DOC.md` +- Incident response and rollback playbook: `../incident-response-and-rollback-playbook/DOC.md` +- PTX shared-memory async path: `../ptx/instructions/data-movement/references/cp-async.md` diff --git a/content/cuda/docs/runtime/references/error-handling.md b/content/cuda/docs/runtime/references/error-handling.md new file mode 100644 index 00000000..658aaf5e --- /dev/null +++ b/content/cuda/docs/runtime/references/error-handling.md @@ -0,0 +1,28 @@ +# CUDA Runtime Error Handling + +Use a small helper to surface errors early. Check both launch errors and runtime errors. + +```cpp +#include <cuda_runtime.h> +#include <cstdio> + +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ + exit(1); \ + } \ + } while (0) + +// Usage +CUDA_CHECK(cudaMalloc(&d_x, bytes)); +// After kernel launch +CUDA_CHECK(cudaGetLastError()); +CUDA_CHECK(cudaDeviceSynchronize()); +``` + +Notes: +- `cudaGetLastError()` catches launch errors. +- `cudaDeviceSynchronize()` surfaces runtime errors. +- For async workflows, prefer `cudaStreamSynchronize(stream)`. diff --git a/content/cuda/docs/shared-memory/DOC.md b/content/cuda/docs/shared-memory/DOC.md new file mode 100644 index 00000000..f35dfa8b --- /dev/null +++ b/content/cuda/docs/shared-memory/DOC.md @@ -0,0 +1,174 @@ +--- +name: shared-memory +description: "CUDA shared memory essentials: __shared__, dynamic shared memory, synchronization, bank conflicts, and async copy." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,shared-memory,sharedmem,smem,__shared__,dynamic-shared-memory,__syncthreads__,bank-conflict,bank-conflicts,bank-conflict-avoidance,padding,shared-memory-tiling,cp.async,mbarrier" +--- + +# CUDA Shared Memory (C++) + +Use this page when you need the CUDA C++ view of shared memory: what `__shared__` means, how dynamic shared memory is declared, when `__syncthreads()` is required, and how bank conflicts affect performance. + +## What Shared Memory Is + +In the CUDA C++ Programming Guide, `__shared__` declares storage that: + +- resides in the shared memory space of a thread block +- has the lifetime of the block +- has a distinct object per block +- is accessible only to threads in the same block + +This makes shared memory the standard scratchpad for cooperation within a block. + +## Static Shared Memory + +Use a compile-time-sized declaration when the storage size is fixed: + +```cpp +__global__ void saxpy_tile(const float* x, const float* y, float* out, int n) { + __shared__ float tile[256]; + + int tid = threadIdx.x; + int i = blockIdx.x * blockDim.x + tid; + + if (i < n) { + tile[tid] = x[i]; + } + __syncthreads(); + + if (i < n) { + out[i] = 2.0f * tile[tid] + y[i]; + } +} +``` + +Use this form when the tile shape is fixed and simple. + +## Dynamic Shared Memory + +Use `extern __shared__` when the size is determined at launch time: + +```cpp +__global__ void reduce_kernel(const float* input, float* output, int n) { + extern __shared__ float smem[]; + + int tid = threadIdx.x; + int i = blockIdx.x * blockDim.x + tid; + + smem[tid] = (i < n) ? 
input[i] : 0.0f; + __syncthreads(); + + for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (tid < stride) smem[tid] += smem[tid + stride]; + __syncthreads(); + } + + if (tid == 0) output[blockIdx.x] = smem[0]; +} + +// Launch with dynamic shared memory bytes: +// reduce_kernel<<<blocks, threads, smemBytes>>>(...); +``` + +The CUDA C++ Programming Guide notes that all `extern __shared__` variables start at the same address, so if you pack multiple arrays into dynamic shared memory you must manage offsets and alignment explicitly. + +## Synchronization Rule + +Use `__syncthreads()` when one set of threads writes shared memory and another set of threads in the same block will read it later. + +- `__syncthreads()` is a block-wide barrier +- writes to shared memory before the barrier are visible to threads in the block after the barrier +- do not place it in divergent control flow unless the condition is uniform across the whole block + +Typical cases: + +- loading a tile from global memory into shared memory +- reduction steps between iterations +- transpose or stencil phases where threads consume values written by other threads + +## Why Shared Memory Helps + +The Best Practices Guide highlights three common reasons to use shared memory: + +- avoid redundant loads from global memory +- transform global accesses into coalesced accesses +- avoid wasted bandwidth from strided patterns + +Shared memory is especially useful for tiled GEMM, stencil, convolution, reduction, and transpose kernels. + +## Bank Conflicts + +Shared memory performance depends on bank usage. 
+ +- modern devices expose 32 banks for warp accesses +- successive 32-bit words map to successive banks +- if threads in a warp hit distinct banks, accesses can proceed concurrently +- if multiple threads hit the same bank, the access is split and serialized +- one important exception is broadcast: when threads read the same shared location, hardware can serve that efficiently + +The standard remedy for column-wise access on a 32x32 tile is padding: + +```cpp +__shared__ float tile[32][33]; +``` + +The Best Practices Guide uses this pattern to remove many-way bank conflicts in a transpose-like matrix multiply example. + +## Async Copy Path + +For newer CUDA toolchains and architectures, shared memory can also participate in explicit async copy pipelines from global memory. + +- C++ layer: `__pipeline_memcpy_async`, `__pipeline_commit`, `__pipeline_wait_prior` +- PTX layer: `cp.async`, `cp.async.commit_group`, `cp.async.wait_group`, and mbarrier-based completion + +Use this path when you need to overlap global-to-shared transfers with computation and reduce intermediate register traffic. 
+ +## When To Escalate To PTX Docs + +Stay in CUDA C++ docs for: + +- `__shared__` +- dynamic shared memory launch configuration +- `__syncthreads()` +- bank conflict basics + +Jump to PTX docs for: + +- `.shared` state-space rules +- `cp.async` +- `mbarrier` +- TMA and shared-memory layout/swizzling + +See: + +- `../ptx/references/state-spaces-and-types.md` +- `../ptx/instructions/data-movement/references/cp-async.md` +- `../ptx/instructions/sync-comm/DOC.md` +- `../ptx/instructions/tma/DOC.md` + +## Related Topics + +- CUDA Runtime overview: `../runtime/DOC.md` +- Synchronization rules: `../synchronization/DOC.md` +- Memory-space overview: `../memory-hierarchy/DOC.md` +- Global-memory coalescing: `../coalescing/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Tensor Core usage: `../tensor-cores/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Thread Block Clusters / DSM: `../thread-block-clusters/DOC.md` +- PTX ISA overview: `../ptx/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, `__shared__`: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#shared +- CUDA C++ Programming Guide, synchronization functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#synchronization-functions +- CUDA C++ Best Practices Guide, Shared Memory: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#shared-memory +- CUDA C++ Best Practices Guide, Shared Memory and Memory Banks: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#shared-memory-and-memory-banks +- CUDA C++ Best Practices Guide, Async Copy from Global Memory to Shared Memory: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html#asynchronous-copy-from-global-memory-to-shared-memory + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/sparse-and-irregular-kernels/DOC.md 
b/content/cuda/docs/sparse-and-irregular-kernels/DOC.md new file mode 100644 index 00000000..b63ebd85 --- /dev/null +++ b/content/cuda/docs/sparse-and-irregular-kernels/DOC.md @@ -0,0 +1,65 @@ +--- +name: sparse-and-irregular-kernels +description: "CUDA sparse/irregular kernel essentials: load imbalance, indirect access, divergence control, and locality-aware data layouts." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,sparse,irregular,load-balance,divergence,indirect-access,gather,scatter" +--- + +# CUDA Sparse And Irregular Kernels (C++) + +Use this page when access patterns are indirect, data-dependent, or highly skewed. + +## Why These Kernels Are Hard + +Sparse/irregular workloads often suffer from: + +- poor coalescing from indirect addressing +- warp divergence from data-dependent control flow +- load imbalance across warps/blocks +- cache inefficiency from weak locality + +## Design Priorities + +1. reduce divergence where possible. +2. improve memory locality through data reordering. +3. balance work granularity to avoid long-tail warps. +4. isolate hot irregular regions from regular compute regions. 
+ +## Common Patterns + +- work queues for dynamic tasks +- segmented processing for variable-length rows/lists +- gather/scatter with index compression/reordering +- two-phase pipelines: count/scan then compact/execute + +## Practical Techniques + +- reorder indices to improve spatial locality +- use warp-level primitives for local compaction and voting +- split heavy/light workloads into separate kernels +- avoid over-synchronizing global progress paths + +## Typical Pitfalls + +- one-thread-per-item mapping with heavy skew +- atomics on hot addresses without privatization +- excessive branch nesting in the main kernel body + +## Related Topics + +- Coalescing: `../coalescing/DOC.md` +- Warp primitives: `../warp-primitives/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Fused kernel patterns: `../fused-kernel-design-patterns/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Best Practices Guide, memory behavior and control divergence context: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- CUDA C++ Programming Guide, execution and memory model background: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/streams-and-events/DOC.md b/content/cuda/docs/streams-and-events/DOC.md new file mode 100644 index 00000000..1676504b --- /dev/null +++ b/content/cuda/docs/streams-and-events/DOC.md @@ -0,0 +1,91 @@ +--- +name: streams-and-events +description: "CUDA streams and events essentials: ordering, overlap, cudaStreamWaitEvent, timing, and default-stream caveats." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,streams,events,cudaStreamWaitEvent,cudaEventRecord,cudaEventElapsedTime,default-stream,overlap" +--- + +# CUDA Streams And Events (C++) + +Use this page for CUDA work orchestration on the host side: stream ordering, event dependencies, and timing. + +## Streams + +A stream is an ordered sequence of operations on the device. + +- operations in the same stream execute in issue order +- operations in different streams may overlap when dependencies allow +- stream-level concurrency is the basic CUDA mechanism for overlapping copy and compute + +## Events + +Events are lightweight synchronization markers. + +Common uses: + +- record progress in a stream with `cudaEventRecord` +- make another stream wait with `cudaStreamWaitEvent` +- measure elapsed time with `cudaEventElapsedTime` + +Events are the standard tool for cross-stream dependencies. + +## Basic Cross-Stream Dependency + +```cpp +cudaEvent_t done; +cudaEventCreate(&done); + +kernelA<<<grid, block, 0, streamA>>>(...); +cudaEventRecord(done, streamA); +cudaStreamWaitEvent(streamB, done, 0); +kernelB<<<grid, block, 0, streamB>>>(...); +``` + +This keeps the dependency local and avoids device-wide synchronization. + +## Default Stream Caveat + +The default stream has special behavior. + +- legacy default stream semantics can introduce implicit synchronization +- per-thread default stream semantics behave differently + +Do not assume the default stream behaves like an ordinary user-created stream unless you know which mode your application uses. + +## Timing Rule + +For coarse kernel timing: + +1. create start/end events +2. record them in the target stream +3. synchronize on the end event +4. call `cudaEventElapsedTime` + +This is the standard CUDA timing pattern when you want stream-local measurements. 
+ +## Common Mistakes + +- using `cudaDeviceSynchronize()` when a stream or event sync is enough +- assuming different streams imply overlap without checking dependencies or resources +- forgetting that synchronous APIs can force serialization +- timing a stream with events but synchronizing the whole device + +## Related Topics + +- CUDA Graphs: `../cuda-graphs/DOC.md` +- Async copy pipelines: `../async-copy/DOC.md` +- Runtime API overview: `../runtime/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, streams and concurrency: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, events and cross-stream dependencies: https://docs.nvidia.com/cuda/archive/11.7.0/cuda-c-programming-guide/index.html +- CUDA Runtime API, stream and event functions: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/synchronization/DOC.md b/content/cuda/docs/synchronization/DOC.md new file mode 100644 index 00000000..358e3e72 --- /dev/null +++ b/content/cuda/docs/synchronization/DOC.md @@ -0,0 +1,120 @@ +--- +name: synchronization +description: "CUDA synchronization essentials: __syncthreads, __syncwarp, block-wide visibility, and common barrier rules." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,synchronization,syncthreads,syncwarp,block-barrier,barrier-divergence,__syncthreads__,__syncwarp,barrier,warp,thread-block,memory-ordering" +--- + +# CUDA Synchronization (C++) + +Use this page for CUDA C++ synchronization rules at the thread-block and warp levels. + +## Thread-Block Synchronization + +`__syncthreads()` is the standard block-wide barrier. 
+ +- every non-exited thread in the block must reach it +- it waits until all threads in the block arrive +- global and shared memory accesses before the barrier become visible to threads in the block after the barrier + +Use it when threads in a block communicate through memory. + +Typical cases: + +- one phase writes shared memory and a later phase reads it +- reduction loops between strides +- transpose, stencil, or tiled GEMM phases + +## Conditional Barrier Rule + +Do not place `__syncthreads()` in divergent control flow unless the condition is uniform across the entire block. + +Unsafe pattern: + +```cpp +if (threadIdx.x < 16) { + __syncthreads(); // Wrong unless every thread takes the same branch +} +``` + +Safe pattern: + +```cpp +bool active = threadIdx.x < 16; +if (active) { + // work +} +__syncthreads(); +``` + +## Variants of `__syncthreads()` + +CUDA also provides block-wide variants that combine a barrier with a predicate reduction: + +- `__syncthreads_count(predicate)` +- `__syncthreads_and(predicate)` +- `__syncthreads_or(predicate)` + +Use them when you need a collective decision at block scope without adding a separate reduction pass. + +## Warp-Level Synchronization + +`__syncwarp(mask)` synchronizes participating lanes in a warp. + +- every participating lane must use the same mask +- each calling lane must have its own bit set in the mask +- it provides memory ordering among participating threads + +Use `__syncwarp()` when: + +- threads communicate only within one warp +- you want a lighter-weight barrier than `__syncthreads()` +- you are using warp-specialized code paths + +## Important Distinction: Warp Vote vs Barrier + +Warp vote intrinsics such as: + +- `__all_sync` +- `__any_sync` +- `__ballot_sync` + +do not imply a memory barrier by themselves. Use `__syncwarp()` when lanes must safely communicate through memory. 
+ +## Common Mistakes + +- assuming warp-synchronous execution without an explicit warp barrier +- using `__syncthreads()` in a branch that only some threads take +- reading shared memory written by other threads before a barrier +- using block-wide barriers when the communication scope is only one warp + +## Rule of Thumb + +- use `__syncthreads()` for cross-warp communication inside a block +- use `__syncwarp()` for intra-warp communication +- if the communication path uses shared memory, place the barrier between the producer and consumer phases + +## Related Topics + +- Shared memory usage: `../shared-memory/DOC.md` +- Memory space overview: `../memory-hierarchy/DOC.md` +- Coalesced global access: `../coalescing/DOC.md` +- Warp-level primitives: `../warp-primitives/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Cooperative Groups: `../cooperative-groups/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Memory fences and ordering: `../memory-fences-and-ordering/DOC.md` +- PTX synchronization primitives: `../ptx/instructions/sync-comm/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, thread hierarchy and cooperation: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, synchronization functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#synchronization-functions +- CUDA C++ Programming Guide, warp vote and match functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/tensor-core-numerical-validation/DOC.md b/content/cuda/docs/tensor-core-numerical-validation/DOC.md new file mode 100644 index 00000000..eb0e9552 --- /dev/null +++ b/content/cuda/docs/tensor-core-numerical-validation/DOC.md @@ -0,0 +1,71 @@ +--- +name: tensor-core-numerical-validation +description: "Tensor Core numerical validation workflow: 
baseline comparison, tolerance policy, shape coverage, and regression gates." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,tensor-core,numerics,validation,tolerance,baseline,wmma,tf32,fp16,bf16,regression" +--- + +# Tensor Core Numerical Validation (C++) + +Use this page when enabling WMMA/Tensor Core paths and you need a defensible numerical-validation process. + +## Baseline Strategy + +- Keep a trusted reference path (often FP32 accumulate). +- Run identical input tensors through baseline and Tensor Core paths. +- Compare per-output error and aggregate metrics. + +## Tolerance Policy + +Define tolerance before tuning: + +- absolute tolerance +- relative tolerance +- special-case handling for near-zero regions + +Document tolerance by workload category, not by one benchmark snapshot. + +## Coverage Requirements + +Validate across: + +- representative shapes (small, medium, large) +- boundary shapes (tail tiles, non-multiple dimensions) +- realistic value ranges (not only unit random data) +- production-like batch distributions + +## Failure Triage + +If error exceeds policy: + +- check dtype/accumulator configuration first +- check layout and tile mapping consistency +- check whether a supposedly Tensor Core path silently falls back or changes instruction mix +- re-run with deterministic seeds and fixed launch configs + +## Regression Gates + +- Add numerical checks into CI for key shapes. +- Keep per-architecture baselines where behavior differs by hardware mode. +- Block performance-only changes when they break agreed numeric policy. 
+ +## Related Topics + +- Tensor Cores: `../tensor-cores/DOC.md` +- WMMA patterns: `../wmma-kernel-patterns/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Regression testing and CI: `../regression-testing-and-ci/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, floating-point behavior and Tensor Core context: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Best Practices Guide, verification guidance: https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/ + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/tensor-core-pipeline-patterns/DOC.md b/content/cuda/docs/tensor-core-pipeline-patterns/DOC.md new file mode 100644 index 00000000..5f1d25db --- /dev/null +++ b/content/cuda/docs/tensor-core-pipeline-patterns/DOC.md @@ -0,0 +1,102 @@ +--- +name: tensor-core-pipeline-patterns +description: "Tensor Core pipeline patterns: global-to-shared staging, multi-stage K loops, async copy synchronization, and escalation to WGMMA/TMA." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,tensor-core,tensorcore,pipeline,pipelining,multi-stage-pipeline,cp.async,async-copy,shared-memory,mbarrier,wmma,wgmma,tma,double-buffering,stage-depth" +--- + +# Tensor Core Pipeline Patterns (C++) + +Use this page for end-to-end Tensor Core kernel structure, not just a single `mma_sync` call. + +## Why Pipeline Design Dominates + +In real GEMM-like kernels, arithmetic throughput is often high enough that data staging and synchronization decide final performance. 
+ +A strong Tensor Core kernel usually needs: + +- global-memory tile fetch +- shared-memory staging and layout control +- fragment load and matrix instruction issue +- overlapped staging for the next K tile + +## Canonical Multi-Stage Loop + +A practical loop has at least two stages: + +1. Stage N: copy tile data for current compute. +2. Stage N+1: prefetch tile data for next compute step. + +With larger K, three-stage pipelines can smooth latency at the cost of more shared memory and register pressure. + +## Synchronization Boundaries + +You need explicit boundaries between: + +- producer writes to shared memory +- consumer fragment loads +- matrix instruction issue +- buffer reuse for next stage + +At C++ level this usually means structured barrier usage. At lower levels it can include async-copy wait semantics and mbarrier protocols. + +## Shared-Memory Layout Rules + +Tensor Core pipelines fail or slow down when shared layout is wrong. + +- align tile rows/strides for load requirements +- avoid severe bank conflicts in the staging pattern +- keep layout choices consistent with fragment load layout expectations + +## Stage-Depth Tradeoff + +More stages can hide memory latency better, but also: + +- increase shared-memory footprint per block +- reduce occupancy +- increase control complexity + +Tune stage count jointly with block-level warp count and tile shapes. 
+ +## WMMA vs WGMMA/TMA Escalation + +Stay with WMMA-focused C++ pipeline when: + +- supported tile shapes and types fit +- performance is acceptable after staging and synchronization tuning + +Escalate toward lower-level PTX workflows when: + +- you need architecture-specific warpgroup matrix instructions +- you need advanced async tensor movement control +- your kernel requires fine-grained control beyond C++ WMMA surface area + +## Profiling Checks + +- matrix instruction activity is present and dominant in hot loops +- shared-memory pressure is not causing severe bank-serialization stalls +- memory pipeline overlaps compute in timeline and stall analysis +- occupancy remains sufficient for latency hiding + +## Related Topics + +- Tensor Core API overview: `../tensor-cores/DOC.md` +- WMMA practical patterns: `../wmma-kernel-patterns/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` +- PTX TMA: `../ptx/instructions/tma/DOC.md` +- PTX WGMMA: `../ptx/instructions/wgmma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, asynchronous data movement and pipelines: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Best Practices Guide, async copy and memory staging: https://docs.nvidia.com/cuda/archive/13.0.0/cuda-c-best-practices-guide/index.html +- PTX ISA docs for advanced matrix/tensor movement paths: https://docs.nvidia.com/cuda/parallel-thread-execution/ + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/tensor-cores/DOC.md b/content/cuda/docs/tensor-cores/DOC.md new file mode 100644 index 00000000..d804fbb2 --- /dev/null +++ b/content/cuda/docs/tensor-cores/DOC.md @@ -0,0 +1,144 @@ +--- +name: tensor-cores +description: "CUDA Tensor Core essentials: WMMA fragments, load/store rules, mma_sync, and when to drop to PTX WGMMA." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 2 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,tensor-cores,tensor-core,tensorcore,wmma,nvcuda::wmma,warp-matrix-multiply-accumulate,warp-mma,load_matrix_sync,mma_sync,store_matrix_sync,wgmma,mma,matrix-multiply-accumulate,fragment" +--- + +# CUDA Tensor Cores (C++) + +Use this page for the CUDA C++ API view of Tensor Cores. It is the correct first stop for `wmma` questions. + +## Primary API Namespace + +CUDA exposes the warp-level matrix API in `nvcuda::wmma`. + +Core concepts: + +- `wmma::fragment` +- `wmma::load_matrix_sync` +- `wmma::store_matrix_sync` +- `wmma::fill_fragment` +- `wmma::mma_sync` + +All of these are warp-synchronous interfaces. + +## Mental Model + +Each warp collaborates on a matrix tile. + +- matrix A and B tiles are loaded into fragments +- an accumulator fragment holds C / D +- `mma_sync` performs `D = A * B + C` +- results are written back with `store_matrix_sync` + +## Minimal Workflow + +```cpp +using namespace nvcuda; + +wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; +wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; +wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag; + +wmma::fill_fragment(c_frag, 0.0f); +wmma::load_matrix_sync(a_frag, a_ptr, lda); +wmma::load_matrix_sync(b_frag, b_ptr, ldb); +wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); +wmma::store_matrix_sync(d_ptr, c_frag, ldd, wmma::mem_row_major); +``` + +## Usage Rules + +- all threads in the warp must participate +- `mptr`, `ldm`, layout, and template parameters must match across the warp +- memory pointers for matrix loads/stores must satisfy the documented alignment and leading-dimension requirements +- fragment element mapping across lanes is opaque; do not assume a stable per-lane layout + +## Alignment And Stride Constraints + +`load_matrix_sync` and `store_matrix_sync` have strict requirements. 
+ +- the pointer must meet the documented alignment requirement +- `ldm` must satisfy the documented stride constraint in elements +- all lanes in the warp must agree on the arguments + +If these conditions are violated, behavior is undefined or performance will collapse around the staging path. + +## Supported Types And Shapes + +WMMA does not mean "any matrix multiply on Tensor Cores". + +- only specific tile shapes are supported +- only specific multiplicand and accumulator type combinations are supported +- support varies by architecture and API subset + +When the type / shape combination is outside the documented WMMA set, you either stay on the ordinary arithmetic path or move to a lower-level PTX path if the hardware and toolchain support it. + +## Shared Memory Staging Is Common + +High-performance Tensor Core kernels usually do more than call `mma_sync`. + +Typical structure: + +1. move tiles from global memory +2. stage or reorder them in shared memory if needed +3. load fragments +4. execute `mma_sync` +5. store accumulators back to memory + +So Tensor Core performance is often gated by shared-memory layout, coalescing, and synchronization as much as by the MMA instruction itself. 
+ +## Restrictions That Matter + +- fragment layout is architecture-specific +- passing fragments across separately compiled code for different architectures is unsafe +- if fragments must cross an interface boundary, store to memory first and pass ordinary pointers instead + +## When WMMA Is The Right Layer + +Stay with WMMA when: + +- you are writing CUDA C++ kernels +- you want a supported high-level Tensor Core interface +- the problem maps naturally to documented WMMA tile shapes and types + +Drop to PTX when: + +- you need `wgmma` +- you need architecture-specific async MMA protocols +- you are working with TMA, mbarrier, or lower-level Hopper/Blackwell Tensor Core workflows + +## WMMA vs "CUDA Core" Arithmetic + +If a matrix multiply is written as ordinary nested scalar FMAs, it usually runs on the ordinary arithmetic path rather than the Tensor Core path. + +To reliably target Tensor Cores from CUDA C++, use the documented WMMA interfaces or an equivalent library path that emits the required matrix instructions. 
+ +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- CUDA Core path: `../cuda-core/DOC.md` +- Compute throughput model: `../compute-throughput/DOC.md` +- Warp-level execution model: `../warp-primitives/DOC.md` +- Shared memory staging: `../shared-memory/DOC.md` +- WMMA practical patterns: `../wmma-kernel-patterns/DOC.md` +- WMMA debugging checklist: `../wmma-debugging-checklist/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` +- CUDA Core vs Tensor Core path selection: `../cuda-core-vs-tensor-core-path-selection/DOC.md` +- PTX WGMMA entry: `../ptx/instructions/wgmma/DOC.md` +- PTX TMA entry: `../ptx/instructions/tma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, WMMA API and fragments: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, `load_matrix_sync` / `store_matrix_sync` / `mma_sync`: https://docs.nvidia.com/cuda/archive/9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, Tensor Core restrictions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/thread-block-clusters/DOC.md b/content/cuda/docs/thread-block-clusters/DOC.md new file mode 100644 index 00000000..a4ed8bba --- /dev/null +++ b/content/cuda/docs/thread-block-clusters/DOC.md @@ -0,0 +1,99 @@ +--- +name: thread-block-clusters +description: "CUDA thread block cluster essentials: cluster launch, cluster.sync, distributed shared memory, and portable cluster-size rules." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,thread-block-clusters,cluster,distributed-shared-memory,dsm,cluster.sync,__cluster_dims__,cudaLaunchKernelEx" +--- + +# CUDA Thread Block Clusters (C++) + +Use this page for the CUDA C++ view of cluster launch, cluster-level synchronization, and distributed shared memory. + +## What A Cluster Is + +Thread Block Clusters add an optional hierarchy level above blocks. + +- multiple blocks form one cluster +- blocks in a cluster are co-scheduled on the same GPC +- blocks in the cluster can synchronize and communicate more directly than unrelated blocks + +This feature is available on compute capability 9.0 and higher. + +## Launch Mechanisms + +Clusters can be specified either: + +- at compile time with `__cluster_dims__(x, y, z)` +- at launch time with `cudaLaunchKernelEx` and a cluster-dimension attribute + +Important: + +- `gridDim` still counts blocks, not clusters +- the grid should be compatible with the cluster dimensions + +## Cluster Synchronization + +CUDA exposes cluster-level synchronization through the Cooperative Groups cluster API. + +Typical pattern: + +- obtain the cluster handle +- coordinate phases with `cluster.sync()` + +This is the cluster-scope analogue of block synchronization, but for blocks that belong to the same cluster. + +## Distributed Shared Memory + +Blocks in a cluster can access distributed shared memory. + +That means: + +- a block can read or write shared memory owned by another block in the same cluster +- atomics can also target addresses in distributed shared memory + +This is useful when one block's normal shared memory is too small, but full global-memory communication would be too expensive. + +## Portable Cluster Size Rule + +CUDA documentation describes 8 blocks as the portable maximum cluster size. 
+ +- some hardware or configurations may support less +- some architectures can support larger nonportable sizes +- query support instead of hard-coding assumptions + +Relevant APIs include occupancy helpers such as `cudaOccupancyMaxPotentialClusterSize`. + +## When To Use Clusters + +Clusters are a good fit when: + +- communication across several neighboring blocks is frequent +- distributed shared memory removes expensive global-memory round trips +- the algorithm naturally decomposes into a few tightly coupled blocks + +Avoid them when: + +- the kernel is simple enough for ordinary per-block decomposition +- portability matters more than architecture-specific optimization +- the communication pattern is weak or irregular + +## Related Topics + +- Cooperative Groups: `../cooperative-groups/DOC.md` +- Shared memory usage: `../shared-memory/DOC.md` +- Occupancy tuning: `../occupancy/DOC.md` +- Async copy and TMA: `../async-copy/DOC.md` +- PTX cluster / mbarrier / TMA path: `../ptx/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA Programming Guide, Thread Block Clusters: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA Programming Guide, modern programming-model introduction to clusters: https://docs.nvidia.com/cuda/archive/13.1.1/cuda-programming-guide/01-introduction/programming-model.html +- Hopper Tuning Guide, distributed shared memory and cluster notes: https://docs.nvidia.com/cuda/archive/12.4.0/hopper-tuning-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/unified-memory/DOC.md b/content/cuda/docs/unified-memory/DOC.md new file mode 100644 index 00000000..6bf76c52 --- /dev/null +++ b/content/cuda/docs/unified-memory/DOC.md @@ -0,0 +1,71 @@ +--- +name: unified-memory +description: "CUDA Unified Memory essentials: managed allocations, migration behavior, prefetch/advice, and common performance pitfalls." 
+metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,unified-memory,managed-memory,cudaMallocManaged,cudaMemPrefetchAsync,cudaMemAdvise,page-migration" +--- + +# CUDA Unified Memory (C++) + +Use this page when you need a single pointer model across CPU and GPU with on-demand migration. + +## Core API + +Unified Memory is commonly allocated with: + +- `cudaMallocManaged` + +The runtime and driver can migrate pages between host and device as memory is accessed. + +## Why It Helps + +- simpler programming model for heterogeneous memory access +- easier incremental porting from CPU-oriented code +- fewer explicit memcpy calls in basic workflows + +## Why It Can Be Slow + +On-demand page migration can stall kernels if data is not resident on the device when accessed. + +Symptoms: + +- unpredictable first-touch latency +- page-fault-driven migration overhead +- lower effective bandwidth than explicit transfer pipelines + +## Performance Controls + +Use: + +- `cudaMemPrefetchAsync` to place data near expected access +- `cudaMemAdvise` hints for access patterns and preferred location + +These often reduce migration faults and smooth performance. + +## When To Prefer Explicit Transfers + +Prefer explicit host/device transfers when: + +- access pattern is stable and predictable +- maximum throughput is required +- migration overhead dominates runtime + +Unified Memory is often best for productivity first, then selectively optimized for hot paths. 
+ +## Related Topics + +- Pinned memory and transfers: `../pinned-memory-and-transfers/DOC.md` +- Streams and events: `../streams-and-events/DOC.md` +- Performance debugging: `../performance-debugging/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, Unified Memory programming: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA Runtime API, managed-memory and memory-advice APIs: https://docs.nvidia.com/cuda/cuda-runtime-api/index.html + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/warp-primitives/DOC.md b/content/cuda/docs/warp-primitives/DOC.md new file mode 100644 index 00000000..d2edd596 --- /dev/null +++ b/content/cuda/docs/warp-primitives/DOC.md @@ -0,0 +1,105 @@ +--- +name: warp-primitives +description: "CUDA warp-level primitives: shuffle, ballot, active masks, syncwarp, and when to replace shared memory with warp collectives." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,warp,warp-primitives,warp-collectives,warp-synchronous,shuffle,ballot,warp-vote,__shfl_sync,__ballot_sync,__activemask,__syncwarp,warp-reduction" +--- + +# CUDA Warp Primitives (C++) + +Use this page for warp-scope communication patterns that avoid block-wide synchronization and often reduce shared-memory traffic. + +## Core Warp Primitives + +Common warp-level intrinsics include: + +- `__shfl_sync` +- `__shfl_down_sync` +- `__shfl_xor_sync` +- `__ballot_sync` +- `__all_sync` +- `__any_sync` +- `__activemask` +- `__syncwarp` + +These operate on the active lanes of a warp and require a consistent participation mask. 
+ +## When Warp Primitives Help + +Use warp primitives when: + +- communication stays within one warp +- you want to avoid shared memory for a small reduction or exchange +- a block-wide barrier would be too expensive or unnecessary + +Typical cases: + +- warp reductions +- prefix-like exchanges within a warp +- voting and mask construction +- lane permutation for register-resident data + +## Shuffle vs Shared Memory + +Shuffle intrinsics move register values directly between lanes. + +Prefer shuffle when: + +- the communication scope is one warp +- data volume is small +- you want to avoid shared-memory stores, loads, and `__syncthreads()` + +Prefer shared memory when: + +- communication crosses warp boundaries +- the data footprint exceeds what is comfortable in registers +- the access pattern spans the whole block + +## Memory Ordering Rule + +- `__syncwarp()` provides warp-scope synchronization and memory ordering for participating lanes +- vote intrinsics such as `__ballot_sync` do not by themselves imply a memory barrier + +If lanes communicate through memory, insert `__syncwarp()`. + +## Minimal Warp Reduction Pattern + +```cpp +float x = value; +for (int offset = 16; offset > 0; offset >>= 1) { + x += __shfl_down_sync(0xffffffff, x, offset); +} +``` + +This is the standard first step before reducing across warps with shared memory or atomics. + +## Mask Discipline + +For `_sync` intrinsics: + +- every participating lane must use the same mask +- each calling lane must have its own bit set in the mask +- all named non-exited lanes must execute the same intrinsic with the same mask + +Violating mask discipline leads to undefined behavior. 
+ +## Related Topics + +- Execution model: `../execution-model/DOC.md` +- Block and warp synchronization: `../synchronization/DOC.md` +- Shared memory alternatives: `../shared-memory/DOC.md` +- Atomics and reductions: `../atomics-and-reductions/DOC.md` +- Tensor Core warp-level usage: `../tensor-cores/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, warp vote and match functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, shuffle functions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- CUDA C++ Programming Guide, `__syncwarp()`: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html#synchronization-functions + +Last cross-check date: 2026-03-20 diff --git a/content/cuda/docs/wmma-debugging-checklist/DOC.md b/content/cuda/docs/wmma-debugging-checklist/DOC.md new file mode 100644 index 00000000..b2712d02 --- /dev/null +++ b/content/cuda/docs/wmma-debugging-checklist/DOC.md @@ -0,0 +1,61 @@ +--- +name: wmma-debugging-checklist +description: "WMMA debugging checklist: fragment/layout mismatches, leading-dimension issues, warp participation errors, and profiling verification." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,wmma,debugging,checklist,tensor-core,fragment,load_matrix_sync,mma_sync,store_matrix_sync,ldm,alignment" +--- + +# WMMA Debugging Checklist (C++) + +Use this page when a WMMA kernel is incorrect, unstable, or unexpectedly slow. + +## Correctness Checklist + +- Warp participation is complete for every WMMA call. +- `matrix_a` / `matrix_b` layout templates match actual memory layout. +- `ldm` values are in elements and match tensor strides. +- Load/store pointers satisfy required alignment. +- Accumulator type and final store type match intended precision policy. 
+ +## Common Failure Signatures + +- Output full of zeros or repeated blocks: wrong pointer arithmetic or tile mapping. +- Numerically wrong but stable shape: wrong layout or `ldm` mismatch. +- Intermittent corruption: partial-warp execution or out-of-bounds tile guards. +- Correct output but poor speed: data staging dominates, not matrix instruction issue. + +## Profiling Checklist + +- Confirm matrix instruction activity is present. +- Confirm expected hot kernels use Tensor Core-capable instruction mix. +- Check shared-memory staging quality and bank-conflict pressure. +- Check occupancy/register pressure after unrolling and staging changes. + +## Minimal Debug Order + +1. Validate one warp, one tile, one K-step. +2. Validate full K-loop accumulation. +3. Scale to multi-warp block mapping. +4. Add pipelining/staging optimizations only after correctness is stable. + +## Related Topics + +- Tensor Core overview: `../tensor-cores/DOC.md` +- WMMA kernel patterns: `../wmma-kernel-patterns/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- Tensor Core numerical validation: `../tensor-core-numerical-validation/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, WMMA APIs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Programming Guide, Tensor Core restrictions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html +- Nsight Compute Profiling Guide: https://docs.nvidia.com/nsight-compute/2024.2/ProfilingGuide/index.html + +Last cross-check date: 2026-03-20 + diff --git a/content/cuda/docs/wmma-kernel-patterns/DOC.md b/content/cuda/docs/wmma-kernel-patterns/DOC.md new file mode 100644 index 00000000..7766e5ea --- /dev/null +++ b/content/cuda/docs/wmma-kernel-patterns/DOC.md @@ -0,0 +1,107 @@ +--- +name: wmma-kernel-patterns +description: "Practical WMMA kernel patterns: warp-to-tile mapping, 
fragment loading rules, accumulator handling, and common failure modes." +metadata: + languages: "cpp" + versions: "12.9" + revision: 1 + updated-on: "2026-03-20" + source: official + tags: "cuda,gpu,kernel,wmma,tensor-core,tensorcore,nvcuda::wmma,warp-matrix-multiply-accumulate,warp-mma,matrix-multiply-accumulate,fragment,mma_sync,load_matrix_sync,store_matrix_sync,gemm" +--- + +# WMMA Kernel Patterns (C++) + +Use this page when you need a practical implementation pattern for `nvcuda::wmma`, not just API names. + +## Warp-To-Tile Mapping + +The baseline mapping is one warp per output tile: + +- one warp loads A/B tile fragments +- one warp keeps the accumulator fragment +- one warp stores results back + +Scale to larger problems by assigning multiple warps per block and iterating over K tiles. + +## Minimal Pattern Skeleton + +```cpp +using namespace nvcuda; + +__global__ void wmma_gemm_kernel(const half* A, const half* B, float* C, + int M, int N, int K, + int lda, int ldb, int ldc) { + int warp_id_in_block = threadIdx.x / 32; + int lane_id = threadIdx.x % 32; + + int warp_m = (blockIdx.y * (blockDim.x / 32) + warp_id_in_block); + int warp_n = blockIdx.x; + + if (warp_m * 16 >= M || warp_n * 16 >= N) return; + + wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag; + wmma::fill_fragment(c_frag, 0.0f); + + for (int k0 = 0; k0 < K; k0 += 16) { + wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag; + wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag; + + const half* a_ptr = A + (warp_m * 16) * lda + k0; + const half* b_ptr = B + k0 * ldb + (warp_n * 16); + + wmma::load_matrix_sync(a_frag, a_ptr, lda); + wmma::load_matrix_sync(b_frag, b_ptr, ldb); + wmma::mma_sync(c_frag, a_frag, b_frag, c_frag); + } + + float* c_ptr = C + (warp_m * 16) * ldc + (warp_n * 16); + wmma::store_matrix_sync(c_ptr, c_frag, ldc, wmma::mem_row_major); +} +``` + +This skeleton is intentionally simple. Production kernels usually add shared-memory staging and pipelining. 
+ +## Critical Correctness Rules + +- All lanes in the warp must execute the WMMA calls with consistent arguments. +- Layout and leading-dimension parameters must match fragment template expectations. +- Pointer alignment and stride constraints for load/store must satisfy API requirements. +- Fragment internal lane mapping is opaque; do not index fragment storage with custom lane assumptions. + +## High-Value Performance Patterns + +- Stage A/B tiles in shared memory to reduce uncoalesced global traffic. +- Use double-buffered tile staging when K is large. +- Keep one accumulator fragment alive across K-loop iterations. +- Control register pressure before adding heavy unrolling. + +## Common Failure Modes + +- Wrong `row_major`/`col_major` choice for multiplicands. +- Incorrect `lda`/`ldb`/`ldc` in element units. +- Partial-warp execution due to guard branches around WMMA calls. +- Correct output with low speed because data movement dominates MMA throughput. + +## Verification Workflow + +1. Compare numerics against a trusted GEMM baseline. +2. Confirm matrix instruction activity in profiler output. +3. Confirm shared-memory staging efficiency and low bank-conflict pressure. +4. Sweep block-level warp count and K-step scheduling for throughput. 
+ +## Related Topics + +- Tensor Core overview: `../tensor-cores/DOC.md` +- Tensor Core pipeline patterns: `../tensor-core-pipeline-patterns/DOC.md` +- Shared memory: `../shared-memory/DOC.md` +- Async copy: `../async-copy/DOC.md` +- Numerics and precision: `../numerics-and-precision/DOC.md` +- PTX WGMMA: `../ptx/instructions/wgmma/DOC.md` + +## Official Source Links (Fact Check) + +- CUDA C++ Programming Guide, WMMA API: https://docs.nvidia.com/cuda/cuda-c-programming-guide/ +- CUDA C++ Programming Guide, Tensor Core usage restrictions: https://docs.nvidia.com/cuda/archive/12.9.1/cuda-c-programming-guide/index.html + +Last cross-check date: 2026-03-20 diff --git a/docs/features/search-regression.md b/docs/features/search-regression.md new file mode 100644 index 00000000..d4884ab0 --- /dev/null +++ b/docs/features/search-regression.md @@ -0,0 +1,43 @@ +# Search Regression + +This workflow provides repeatable search-quality checks for local Context Hub content. + +## Files + +- `scripts/search_regression.py`: regression runner. +- `scripts/search_regression_cases.json`: query cases and expectations. +- `scripts/search_regression_baseline.json`: generated snapshot (current top results). + +## Run + +From repository root: + +```bash +python3 scripts/search_regression.py --mode check +``` + +Generate a fresh snapshot/baseline: + +```bash +python3 scripts/search_regression.py --mode snapshot +``` + +## Case Format + +Each case in `search_regression_cases.json` supports: + +- `id`: stable case identifier +- `query`: search query text +- `tags`: optional `--tags` value +- `lang`: optional `--lang` value +- `limit`: search result count +- `top_k`: range used for assertions +- `expect_top1`: expected id at rank 1 +- `expect_all`: all expected ids must appear in top-k +- `expect_any`: at least one id must appear in top-k +- `expect_absent`: ids that must not appear in top-k + +## CI Suggestion + +- Run `python3 scripts/search_regression.py --mode check` after `chub build`. 
+- Store `scripts/search_regression_baseline.json` as an artifact to track ranking drift. diff --git a/scripts/search_regression.py b/scripts/search_regression.py new file mode 100644 index 00000000..20d8f7c4 --- /dev/null +++ b/scripts/search_regression.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +"""Run search regression checks for Context Hub.""" + +from __future__ import annotations + +import argparse +import json +import subprocess +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +@dataclass +class CaseResult: + case_id: str + query: str + passed: bool + message: str + top_ids: list[str] + raw_results: list[dict[str, Any]] + + +def _load_json(path: Path) -> Any: + return json.loads(path.read_text(encoding="utf-8")) + + +def _run_search( + chub: str, query: str, tags: str | None, lang: str | None, limit: int +) -> list[dict[str, Any]]: + cmd = [chub, "search", query, "--limit", str(limit), "--json"] + if tags: + cmd.extend(["--tags", tags]) + if lang: + cmd.extend(["--lang", lang]) + + proc = subprocess.run(cmd, capture_output=True, text=True, check=False) + if proc.returncode != 0: + raise RuntimeError( + f"command failed ({proc.returncode}): {' '.join(cmd)}\n{proc.stderr.strip()}" + ) + + text = proc.stdout.strip() + if not text: + raise RuntimeError(f"empty output for query: {query}") + + try: + payload = json.loads(text) + except json.JSONDecodeError as exc: + raise RuntimeError(f"invalid JSON output for query: {query}\n{text}") from exc + + results = payload.get("results") + if not isinstance(results, list): + raise RuntimeError(f"missing `results` in output for query: {query}") + return results + + +def _evaluate_case(case: dict[str, Any], results: list[dict[str, Any]]) -> CaseResult: + case_id = str(case["id"]) + query = str(case["query"]) + top_k = int(case.get("top_k", len(results))) + top_ids = [str(r.get("id", "")) for r in results[:top_k]] + + expect_top1 = case.get("expect_top1") + expect_all = 
[str(x) for x in case.get("expect_all", [])] + expect_any = [str(x) for x in case.get("expect_any", [])] + expect_absent = [str(x) for x in case.get("expect_absent", [])] + + failures: list[str] = [] + if expect_top1 and (not top_ids or top_ids[0] != expect_top1): + got = top_ids[0] if top_ids else "" + failures.append(f"top1 expected `{expect_top1}`, got `{got}`") + + missing_all = [x for x in expect_all if x not in top_ids] + if missing_all: + failures.append(f"missing expected ids in top-{top_k}: {missing_all}") + + if expect_any and not any(x in top_ids for x in expect_any): + failures.append(f"none of expect_any found in top-{top_k}: {expect_any}") + + present_absent = [x for x in expect_absent if x in top_ids] + if present_absent: + failures.append(f"unexpected ids found in top-{top_k}: {present_absent}") + + if failures: + return CaseResult( + case_id=case_id, + query=query, + passed=False, + message="; ".join(failures), + top_ids=top_ids, + raw_results=results, + ) + return CaseResult( + case_id=case_id, + query=query, + passed=True, + message="ok", + top_ids=top_ids, + raw_results=results, + ) + + +def _snapshot_payload(run_results: list[CaseResult]) -> dict[str, Any]: + cases: list[dict[str, Any]] = [] + for r in run_results: + top_items = [] + for item in r.raw_results[:10]: + top_items.append( + { + "id": item.get("id"), + "name": item.get("name"), + "score": item.get("_score"), + } + ) + cases.append( + { + "id": r.case_id, + "query": r.query, + "passed": r.passed, + "top_ids": r.top_ids, + "top_items": top_items, + } + ) + return {"cases": cases} + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run Context Hub search regressions.") + parser.add_argument( + "--cases", + default="scripts/search_regression_cases.json", + help="Path to regression case JSON file.", + ) + parser.add_argument( + "--chub", + default="./cli/bin/chub", + help="Path to chub executable.", + ) + parser.add_argument( + "--mode", + choices=["check", 
"snapshot"], + default="check", + help="check: assert expectations; snapshot: emit current top results", + ) + parser.add_argument( + "--snapshot-out", + default="scripts/search_regression_baseline.json", + help="Where to write snapshot/baseline JSON.", + ) + args = parser.parse_args() + + cases = _load_json(Path(args.cases)) + if not isinstance(cases, list) or not cases: + print("error: cases file must be a non-empty JSON array", file=sys.stderr) + return 2 + + run_results: list[CaseResult] = [] + hard_failures = 0 + + for case in cases: + try: + query = str(case["query"]) + limit = int(case.get("limit", 5)) + tags = case.get("tags") + lang = case.get("lang") + results = _run_search(args.chub, query, tags, lang, limit) + result = _evaluate_case(case, results) + except Exception as exc: # pragma: no cover + hard_failures += 1 + case_id = str(case.get("id", "")) + query = str(case.get("query", "")) + result = CaseResult( + case_id=case_id, + query=query, + passed=False, + message=str(exc), + top_ids=[], + raw_results=[], + ) + run_results.append(result) + + pass_count = sum(1 for r in run_results if r.passed) + fail_count = len(run_results) - pass_count + + for r in run_results: + status = "PASS" if r.passed else "FAIL" + print(f"[{status}] {r.case_id}: {r.message}") + if not r.passed: + print(f" query={r.query}") + if r.top_ids: + print(f" top_ids={r.top_ids}") + + snapshot = _snapshot_payload(run_results) + out_path = Path(args.snapshot_out) + if args.mode == "snapshot" or fail_count: + out_path.write_text(json.dumps(snapshot, indent=2), encoding="utf-8") + print(f"wrote snapshot: {out_path}") + + print( + f"summary: total={len(run_results)} pass={pass_count} fail={fail_count} hard_failures={hard_failures}" + ) + if args.mode == "snapshot": + return 0 + return 0 if fail_count == 0 else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/search_regression_baseline.json b/scripts/search_regression_baseline.json new file mode 100644 
index 00000000..2d400c0f --- /dev/null +++ b/scripts/search_regression_baseline.json @@ -0,0 +1,690 @@ +{ + "cases": [ + { + "id": "wmma-how-to-use", + "query": "how to use wmma", + "passed": true, + "top_ids": [ + "cuda/wmma-kernel-patterns", + "cuda/wmma-debugging-checklist", + "cuda/tensor-cores" + ], + "top_items": [ + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 42.72239226326069 + }, + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 41.09123189504223 + }, + { + "id": "cuda/tensor-cores", + "name": "tensor-cores", + "score": 12.49033613627519 + }, + { + "id": "cuda/ptx-wgmma-instructions", + "name": "ptx-wgmma-instructions", + "score": 8.015605451537379 + }, + { + "id": "cuda/tensor-core-numerical-validation", + "name": "tensor-core-numerical-validation", + "score": 7.672928478098094 + } + ] + }, + { + "id": "shared-memory-core", + "query": "shared memory cuda", + "passed": true, + "top_ids": [ + "cuda/shared-memory", + "cuda/memory-hierarchy", + "cuda/unified-memory", + "cuda/memory-bound-kernel-optimization-playbook", + "cuda/memory-fences-and-ordering" + ], + "top_items": [ + { + "id": "cuda/shared-memory", + "name": "shared-memory", + "score": 657.8079742112991 + }, + { + "id": "cuda/memory-hierarchy", + "name": "memory-hierarchy", + "score": 73.17322342909286 + }, + { + "id": "cuda/unified-memory", + "name": "unified-memory", + "score": 60.3646893631704 + }, + { + "id": "cuda/memory-bound-kernel-optimization-playbook", + "name": "memory-bound-kernel-optimization-playbook", + "score": 57.53270604100074 + }, + { + "id": "cuda/memory-fences-and-ordering", + "name": "memory-fences-and-ordering", + "score": 54.23784477307811 + } + ] + }, + { + "id": "tensor-core-pipeline", + "query": "tensor core pipeline", + "passed": true, + "top_ids": [ + "cuda/tensor-core-pipeline-patterns", + "cuda/tensor-core-numerical-validation", + "cuda/cuda-core-vs-tensor-core-path-selection", + 
"cuda/tensor-cores", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/tensor-core-pipeline-patterns", + "name": "tensor-core-pipeline-patterns", + "score": 644.1075200959514 + }, + { + "id": "cuda/tensor-core-numerical-validation", + "name": "tensor-core-numerical-validation", + "score": 61.48468932198023 + }, + { + "id": "cuda/cuda-core-vs-tensor-core-path-selection", + "name": "cuda-core-vs-tensor-core-path-selection", + "score": 60.74471140903582 + }, + { + "id": "cuda/tensor-cores", + "name": "tensor-cores", + "score": 47.81338301557305 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 33.3511979484339 + } + ] + }, + { + "id": "cuda-core-checklist", + "query": "cuda core optimization checklist", + "passed": true, + "top_ids": [ + "cuda/cuda-core-optimization-checklist", + "cuda/production-readiness-checklist", + "cuda/cuda-core", + "cuda/wmma-debugging-checklist", + "cuda/compute-bound-kernel-optimization-playbook" + ], + "top_items": [ + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 759.407109261226 + }, + { + "id": "cuda/production-readiness-checklist", + "name": "production-readiness-checklist", + "score": 70.7996693382976 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 69.74042809797584 + }, + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 67.05937556573964 + }, + { + "id": "cuda/compute-bound-kernel-optimization-playbook", + "name": "compute-bound-kernel-optimization-playbook", + "score": 57.217605086477704 + } + ] + }, + { + "id": "ptx-mbarrier-patterns", + "query": "ptx cp.async mbarrier", + "passed": true, + "top_ids": [ + "cuda/ptx-mbarrier-protocol-patterns", + "cuda/ptx", + "cuda/ptx-sync-comm-instructions" + ], + "top_items": [ + { + "id": "cuda/ptx-mbarrier-protocol-patterns", + "name": "ptx-mbarrier-protocol-patterns", + "score": 97.10445123474443 + }, + { + "id": "cuda/ptx", + "name": "ptx", 
+ "score": 62.61826372079303 + }, + { + "id": "cuda/ptx-sync-comm-instructions", + "name": "ptx-sync-comm-instructions", + "score": 52.027086225402634 + }, + { + "id": "cuda/ptx-data-movement-instructions", + "name": "ptx-data-movement-instructions", + "score": 52.00293090969298 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 48.82956085700581 + } + ] + }, + { + "id": "wmma-debugging", + "query": "wmma debugging checklist", + "passed": true, + "top_ids": [ + "cuda/wmma-debugging-checklist", + "cuda/production-readiness-checklist", + "cuda/cuda-core-optimization-checklist" + ], + "top_items": [ + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 750.2110571099413 + }, + { + "id": "cuda/production-readiness-checklist", + "name": "production-readiness-checklist", + "score": 51.353724997868376 + }, + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 47.219036371224455 + }, + { + "id": "cuda/performance-debugging", + "name": "performance-debugging", + "score": 46.88234948872761 + }, + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 42.72239226326069 + } + ] + }, + { + "id": "warp-primitives", + "query": "warp primitives shuffle ballot syncwarp", + "passed": true, + "top_ids": [ + "cuda/warp-primitives", + "cuda/ptx-warp-synchronization-patterns", + "cuda/synchronization", + "cuda/atomics-and-reductions", + "cuda/wmma-kernel-patterns" + ], + "top_items": [ + { + "id": "cuda/warp-primitives", + "name": "warp-primitives", + "score": 150.16976993918206 + }, + { + "id": "cuda/ptx-warp-synchronization-patterns", + "name": "ptx-warp-synchronization-patterns", + "score": 42.529269747467985 + }, + { + "id": "cuda/synchronization", + "name": "synchronization", + "score": 25.93295374080112 + }, + { + "id": "cuda/atomics-and-reductions", + "name": "atomics-and-reductions", + "score": 13.001420566012728 + 
}, + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 12.05164899412507 + } + ] + }, + { + "id": "sync-basics", + "query": "cuda synchronization syncthreads syncwarp", + "passed": true, + "top_ids": [ + "cuda/synchronization", + "cuda/ptx-warp-synchronization-patterns", + "cuda/ptx-sync-comm-instructions", + "cuda/cuda-graphs", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/synchronization", + "name": "synchronization", + "score": 115.53494858139211 + }, + { + "id": "cuda/ptx-warp-synchronization-patterns", + "name": "ptx-warp-synchronization-patterns", + "score": 51.064361100519484 + }, + { + "id": "cuda/ptx-sync-comm-instructions", + "name": "ptx-sync-comm-instructions", + "score": 35.1019026522119 + }, + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 32.66211358082386 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 31.781700957557508 + } + ] + }, + { + "id": "coalescing", + "query": "cuda memory coalescing global load store", + "passed": true, + "top_ids": [ + "cuda/coalescing", + "cuda/memory-hierarchy", + "cuda/memory-bound-kernel-optimization-playbook" + ], + "top_items": [ + { + "id": "cuda/coalescing", + "name": "coalescing", + "score": 114.46137435508729 + }, + { + "id": "cuda/memory-hierarchy", + "name": "memory-hierarchy", + "score": 76.00414047531201 + }, + { + "id": "cuda/memory-bound-kernel-optimization-playbook", + "name": "memory-bound-kernel-optimization-playbook", + "score": 61.62359200788258 + }, + { + "id": "cuda/unified-memory", + "name": "unified-memory", + "score": 60.3646893631704 + }, + { + "id": "cuda/shared-memory", + "name": "shared-memory", + "score": 58.77381854116289 + } + ] + }, + { + "id": "occupancy-and-launch-bounds", + "query": "occupancy register pressure launch bounds", + "passed": true, + "top_ids": [ + "cuda/launch-bounds-and-registers", + "cuda/occupancy", + "cuda/launch-bound-optimization-playbook", + 
"cuda/compute-bound-kernel-optimization-playbook" + ], + "top_items": [ + { + "id": "cuda/launch-bounds-and-registers", + "name": "launch-bounds-and-registers", + "score": 146.62780726587894 + }, + { + "id": "cuda/occupancy", + "name": "occupancy", + "score": 88.03880324897167 + }, + { + "id": "cuda/launch-bound-optimization-playbook", + "name": "launch-bound-optimization-playbook", + "score": 40.8986821615221 + }, + { + "id": "cuda/compute-bound-kernel-optimization-playbook", + "name": "compute-bound-kernel-optimization-playbook", + "score": 38.13732896303176 + }, + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 28.823017954994313 + } + ] + }, + { + "id": "unified-memory", + "query": "cuda unified memory prefetch advise", + "passed": true, + "top_ids": [ + "cuda/unified-memory", + "cuda/memory-hierarchy", + "cuda/shared-memory", + "cuda/memory-fences-and-ordering", + "cuda/pinned-memory-and-transfers" + ], + "top_items": [ + { + "id": "cuda/unified-memory", + "name": "unified-memory", + "score": 125.91467276225826 + }, + { + "id": "cuda/memory-hierarchy", + "name": "memory-hierarchy", + "score": 63.70635719692105 + }, + { + "id": "cuda/shared-memory", + "name": "shared-memory", + "score": 58.77381854116289 + }, + { + "id": "cuda/memory-fences-and-ordering", + "name": "memory-fences-and-ordering", + "score": 54.23784477307811 + }, + { + "id": "cuda/pinned-memory-and-transfers", + "name": "pinned-memory-and-transfers", + "score": 53.704153789779426 + } + ] + }, + { + "id": "graphs", + "query": "cuda graphs stream capture", + "passed": true, + "top_ids": [ + "cuda/cuda-graphs", + "cuda/launch-bound-optimization-playbook", + "cuda/streams-and-events", + "cuda/cuda-core", + "cuda/cublas-cudnn-integration-patterns" + ], + "top_items": [ + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 122.41291614046733 + }, + { + "id": "cuda/launch-bound-optimization-playbook", + "name": 
"launch-bound-optimization-playbook", + "score": 41.33640294345046 + }, + { + "id": "cuda/streams-and-events", + "name": "streams-and-events", + "score": 34.44998146409479 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 31.781700957557508 + }, + { + "id": "cuda/cublas-cudnn-integration-patterns", + "name": "cublas-cudnn-integration-patterns", + "score": 30.514857031952204 + } + ] + }, + { + "id": "numerics", + "query": "cuda numerics precision tf32 fp16 bf16", + "passed": true, + "top_ids": [ + "cuda/numerics-and-precision", + "cuda/tensor-core-numerical-validation", + "cuda/cuda-graphs", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/numerics-and-precision", + "name": "numerics-and-precision", + "score": 183.59776324812412 + }, + { + "id": "cuda/tensor-core-numerical-validation", + "name": "tensor-core-numerical-validation", + "score": 52.00733109154891 + }, + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 32.66211358082386 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 31.781700957557508 + }, + { + "id": "cuda/cuda-core-vs-tensor-core-path-selection", + "name": "cuda-core-vs-tensor-core-path-selection", + "score": 31.402549363889474 + } + ] + }, + { + "id": "bottleneck-workflow", + "query": "kernel bottleneck diagnosis workflow", + "passed": true, + "top_ids": [ + "cuda/kernel-bottleneck-diagnosis-workflow", + "cuda/nvtx-and-profiling-workflow", + "cuda/fused-kernel-design-patterns", + "cuda/kernel-api-design-guidelines", + "cuda/wmma-kernel-patterns" + ], + "top_items": [ + { + "id": "cuda/kernel-bottleneck-diagnosis-workflow", + "name": "kernel-bottleneck-diagnosis-workflow", + "score": 768.9931435006229 + }, + { + "id": "cuda/nvtx-and-profiling-workflow", + "name": "nvtx-and-profiling-workflow", + "score": 32.0504265214994 + }, + { + "id": "cuda/fused-kernel-design-patterns", + "name": "fused-kernel-design-patterns", + "score": 27.43176420745622 + }, + { + "id": 
"cuda/kernel-api-design-guidelines", + "name": "kernel-api-design-guidelines", + "score": 25.1148108398069 + }, + { + "id": "cuda/wmma-kernel-patterns", + "name": "wmma-kernel-patterns", + "score": 25.0716810325601 + } + ] + }, + { + "id": "production-readiness", + "query": "cuda production readiness checklist", + "passed": true, + "top_ids": [ + "cuda/production-readiness-checklist", + "cuda/cuda-core-optimization-checklist", + "cuda/wmma-debugging-checklist", + "cuda/cuda-graphs", + "cuda/cuda-core" + ], + "top_items": [ + { + "id": "cuda/production-readiness-checklist", + "name": "production-readiness-checklist", + "score": 769.9486340479549 + }, + { + "id": "cuda/cuda-core-optimization-checklist", + "name": "cuda-core-optimization-checklist", + "score": 73.82898181666549 + }, + { + "id": "cuda/wmma-debugging-checklist", + "name": "wmma-debugging-checklist", + "score": 62.75565279395994 + }, + { + "id": "cuda/cuda-graphs", + "name": "cuda-graphs", + "score": 32.66211358082386 + }, + { + "id": "cuda/cuda-core", + "name": "cuda-core", + "score": 31.781700957557508 + } + ] + }, + { + "id": "ptx-wgmma", + "query": "ptx wgmma commit wait fence", + "passed": true, + "top_ids": [ + "cuda/ptx-wgmma-instructions", + "cuda/ptx-mbarrier-protocol-patterns", + "cuda/ptx" + ], + "top_items": [ + { + "id": "cuda/ptx-wgmma-instructions", + "name": "ptx-wgmma-instructions", + "score": 79.69390442977547 + }, + { + "id": "cuda/ptx-mbarrier-protocol-patterns", + "name": "ptx-mbarrier-protocol-patterns", + "score": 49.434469077760816 + }, + { + "id": "cuda/ptx", + "name": "ptx", + "score": 47.42237009982906 + }, + { + "id": "cuda/ptx-integer-instructions", + "name": "ptx-integer-instructions", + "score": 42.656363750529316 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 41.23960122735729 + } + ] + }, + { + "id": "ptx-atomics", + "query": "ptx atom cas red redux", + "passed": true, + "top_ids": [ + "cuda/ptx-atomic-and-reduction-patterns", + 
"cuda/ptx", + "cuda/ptx-integer-instructions" + ], + "top_items": [ + { + "id": "cuda/ptx-atomic-and-reduction-patterns", + "name": "ptx-atomic-and-reduction-patterns", + "score": 97.96228201139625 + }, + { + "id": "cuda/ptx", + "name": "ptx", + "score": 47.42237009982906 + }, + { + "id": "cuda/ptx-integer-instructions", + "name": "ptx-integer-instructions", + "score": 42.656363750529316 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 41.23960122735729 + }, + { + "id": "cuda/ptx-special-registers", + "name": "ptx-special-registers", + "score": 40.72920089806367 + } + ] + }, + { + "id": "ptx-integer-bitops", + "query": "ptx integer bit manipulation lop3 bfe bfi", + "passed": true, + "top_ids": [ + "cuda/ptx-integer-bit-manipulation-patterns", + "cuda/ptx-integer-instructions", + "cuda/ptx" + ], + "top_items": [ + { + "id": "cuda/ptx-integer-bit-manipulation-patterns", + "name": "ptx-integer-bit-manipulation-patterns", + "score": 199.73122717340425 + }, + { + "id": "cuda/ptx-integer-instructions", + "name": "ptx-integer-instructions", + "score": 103.62199663868175 + }, + { + "id": "cuda/ptx", + "name": "ptx", + "score": 47.42237009982906 + }, + { + "id": "cuda/ptx-tma-instructions", + "name": "ptx-tma-instructions", + "score": 41.23960122735729 + }, + { + "id": "cuda/ptx-special-registers", + "name": "ptx-special-registers", + "score": 40.72920089806367 + } + ] + } + ] +} \ No newline at end of file diff --git a/scripts/search_regression_cases.json b/scripts/search_regression_cases.json new file mode 100644 index 00000000..1df87864 --- /dev/null +++ b/scripts/search_regression_cases.json @@ -0,0 +1,166 @@ +[ + { + "id": "wmma-how-to-use", + "query": "how to use wmma", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_top1": "cuda/wmma-kernel-patterns", + "expect_any": ["cuda/tensor-cores"] + }, + { + "id": "shared-memory-core", + "query": "shared memory cuda", + "tags": "cuda", + "lang": "cpp", + 
"limit": 5, + "expect_top1": "cuda/shared-memory" + }, + { + "id": "tensor-core-pipeline", + "query": "tensor core pipeline", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/tensor-core-pipeline-patterns" + }, + { + "id": "cuda-core-checklist", + "query": "cuda core optimization checklist", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/cuda-core-optimization-checklist" + }, + { + "id": "ptx-mbarrier-patterns", + "query": "ptx cp.async mbarrier", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_top1": "cuda/ptx-mbarrier-protocol-patterns", + "expect_any": ["cuda/ptx-sync-comm-instructions"] + }, + { + "id": "wmma-debugging", + "query": "wmma debugging checklist", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "cuda/wmma-debugging-checklist", + "cuda/wmma-kernel-patterns" + ] + }, + { + "id": "warp-primitives", + "query": "warp primitives shuffle ballot syncwarp", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/warp-primitives" + }, + { + "id": "sync-basics", + "query": "cuda synchronization syncthreads syncwarp", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/synchronization" + }, + { + "id": "coalescing", + "query": "cuda memory coalescing global load store", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": ["cuda/coalescing", "cuda/memory-bound-kernel-optimization-playbook"] + }, + { + "id": "occupancy-and-launch-bounds", + "query": "occupancy register pressure launch bounds", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 4, + "expect_any": ["cuda/occupancy", "cuda/launch-bounds-and-registers"] + }, + { + "id": "unified-memory", + "query": "cuda unified memory prefetch advise", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/unified-memory" + }, + { + "id": "graphs", + "query": "cuda graphs stream capture", + "tags": "cuda", + "lang": "cpp", 
+ "limit": 5, + "expect_top1": "cuda/cuda-graphs" + }, + { + "id": "numerics", + "query": "cuda numerics precision tf32 fp16 bf16", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 4, + "expect_any": ["cuda/numerics-and-precision", "cuda/tensor-core-numerical-validation"] + }, + { + "id": "bottleneck-workflow", + "query": "kernel bottleneck diagnosis workflow", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/kernel-bottleneck-diagnosis-workflow" + }, + { + "id": "production-readiness", + "query": "cuda production readiness checklist", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "expect_top1": "cuda/production-readiness-checklist" + }, + { + "id": "ptx-wgmma", + "query": "ptx wgmma commit wait fence", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": ["cuda/ptx-wgmma-instructions", "cuda/tensor-core-pipeline-patterns"] + }, + { + "id": "ptx-atomics", + "query": "ptx atom cas red redux", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "cuda/ptx-atomic-and-reduction-patterns", + "cuda/ptx-sync-comm-instructions" + ] + }, + { + "id": "ptx-integer-bitops", + "query": "ptx integer bit manipulation lop3 bfe bfi", + "tags": "cuda", + "lang": "cpp", + "limit": 5, + "top_k": 3, + "expect_any": [ + "cuda/ptx-integer-bit-manipulation-patterns", + "cuda/ptx-integer-instructions" + ] + } +]