diff --git a/docs/libcudacxx/ptx/instructions.rst b/docs/libcudacxx/ptx/instructions.rst
index 61b03ad3f0a..6225bf2c247 100644
--- a/docs/libcudacxx/ptx/instructions.rst
+++ b/docs/libcudacxx/ptx/instructions.rst
@@ -29,6 +29,7 @@ PTX Instructions
    instructions/multimem_red
    instructions/multimem_st
    instructions/red_async
+   instructions/shfl_sync
    instructions/st_async
    instructions/st_bulk
    instructions/tcgen05_alloc
@@ -251,8 +252,8 @@ Instructions by section
      - No
    * - `shfl `__
      - No
-   * - `shfl.s `__
-     - No
+   * - `shfl.sync `__
+     - Yes, CCCL 2.9.0 / CUDA 12.9
    * - `prmt `__
      - No
    * - `ld `__
diff --git a/docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst b/docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst
new file mode 100644
index 00000000000..630f7afb344
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst
@@ -0,0 +1,77 @@
+
+shfl.sync
+^^^^^^^^^
+
+.. code:: cuda
+
+   // PTX ISA 6.0
+   // shfl.sync.mode.b32 d[|p], a, b, c, membermask;
+   // .mode = { .up, .down, .bfly, .idx };
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_idx(T data,
+                   uint32_t lane_idx_offset,
+                   uint32_t clamp_segmask,
+                   uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_idx(T data,
+                   bool& pred,
+                   uint32_t lane_idx_offset,
+                   uint32_t clamp_segmask,
+                   uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_up(T data,
+                  uint32_t lane_idx_offset,
+                  uint32_t clamp_segmask,
+                  uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_up(T data,
+                  bool& pred,
+                  uint32_t lane_idx_offset,
+                  uint32_t clamp_segmask,
+                  uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_down(T data,
+                    uint32_t lane_idx_offset,
+                    uint32_t clamp_segmask,
+                    uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_down(T data,
+                    bool& pred,
+                    uint32_t lane_idx_offset,
+                    uint32_t clamp_segmask,
+                    uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_bfly(T data,
+                    uint32_t lane_idx_offset,
+                    uint32_t clamp_segmask,
+                    uint32_t lane_mask) noexcept;
+
+   template <typename T>
+   [[nodiscard]] __device__ static inline
+   T shfl_sync_bfly(T data,
+                    bool& pred,
+                    uint32_t lane_idx_offset,
+                    uint32_t clamp_segmask,
+                    uint32_t lane_mask) noexcept;
+
+**Constraints and checks**
+
+- ``T`` must have a 32-bit size (compile-time check)
+- ``lane_idx_offset`` must be less than the warp size (debug mode)
+- ``clamp_segmask`` may only use bit positions [0:4] and [8:12] (debug mode)
+- ``lane_mask`` must be a subset of the active mask (debug mode)
+- The destination lane must be a member of ``lane_mask`` (debug mode)
diff --git a/docs/libcudacxx/ptx/instructions/shfl_sync.rst b/docs/libcudacxx/ptx/instructions/shfl_sync.rst
new file mode 100644
index 00000000000..fa9b135d63d
--- /dev/null
+++ b/docs/libcudacxx/ptx/instructions/shfl_sync.rst
@@ -0,0 +1,10 @@
+
+.. _libcudacxx-ptx-instructions-shfl_sync:
+
+shfl.sync
+=========
+
+- PTX ISA:
+  `shfl.sync `__
+
+.. include:: generated/shfl_sync.rst
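Editorial note, not part of the patch: the ``clamp_segmask`` operand packs the clamp value into bits [0:4] and the segment mask into bits [8:12], mirroring the ``c`` operand of the PTX instruction (the tests below build it as ``(segmask << 8) | clamp``). A minimal sketch of how a caller might assemble it is shown here; the helper name ``make_clamp_segmask`` is hypothetical.

.. code:: cuda

   // Illustrative sketch only; the helper name is hypothetical.
   #include <cstdint>

   // Pack the clamp value (bits [0:4]) and the segment mask (bits [8:12]) into
   // the single clamp_segmask operand taken by cuda::ptx::shfl_sync_*.
   __host__ __device__ constexpr std::uint32_t
   make_clamp_segmask(std::uint32_t clamp, std::uint32_t segmask)
   {
     return ((segmask & 0b11111u) << 8) | (clamp & 0b11111u);
   }

   // make_clamp_segmask(31, 0)  -> whole-warp shuffle (clamp = 31, no segments)
   // make_clamp_segmask(31, 16) -> two independent 16-lane segments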
diff --git a/libcudacxx/include/cuda/__ptx/instructions/shfl_sync.h b/libcudacxx/include/cuda/__ptx/instructions/shfl_sync.h
new file mode 100644
index 00000000000..65aca3b294a
--- /dev/null
+++ b/libcudacxx/include/cuda/__ptx/instructions/shfl_sync.h
@@ -0,0 +1,272 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_PTX_SHFL_SYNC_H
+#define _CUDA_PTX_SHFL_SYNC_H
+
+#include 
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include 
+#include 
+#include 
+#include 
+
+#include  // __CUDA_MINIMUM_ARCH__ and friends
+
+_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
+
+#if __cccl_ptx_isa >= 600
+
+enum class __dot_shfl_mode
+{
+  __up,
+  __down,
+  __bfly,
+  __idx
+};
+
+// Recompute the destination lane selected by shfl.sync (following the PTX ISA
+// pseudo-code) so that debug builds can verify it is covered by the member mask.
+[[maybe_unused]]
+_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
+  __dot_shfl_mode __shfl_mode,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask)
+{
+  auto __lane     = ::cuda::ptx::get_sreg_laneid();
+  auto __clamp    = __clamp_segmask & 0b11111;
+  auto __segmask  = __clamp_segmask >> 8;
+  auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
+  _CUDA_VSTD::uint32_t __j = 0;
+  if (__shfl_mode == __dot_shfl_mode::__idx)
+  {
+    auto __min_lane = __lane & __clamp;
+    __j             = __min_lane | (__lane_idx_offset & ~__segmask);
+  }
+  else if (__shfl_mode == __dot_shfl_mode::__up)
+  {
+    __j = __lane - __lane_idx_offset;
+  }
+  else if (__shfl_mode == __dot_shfl_mode::__down)
+  {
+    __j = __lane + __lane_idx_offset;
+  }
+  else
+  {
+    __j = __lane ^ __lane_idx_offset;
+  }
+  auto __dst = __shfl_mode == __dot_shfl_mode::__up
+               ? (__j >= __max_lane ? __j : __lane) //
+               : (__j <= __max_lane ? __j : __lane);
+  return (1u << __dst);
+}
+
+// Debug-mode sanity checks shared by all shfl_sync_* overloads.
+template <typename _Tp>
+_CCCL_DEVICE static inline void __shfl_sync_checks(
+  __dot_shfl_mode __shfl_mode,
+  _Tp,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask)
+{
+  static_assert(sizeof(_Tp) == 4, "shfl.sync only accepts 4-byte data types");
+  if (__shfl_mode != __dot_shfl_mode::__idx)
+  {
+    _CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
+  }
+  _CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
+               "clamp value + segmentation mask must use the bit positions [0:4] and [8:12]");
+  _CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
+  _CCCL_ASSERT(
+    ::cuda::ptx::__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
+    "the destination lane must be a member of the lane mask");
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_idx(
+  _Tp __data,
+  bool& __pred,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__idx, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::int32_t __pred1;
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile(
+    "{                                         \n\t\t"
+    ".reg .pred p;                             \n\t\t"
+    "shfl.sync.idx.b32 %0|p, %2, %3, %4, %5;   \n\t\t"
+    "selp.s32 %1, 1, 0, p;                     \n\t"
+    "}"
+    : "=r"(__ret), "=r"(__pred1)
+    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  __pred = static_cast<bool>(__pred1);
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_idx(
+  _Tp __data,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__idx, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile("{                                       \n\t\t"
+               "shfl.sync.idx.b32 %0, %1, %2, %3, %4;   \n\t\t"
+               "}"
+               : "=r"(__ret)
+               : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_up(
+  _Tp __data,
+  bool& __pred,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__up, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::int32_t __pred1;
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile(
+    "{                                         \n\t\t"
+    ".reg .pred p;                             \n\t\t"
+    "shfl.sync.up.b32 %0|p, %2, %3, %4, %5;    \n\t\t"
+    "selp.s32 %1, 1, 0, p;                     \n\t"
+    "}"
+    : "=r"(__ret), "=r"(__pred1)
+    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  __pred = static_cast<bool>(__pred1);
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_up(
+  _Tp __data,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__up, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile("{                                       \n\t\t"
+               "shfl.sync.up.b32 %0, %1, %2, %3, %4;    \n\t\t"
+               "}"
+               : "=r"(__ret)
+               : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_down(
+  _Tp __data,
+  bool& __pred,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__down, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::int32_t __pred1;
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile(
+    "{                                         \n\t\t"
+    ".reg .pred p;                             \n\t\t"
+    "shfl.sync.down.b32 %0|p, %2, %3, %4, %5;  \n\t\t"
+    "selp.s32 %1, 1, 0, p;                     \n\t"
+    "}"
+    : "=r"(__ret), "=r"(__pred1)
+    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  __pred = static_cast<bool>(__pred1);
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_down(
+  _Tp __data,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__down, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile("{                                       \n\t\t"
+               "shfl.sync.down.b32 %0, %1, %2, %3, %4;  \n\t\t"
+               "}"
+               : "=r"(__ret)
+               : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_bfly(
+  _Tp __data,
+  bool& __pred,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__bfly, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::int32_t __pred1;
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile(
+    "{                                         \n\t\t"
+    ".reg .pred p;                             \n\t\t"
+    "shfl.sync.bfly.b32 %0|p, %2, %3, %4, %5;  \n\t\t"
+    "selp.s32 %1, 1, 0, p;                     \n\t"
+    "}"
+    : "=r"(__ret), "=r"(__pred1)
+    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  __pred = static_cast<bool>(__pred1);
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+template <typename _Tp>
+_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_bfly(
+  _Tp __data,
+  _CUDA_VSTD::uint32_t __lane_idx_offset,
+  _CUDA_VSTD::uint32_t __clamp_segmask,
+  _CUDA_VSTD::uint32_t __lane_mask) noexcept
+{
+  __shfl_sync_checks(__dot_shfl_mode::__bfly, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
+  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
+  _CUDA_VSTD::uint32_t __ret;
+  asm volatile( //
+    "{                                       \n\t\t"
+    "shfl.sync.bfly.b32 %0, %1, %2, %3, %4;  \n\t\t"
+    "}"
+    : "=r"(__ret)
+    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
+  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
+}
+
+#endif // __cccl_ptx_isa >= 600
+
+_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
+
+#endif // _CUDA_PTX_SHFL_SYNC_H
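Editorial note, not part of the patch: the overloads that take a ``bool& pred`` expose the destination predicate of ``shfl.sync``; it is false when the computed source lane falls outside the segment bounds, in which case the calling lane simply gets its own value back (this is what the tests below check). A minimal usage sketch, assuming a fully active 32-thread warp, follows; the kernel name and launch details are illustrative.

.. code:: cuda

   // Illustrative sketch only; assumes all 32 lanes of the warp are active.
   #include <cuda/ptx>
   #include <cstdio>

   __global__ void shfl_sync_demo()
   {
     constexpr unsigned full_mask = 0xFFFFFFFFu;
     unsigned value               = threadIdx.x * 10;

     // .idx: every lane reads lane 0 (clamp = 31 keeps the index in bounds).
     unsigned leader = cuda::ptx::shfl_sync_idx(value, 0u /*idx*/, 0b11111u /*clamp*/, full_mask);

     // .up: pull the value from one lane below; pred is false for lane 0,
     // which receives its own value back instead.
     bool pred;
     unsigned prev = cuda::ptx::shfl_sync_up(value, pred, 1u /*offset*/, 0u /*clamp*/, full_mask);

     if (threadIdx.x == 0)
     {
       printf("leader = %u, prev = %u, pred = %d\n", leader, prev, static_cast<int>(pred));
     }
   }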
diff --git a/libcudacxx/include/cuda/ptx b/libcudacxx/include/cuda/ptx
index 9b021262707..7855a42ed8c 100644
--- a/libcudacxx/include/cuda/ptx
+++ b/libcudacxx/include/cuda/ptx
@@ -91,6 +91,7 @@
 #include 
 #include 
 #include 
+#include <cuda/__ptx/instructions/shfl_sync.h>
 #include 
 #include 
 #include 
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.shfl.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.shfl.compile.pass.cpp
new file mode 100644
index 00000000000..64e4aee1c31
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.shfl.compile.pass.cpp
@@ -0,0 +1,150 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: clang && !nvcc
+
+// 
+
+#include 
+#include 
+
+__host__ __device__ void test_shfl_full_mask()
+{
+#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
+  constexpr unsigned FullMask = 0xFFFFFFFF;
+  auto data                   = threadIdx.x;
+  bool pred1, pred2, pred3, pred4;
+  auto res1 = cuda::ptx::shfl_sync_idx(data, pred1, 2 /*idx*/, 0b11111 /*clamp*/, FullMask);
+  assert(res1 == 2 && pred1);
+
+  auto res2 = cuda::ptx::shfl_sync_up(data, pred2, 2 /*offset*/, 0 /*clamp*/, FullMask);
+  if (threadIdx.x <= 1)
+  {
+    assert(res2 == threadIdx.x && !pred2);
+  }
+  else
+  {
+    assert(res2 == threadIdx.x - 2 && pred2);
+  }
+
+  auto res3 = cuda::ptx::shfl_sync_down(data, pred3, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
+  if (threadIdx.x >= 30)
+  {
+    assert(res3 == threadIdx.x && !pred3);
+  }
+  else
+  {
+    assert(res3 == threadIdx.x + 2 && pred3);
+  }
+
+  auto res4 = cuda::ptx::shfl_sync_bfly(data, pred4, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
+  assert(res4 == (threadIdx.x ^ 2) && pred4);
+#endif // __cccl_ptx_isa >= 600
+}
+
+__host__ __device__ void test_shfl_full_mask_no_pred()
+{
+#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
+  constexpr unsigned FullMask = 0xFFFFFFFF;
+  auto data                   = threadIdx.x;
+  auto res1 = cuda::ptx::shfl_sync_idx(data, 2 /*idx*/, 0b11111 /*clamp*/, FullMask);
+  assert(res1 == 2);
+
+  auto res2 = cuda::ptx::shfl_sync_up(data, 2 /*offset*/, 0 /*clamp*/, FullMask);
+  if (threadIdx.x <= 1)
+  {
+    assert(res2 == threadIdx.x);
+  }
+  else
+  {
+    assert(res2 == threadIdx.x - 2);
+  }
+
+  auto res3 = cuda::ptx::shfl_sync_down(data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
+  if (threadIdx.x >= 30)
+  {
+    assert(res3 == threadIdx.x);
+  }
+  else
+  {
+    assert(res3 == threadIdx.x + 2);
+  }
+
+  auto res4 = cuda::ptx::shfl_sync_bfly(data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask);
+  assert(res4 == (threadIdx.x ^ 2));
+#endif // __cccl_ptx_isa >= 600
+}
+
+__host__ __device__ void test_shfl_partial_mask()
+{
+#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
+  constexpr unsigned PartialMask = 0b1111;
+  auto data                      = threadIdx.x;
+  bool pred1;
+  if (threadIdx.x <= 3)
+  {
+    auto res1 = cuda::ptx::shfl_sync_idx(data, pred1, 2 /*idx*/, 0b11111 /*clamp*/, PartialMask);
+    assert(res1 == 2 && pred1);
+  }
+#endif // __cccl_ptx_isa >= 600
+}
+
+__host__ __device__ void test_shfl_partial_warp()
+{
+#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
+  constexpr unsigned FullMask = 0xFFFFFFFF;
+  unsigned max_lane_mask      = 16;
+  unsigned clamp              = 0b11111;
+  unsigned clamp_segmask      = (max_lane_mask << 8) | clamp;
+  auto data                   = threadIdx.x;
+  bool pred1, pred2, pred3, pred4;
+  auto res1 = cuda::ptx::shfl_sync_idx(data, pred1, 2 /*idx*/, clamp_segmask, FullMask);
+  if (threadIdx.x < 16)
+  {
+    assert(res1 == 2 && pred1);
+  }
+  else
+  {
+    assert(res1 == 16 + 2 && pred1);
+  }
+
+  auto res2 = cuda::ptx::shfl_sync_up(data, pred2, 2 /*offset*/, (max_lane_mask << 8), FullMask);
+  if (threadIdx.x <= 1 || threadIdx.x == 16 || threadIdx.x == 17)
+  {
+    assert(res2 == threadIdx.x && !pred2);
+  }
+  else
+  {
+    assert(res2 == threadIdx.x - 2 && pred2);
+  }
+
+  auto res3 = cuda::ptx::shfl_sync_down(data, pred3, 2 /*offset*/, clamp_segmask, FullMask);
+  if (threadIdx.x == 14 || threadIdx.x == 15 || threadIdx.x >= 30)
+  {
+    assert(res3 == threadIdx.x && !pred3);
+  }
+  else
+  {
+    assert(res3 == threadIdx.x + 2 && pred3);
+  }
+
+  auto res4 = cuda::ptx::shfl_sync_bfly(data, pred4, 2 /*offset*/, clamp_segmask, FullMask);
+  assert(res4 == (threadIdx.x ^ 2) && pred4);
+#endif // __cccl_ptx_isa >= 600
+}
+
+int main(int, char**)
+{
+  NV_IF_TARGET(NV_IS_HOST, cuda_thread_count = 32;)
+  test_shfl_full_mask();
+  test_shfl_full_mask_no_pred();
+  test_shfl_partial_mask();
+  test_shfl_partial_warp();
+  return 0;
+}
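Editorial note, not part of the patch: ``.bfly`` is the building block of the classic warp-wide reduction. The sketch below is illustrative only (kernel name, launch configuration, and host code are assumptions); it sums one value per lane across a fully active warp using ``cuda::ptx::shfl_sync_bfly``.

.. code:: cuda

   // Illustrative sketch only; not part of the patch.
   #include <cuda/ptx>
   #include <cstdio>

   __global__ void warp_reduce_demo()
   {
     constexpr unsigned full_mask = 0xFFFFFFFFu;
     unsigned sum = threadIdx.x; // one 32-bit value per lane

     // Butterfly reduction: after log2(32) = 5 rounds every lane holds the warp-wide sum.
     for (unsigned offset = 16; offset > 0; offset /= 2)
     {
       sum += cuda::ptx::shfl_sync_bfly(sum, offset, 0b11111u /*clamp*/, full_mask);
     }

     if (threadIdx.x == 0)
     {
       printf("warp sum = %u\n", sum); // 0 + 1 + ... + 31 = 496
     }
   }

   int main()
   {
     warp_reduce_demo<<<1, 32>>>();
     cudaDeviceSynchronize();
     return 0;
   }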