Skip to content

Commit b1c370b

Browse files
fbusatodavebayer
authored andcommitted
PTX shfl_sync (NVIDIA#3241)
1 parent f73dfa3 commit b1c370b

File tree

6 files changed

+513
-2
lines changed

6 files changed

+513
-2
lines changed

docs/libcudacxx/ptx/instructions.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ PTX Instructions
2929
instructions/multimem_red
3030
instructions/multimem_st
3131
instructions/red_async
32+
instructions/shfl_sync
3233
instructions/st_async
3334
instructions/st_bulk
3435
instructions/tcgen05_alloc
@@ -251,8 +252,8 @@ Instructions by section
251252
- No
252253
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
253254
- No
254-
* - `shfl.s <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
255-
- No
255+
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
256+
- Yes, CCCL 2.9.0 / CUDA 12.9
256257
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
257258
- No
258259
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
2+
shfl.sync
3+
^^^^^^^^^
4+
5+
.. code:: cuda
6+
7+
// PTX ISA 6.0
8+
// shfl.sync.mode.b32 d[|p], a, b, c, membermask;
9+
// .mode = { .up, .down, .bfly, .idx };
10+
11+
template<typename T>
12+
[[nodiscard]] __device__ static inline
13+
T shfl_sync_idx(T data,
14+
uint32_t lane_idx_offset,
15+
uint32_t clamp_segmask,
16+
uint32_t lane_mask) noexcept;
17+
18+
template<typename T>
19+
[[nodiscard]] __device__ static inline
20+
T shfl_sync_idx(T data,
21+
bool& pred,
22+
uint32_t lane_idx_offset,
23+
uint32_t clamp_segmask,
24+
uint32_t lane_mask) noexcept;
25+
26+
template<typename T>
27+
[[nodiscard]] __device__ static inline
28+
T shfl_sync_up(T data,
29+
uint32_t lane_idx_offset,
30+
uint32_t clamp_segmask,
31+
uint32_t lane_mask) noexcept;
32+
33+
template<typename T>
34+
[[nodiscard]] __device__ static inline
35+
T shfl_sync_up(T data,
36+
bool& pred,
37+
uint32_t lane_idx_offset,
38+
uint32_t clamp_segmask,
39+
uint32_t lane_mask) noexcept;
40+
41+
template<typename T>
42+
[[nodiscard]] __device__ static inline
43+
T shfl_sync_down(T data,
44+
uint32_t lane_idx_offset,
45+
uint32_t clamp_segmask,
46+
uint32_t lane_mask) noexcept;
47+
48+
template<typename T>
49+
[[nodiscard]] __device__ static inline
50+
T shfl_sync_down(T data,
51+
bool& pred,
52+
uint32_t lane_idx_offset,
53+
uint32_t clamp_segmask,
54+
uint32_t lane_mask) noexcept;
55+
56+
template<typename T>
57+
[[nodiscard]] __device__ static inline
58+
T shfl_sync_bfly(T data,
59+
uint32_t lane_idx_offset,
60+
uint32_t clamp_segmask,
61+
uint32_t lane_mask) noexcept;
62+
63+
template<typename T>
64+
[[nodiscard]] __device__ static inline
65+
T shfl_sync_bfly(T data,
66+
bool& pred,
67+
uint32_t lane_idx_offset,
68+
uint32_t clamp_segmask,
69+
uint32_t lane_mask) noexcept;
70+
71+
**Constrains and checks**
72+
73+
- ``T`` must have 32-bit size (compile-time)
74+
- ``lane_idx_offset`` must be less than the warp size (debug mode)
75+
- ``clamp_segmask`` must use the bit positions [0:4] and [8:12] (debug mode)
76+
- ``lane_mask`` must be a subset of the active mask (debug mode)
77+
- The destination lane must be a member of the ``lane_mask`` (debug mode)
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
.. _libcudacxx-ptx-instructions-shfl_sync:
3+
4+
shfl.sync
5+
=========
6+
7+
- PTX ISA:
8+
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
9+
10+
.. include:: generated/shfl_sync.rst
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of libcu++, the C++ Standard Library for your entire system,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#ifndef _CUDA_PTX_SHFL_SYNC_H
12+
#define _CUDA_PTX_SHFL_SYNC_H
13+
14+
#include <cuda/std/detail/__config>
15+
16+
#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
17+
# pragma GCC system_header
18+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
19+
# pragma clang system_header
20+
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
21+
# pragma system_header
22+
#endif // no system header
23+
24+
#include <cuda/__ptx/instructions/get_sreg.h>
25+
#include <cuda/__ptx/ptx_dot_variants.h>
26+
#include <cuda/std/__bit/bit_cast.h>
27+
#include <cuda/std/cstdint>
28+
29+
#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends
30+
31+
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
32+
33+
#if __cccl_ptx_isa >= 600
34+
35+
enum class __dot_shfl_mode
36+
{
37+
__up,
38+
__down,
39+
__bfly,
40+
__idx
41+
};
42+
43+
[[maybe_unused]]
44+
_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
45+
__dot_shfl_mode __shfl_mode,
46+
_CUDA_VSTD::uint32_t __lane_idx_offset,
47+
_CUDA_VSTD::uint32_t __clamp_segmask,
48+
_CUDA_VSTD::uint32_t __lane_mask)
49+
{
50+
auto __lane = ::cuda::ptx::get_sreg_laneid();
51+
auto __clamp = __clamp_segmask & 0b11111;
52+
auto __segmask = __clamp_segmask >> 8;
53+
auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
54+
_CUDA_VSTD::uint32_t __j = 0;
55+
if (__shfl_mode == __dot_shfl_mode::__idx)
56+
{
57+
auto __min_lane = __lane & __clamp;
58+
__j = __min_lane | (__lane_idx_offset & ~__segmask);
59+
}
60+
else if (__shfl_mode == __dot_shfl_mode::__up)
61+
{
62+
__j = __lane - __lane_idx_offset;
63+
}
64+
else if (__shfl_mode == __dot_shfl_mode::__down)
65+
{
66+
__j = __lane + __lane_idx_offset;
67+
}
68+
else
69+
{
70+
__j = __lane ^ __lane_idx_offset;
71+
}
72+
auto __dst = __shfl_mode == __dot_shfl_mode::__up
73+
? (__j >= __max_lane ? __j : __lane) //
74+
: (__j <= __max_lane ? __j : __lane);
75+
return (1u << __dst);
76+
}
77+
78+
template <typename _Tp>
79+
_CCCL_DEVICE static inline void __shfl_sync_checks(
80+
__dot_shfl_mode __shfl_mode,
81+
_Tp,
82+
_CUDA_VSTD::uint32_t __lane_idx_offset,
83+
_CUDA_VSTD::uint32_t __clamp_segmask,
84+
_CUDA_VSTD::uint32_t __lane_mask)
85+
{
86+
static_assert(sizeof(_Tp) == 4, "shfl.sync only accepts 4-byte data types");
87+
if (__shfl_mode != __dot_shfl_mode::__idx)
88+
{
89+
_CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
90+
}
91+
_CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
92+
"clamp value + segmentation mask must use the bit positions [0:4] and [8:12]");
93+
_CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
94+
_CCCL_ASSERT(
95+
::cuda::ptx::__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
96+
"the destination lane must be a member of the lane mask");
97+
}
98+
99+
template <typename _Tp>
100+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_idx(
101+
_Tp __data,
102+
bool& __pred,
103+
_CUDA_VSTD::uint32_t __lane_idx_offset,
104+
_CUDA_VSTD::uint32_t __clamp_segmask,
105+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
106+
{
107+
__shfl_sync_checks(__dot_shfl_mode::__idx, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
108+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
109+
_CUDA_VSTD::int32_t __pred1;
110+
_CUDA_VSTD::uint32_t __ret;
111+
asm volatile(
112+
"{ \n\t\t"
113+
".reg .pred p; \n\t\t"
114+
"shfl.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
115+
"selp.s32 %1, 1, 0, p; \n\t"
116+
"}"
117+
: "=r"(__ret), "=r"(__pred1)
118+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
119+
__pred = static_cast<bool>(__pred1);
120+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
121+
}
122+
123+
template <typename _Tp>
124+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_idx(
125+
_Tp __data,
126+
_CUDA_VSTD::uint32_t __lane_idx_offset,
127+
_CUDA_VSTD::uint32_t __clamp_segmask,
128+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
129+
{
130+
__shfl_sync_checks(__dot_shfl_mode::__idx, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
131+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
132+
_CUDA_VSTD::uint32_t __ret;
133+
asm volatile("{ \n\t\t"
134+
"shfl.sync.idx.b32 %0, %1, %2, %3, %4; \n\t\t"
135+
"}"
136+
: "=r"(__ret)
137+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
138+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
139+
}
140+
141+
template <typename _Tp>
142+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_up(
143+
_Tp __data,
144+
bool& __pred,
145+
_CUDA_VSTD::uint32_t __lane_idx_offset,
146+
_CUDA_VSTD::uint32_t __clamp_segmask,
147+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
148+
{
149+
__shfl_sync_checks(__dot_shfl_mode::__up, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
150+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
151+
_CUDA_VSTD::int32_t __pred1;
152+
_CUDA_VSTD::uint32_t __ret;
153+
asm volatile(
154+
"{ \n\t\t"
155+
".reg .pred p; \n\t\t"
156+
"shfl.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
157+
"selp.s32 %1, 1, 0, p; \n\t"
158+
"}"
159+
: "=r"(__ret), "=r"(__pred1)
160+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
161+
__pred = static_cast<bool>(__pred1);
162+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
163+
}
164+
165+
template <typename _Tp>
166+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_up(
167+
_Tp __data,
168+
_CUDA_VSTD::uint32_t __lane_idx_offset,
169+
_CUDA_VSTD::uint32_t __clamp_segmask,
170+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
171+
{
172+
__shfl_sync_checks(__dot_shfl_mode::__up, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
173+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
174+
_CUDA_VSTD::uint32_t __ret;
175+
asm volatile("{ \n\t\t"
176+
"shfl.sync.up.b32 %0, %1, %2, %3, %4; \n\t\t"
177+
"}"
178+
: "=r"(__ret)
179+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
180+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
181+
}
182+
183+
template <typename _Tp>
184+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_down(
185+
_Tp __data,
186+
bool& __pred,
187+
_CUDA_VSTD::uint32_t __lane_idx_offset,
188+
_CUDA_VSTD::uint32_t __clamp_segmask,
189+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
190+
{
191+
__shfl_sync_checks(__dot_shfl_mode::__down, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
192+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
193+
_CUDA_VSTD::int32_t __pred1;
194+
_CUDA_VSTD::uint32_t __ret;
195+
asm volatile(
196+
"{ \n\t\t"
197+
".reg .pred p; \n\t\t"
198+
"shfl.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
199+
"selp.s32 %1, 1, 0, p; \n\t"
200+
"}"
201+
: "=r"(__ret), "=r"(__pred1)
202+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
203+
__pred = static_cast<bool>(__pred1);
204+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
205+
}
206+
207+
template <typename _Tp>
208+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_down(
209+
_Tp __data,
210+
_CUDA_VSTD::uint32_t __lane_idx_offset,
211+
_CUDA_VSTD::uint32_t __clamp_segmask,
212+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
213+
{
214+
__shfl_sync_checks(__dot_shfl_mode::__down, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
215+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
216+
_CUDA_VSTD::uint32_t __ret;
217+
asm volatile("{ \n\t\t"
218+
"shfl.sync.down.b32 %0, %1, %2, %3, %4; \n\t\t"
219+
"}"
220+
: "=r"(__ret)
221+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
222+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
223+
}
224+
225+
template <typename _Tp>
226+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_bfly(
227+
_Tp __data,
228+
bool& __pred,
229+
_CUDA_VSTD::uint32_t __lane_idx_offset,
230+
_CUDA_VSTD::uint32_t __clamp_segmask,
231+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
232+
{
233+
__shfl_sync_checks(__dot_shfl_mode::__bfly, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
234+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
235+
_CUDA_VSTD::int32_t __pred1;
236+
_CUDA_VSTD::uint32_t __ret;
237+
asm volatile(
238+
"{ \n\t\t"
239+
".reg .pred p; \n\t\t"
240+
"shfl.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
241+
"selp.s32 %1, 1, 0, p; \n\t"
242+
"}"
243+
: "=r"(__ret), "=r"(__pred1)
244+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
245+
__pred = static_cast<bool>(__pred1);
246+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
247+
}
248+
249+
template <typename _Tp>
250+
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_bfly(
251+
_Tp __data,
252+
_CUDA_VSTD::uint32_t __lane_idx_offset,
253+
_CUDA_VSTD::uint32_t __clamp_segmask,
254+
_CUDA_VSTD::uint32_t __lane_mask) noexcept
255+
{
256+
__shfl_sync_checks(__dot_shfl_mode::__bfly, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
257+
auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
258+
_CUDA_VSTD::uint32_t __ret;
259+
asm volatile( //
260+
"{ \n\t\t"
261+
"shfl.sync.bfly.b32 %0, %1, %2, %3, %4; \n\t\t"
262+
"}"
263+
: "=r"(__ret)
264+
: "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
265+
return _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__ret);
266+
}
267+
268+
#endif // __cccl_ptx_isa >= 600
269+
270+
_LIBCUDACXX_END_NAMESPACE_CUDA_PTX
271+
272+
#endif // _CUDA_PTX_SHFL_SYNC_H

libcudacxx/include/cuda/ptx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@
9191
#include <cuda/__ptx/instructions/multimem_red.h>
9292
#include <cuda/__ptx/instructions/multimem_st.h>
9393
#include <cuda/__ptx/instructions/red_async.h>
94+
#include <cuda/__ptx/instructions/shfl_sync.h>
9495
#include <cuda/__ptx/instructions/st_async.h>
9596
#include <cuda/__ptx/instructions/st_bulk.h>
9697
#include <cuda/__ptx/instructions/tcgen05_alloc.h>

0 commit comments

Comments
 (0)