Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docs/libcudacxx/ptx/instructions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ PTX Instructions
instructions/multimem_red
instructions/multimem_st
instructions/red_async
instructions/shfl_sync
instructions/st_async
instructions/st_bulk
instructions/tcgen05_alloc
Expand Down Expand Up @@ -251,8 +252,8 @@ Instructions by section
- No
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
- No
* - `shfl.s <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- Yes, CCCL 2.9.0 / CUDA 12.9
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
- No
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
Expand Down
77 changes: 77 additions & 0 deletions docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@

shfl.sync
^^^^^^^^^

.. code:: cuda

// PTX ISA 6.0
// shfl.sync.mode.b32 d[|p], a, b, c, membermask;
// .mode = { .up, .down, .bfly, .idx };

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_idx(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_idx(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_up(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_up(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_down(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_down(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_bfly(T data,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync_bfly(T data,
bool& pred,
uint32_t lane_idx_offset,
uint32_t clamp_segmask,
uint32_t lane_mask) noexcept;

**Constraints and checks**

- ``T`` must have 32-bit size (compile-time)
- ``lane_idx_offset`` must be less than the warp size (debug mode)
- ``clamp_segmask`` must use the bit positions [0:4] and [8:12] (debug mode)
- ``lane_mask`` must be a subset of the active mask (debug mode)
- The destination lane must be a member of the ``lane_mask`` (debug mode)
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/shfl_sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

.. _libcudacxx-ptx-instructions-shfl_sync:

shfl.sync
=========

- PTX ISA:
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__

.. include:: generated/shfl_sync.rst
272 changes: 272 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/shfl_sync.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_SHFL_SYNC_H
#define _CUDA_PTX_SHFL_SYNC_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/__ptx/instructions/get_sreg.h>
#include <cuda/__ptx/ptx_dot_variants.h>
#include <cuda/std/__bit/bit_cast.h>
#include <cuda/std/cstdint>

#include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

#if __cccl_ptx_isa >= 600

// Internal tag mirroring the PTX `.mode` qualifier of `shfl.sync.mode.b32`
// (`.up`, `.down`, `.bfly`, `.idx`); selects which source-lane computation
// the wrappers below perform/validate.
enum class __dot_shfl_mode
{
  __up,
  __down,
  __bfly,
  __idx
};

// Debug-mode helper: recomputes in C++ the source lane that a shfl.sync with
// the given operands would select for the calling lane, and returns it as a
// one-hot bit mask (1u << lane) so the caller can test membership against a
// lane mask. The decoding mirrors the PTX shfl.sync operand layout: bits
// [0:4] of __clamp_segmask are the clamp value, bits [8:12] the segment mask.
[[maybe_unused]]
_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
  __dot_shfl_mode __shfl_mode,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask)
{
  auto __lane    = ::cuda::ptx::get_sreg_laneid(); // calling lane id, [0, 32)
  auto __clamp   = __clamp_segmask & 0b11111; // clamp value: bits [0:4]
  auto __segmask = __clamp_segmask >> 8; // segment mask: bits [8:12]
  // Per-segment upper (lower for .up) bound on the selected lane.
  auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
  _CUDA_VSTD::uint32_t __j = 0;
  if (__shfl_mode == __dot_shfl_mode::__idx)
  {
    // .idx: keep the segment bits of the own lane, take the rest from the
    // supplied lane index.
    auto __min_lane = __lane & __clamp;
    __j             = __min_lane | (__lane_idx_offset & ~__segmask);
  }
  else if (__shfl_mode == __dot_shfl_mode::__up)
  {
    // NOTE(review): unsigned subtraction wraps when the offset exceeds the
    // lane id; __j then becomes huge, passes the >= __max_lane test below,
    // and the final shift is out of range — confirm against the PTX
    // semantics for out-of-range .up sources.
    __j = __lane - __lane_idx_offset;
  }
  else if (__shfl_mode == __dot_shfl_mode::__down)
  {
    __j = __lane + __lane_idx_offset;
  }
  else
  {
    // .bfly: butterfly exchange via XOR of the lane id.
    __j = __lane ^ __lane_idx_offset;
  }
  // .up selects __j only when it did not step below __max_lane; all other
  // modes select __j only when it did not step above it. Otherwise the lane
  // reads its own value.
  auto __dst = __shfl_mode == __dot_shfl_mode::__up
               ? (__j >= __max_lane ? __j : __lane) //
               : (__j <= __max_lane ? __j : __lane);
  return (1u << __dst); // one-hot mask of the selected source lane
}

// Shared validation for all shfl_sync_* overloads.
// Compile-time: the data type must be exactly 4 bytes (shfl.sync is b32).
// Debug mode (_CCCL_ASSERT): operand ranges and lane-mask consistency.
template <typename _Tp>
_CCCL_DEVICE static inline void __shfl_sync_checks(
  __dot_shfl_mode __shfl_mode,
  _Tp,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask)
{
  static_assert(sizeof(_Tp) == 4, "shfl.sync only accepts 4-byte data types");
  // .idx mode is exempt from the range check below — presumably because the
  // index is masked/clamped by the instruction itself; verify vs. PTX spec.
  if (__shfl_mode != __dot_shfl_mode::__idx)
  {
    _CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
  }
  // Only bits [0:4] (clamp) and [8:12] (segment mask) may be set.
  _CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
               "clamp value + segmentation mask must use the bit positions [0:4] and [8:12]");
  // Every lane named in the mask must actually be converged here.
  _CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
  // The lane this thread would read from must itself participate.
  _CCCL_ASSERT(
    ::cuda::ptx::__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
    "the destination lane must be a member of the lane mask");
}

// shfl.sync.idx.b32 (with predicate): returns the 32-bit __data value held by
// the source lane selected from __lane_idx_offset/__clamp_segmask per the PTX
// .idx rules, synchronizing the lanes in __lane_mask. __pred receives the PTX
// destination predicate `d|p`: true iff the selected source lane was in range.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4; __clamp_segmask
// uses only bits [0:4] and [8:12]; __lane_mask is a subset of the active mask
// and contains the selected source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_idx(
  _Tp __data,
  bool& __pred,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__idx, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::int32_t __pred1;
  _CUDA_VSTD::uint32_t __ret;
  asm volatile(
    "{ \n\t\t"
    ".reg .pred p; \n\t\t"
    "shfl.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
    "selp.s32 %1, 1, 0, p; \n\t" // materialize the predicate as 0/1
    "}"
    : "=r"(__ret), "=r"(__pred1)
    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  __pred = static_cast<bool>(__pred1);
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.idx.b32 (no predicate): returns the 32-bit __data value held by
// the source lane selected from __lane_idx_offset/__clamp_segmask per the PTX
// .idx rules, synchronizing the lanes in __lane_mask.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4; __clamp_segmask
// uses only bits [0:4] and [8:12]; __lane_mask is a subset of the active mask
// and contains the selected source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_idx(
  _Tp __data,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__idx, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::uint32_t __ret;
  asm volatile("{ \n\t\t"
               "shfl.sync.idx.b32 %0, %1, %2, %3, %4; \n\t\t"
               "}"
               : "=r"(__ret)
               : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.up.b32 (with predicate): returns the 32-bit __data value held by
// the lane __lane_idx_offset below the calling lane (clamped per the PTX .up
// rules and __clamp_segmask), synchronizing the lanes in __lane_mask. __pred
// receives the PTX destination predicate: true iff the source was in range.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4;
// __lane_idx_offset < 32; __clamp_segmask uses only bits [0:4] and [8:12];
// __lane_mask is a subset of the active mask and contains the source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_up(
  _Tp __data,
  bool& __pred,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__up, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::int32_t __pred1;
  _CUDA_VSTD::uint32_t __ret;
  asm volatile(
    "{ \n\t\t"
    ".reg .pred p; \n\t\t"
    "shfl.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
    "selp.s32 %1, 1, 0, p; \n\t" // materialize the predicate as 0/1
    "}"
    : "=r"(__ret), "=r"(__pred1)
    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  __pred = static_cast<bool>(__pred1);
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.up.b32 (no predicate): returns the 32-bit __data value held by
// the lane __lane_idx_offset below the calling lane (clamped per the PTX .up
// rules and __clamp_segmask), synchronizing the lanes in __lane_mask.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4;
// __lane_idx_offset < 32; __clamp_segmask uses only bits [0:4] and [8:12];
// __lane_mask is a subset of the active mask and contains the source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_up(
  _Tp __data,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__up, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::uint32_t __ret;
  asm volatile("{ \n\t\t"
               "shfl.sync.up.b32 %0, %1, %2, %3, %4; \n\t\t"
               "}"
               : "=r"(__ret)
               : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.down.b32 (with predicate): returns the 32-bit __data value held
// by the lane __lane_idx_offset above the calling lane (clamped per the PTX
// .down rules and __clamp_segmask), synchronizing the lanes in __lane_mask.
// __pred receives the PTX destination predicate: true iff the source was in
// range.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4;
// __lane_idx_offset < 32; __clamp_segmask uses only bits [0:4] and [8:12];
// __lane_mask is a subset of the active mask and contains the source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_down(
  _Tp __data,
  bool& __pred,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__down, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::int32_t __pred1;
  _CUDA_VSTD::uint32_t __ret;
  asm volatile(
    "{ \n\t\t"
    ".reg .pred p; \n\t\t"
    "shfl.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
    "selp.s32 %1, 1, 0, p; \n\t" // materialize the predicate as 0/1
    "}"
    : "=r"(__ret), "=r"(__pred1)
    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  __pred = static_cast<bool>(__pred1);
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.down.b32 (no predicate): returns the 32-bit __data value held by
// the lane __lane_idx_offset above the calling lane (clamped per the PTX
// .down rules and __clamp_segmask), synchronizing the lanes in __lane_mask.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4;
// __lane_idx_offset < 32; __clamp_segmask uses only bits [0:4] and [8:12];
// __lane_mask is a subset of the active mask and contains the source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_down(
  _Tp __data,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__down, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::uint32_t __ret;
  asm volatile("{ \n\t\t"
               "shfl.sync.down.b32 %0, %1, %2, %3, %4; \n\t\t"
               "}"
               : "=r"(__ret)
               : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.bfly.b32 (with predicate): butterfly exchange — returns the
// 32-bit __data value held by the lane whose id is the calling lane's id XOR
// __lane_idx_offset (bounded per the PTX .bfly rules and __clamp_segmask),
// synchronizing the lanes in __lane_mask. __pred receives the PTX destination
// predicate: true iff the source was in range.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4;
// __lane_idx_offset < 32; __clamp_segmask uses only bits [0:4] and [8:12];
// __lane_mask is a subset of the active mask and contains the source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_bfly(
  _Tp __data,
  bool& __pred,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__bfly, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::int32_t __pred1;
  _CUDA_VSTD::uint32_t __ret;
  asm volatile(
    "{ \n\t\t"
    ".reg .pred p; \n\t\t"
    "shfl.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
    "selp.s32 %1, 1, 0, p; \n\t" // materialize the predicate as 0/1
    "}"
    : "=r"(__ret), "=r"(__pred1)
    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  __pred = static_cast<bool>(__pred1);
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

// shfl.sync.bfly.b32 (no predicate): butterfly exchange — returns the 32-bit
// __data value held by the lane whose id is the calling lane's id XOR
// __lane_idx_offset (bounded per the PTX .bfly rules and __clamp_segmask),
// synchronizing the lanes in __lane_mask.
// Preconditions (see __shfl_sync_checks): sizeof(_Tp) == 4;
// __lane_idx_offset < 32; __clamp_segmask uses only bits [0:4] and [8:12];
// __lane_mask is a subset of the active mask and contains the source lane.
template <typename _Tp>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync_bfly(
  _Tp __data,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask) noexcept
{
  __shfl_sync_checks(__dot_shfl_mode::__bfly, __data, __lane_idx_offset, __clamp_segmask, __lane_mask);
  // shfl.sync moves b32 registers: transfer the raw bits, not the value.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::uint32_t __ret;
  asm volatile( //
    "{ \n\t\t"
    "shfl.sync.bfly.b32 %0, %1, %2, %3, %4; \n\t\t"
    "}"
    : "=r"(__ret)
    : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  // Fix: reinterpret the shuffled bits as _Tp. The original
  // bit_cast<uint32_t>(__ret) was a no-op whose result was then
  // *value*-converted to _Tp on return, corrupting e.g. float payloads.
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

#endif // __cccl_ptx_isa >= 600

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

#endif // _CUDA_PTX_SHFL_SYNC_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
#include <cuda/__ptx/instructions/multimem_red.h>
#include <cuda/__ptx/instructions/multimem_st.h>
#include <cuda/__ptx/instructions/red_async.h>
#include <cuda/__ptx/instructions/shfl_sync.h>
#include <cuda/__ptx/instructions/st_async.h>
#include <cuda/__ptx/instructions/st_bulk.h>
#include <cuda/__ptx/instructions/tcgen05_alloc.h>
Expand Down
Loading
Loading