-
Notifications
You must be signed in to change notification settings - Fork 286
PTX shfl_sync
#3241
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
PTX shfl_sync
#3241
Changes from 1 commit
Commits
Show all changes
20 commits
Select commit
Hold shift + click to select a range
fee650c
ptx shfl_sync implementation
fbusato 24b8e28
add documentation and tests
fbusato db3edb9
Update libcudacxx/test/libcudacxx/cuda/ptx/manual/shfl_test.h
fbusato eb6df1d
move documentation file
fbusato c0be178
use template parameter for input data
fbusato f0caaa9
fix return type
fbusato 969df92
update docs
fbusato d957522
modify return value type
fbusato f683984
Merge branch 'main' into ptx-shuffle
fbusato 9e577a0
Merge branch 'main' into ptx-shuffle
miscco c17b109
copyright update
fbusato cbb9d84
refactor to better match PTX generator
fbusato 3c1a37a
copyright update
fbusato 9804923
fix comparison of integers of different signs
fbusato 9258523
fix documentation
fbusato 5c38e1e
change function names to match ptx
fbusato f237b19
Merge branch 'main' into ptx-shuffle
fbusato 69dc225
NIT
fbusato 9acb0d7
move manual/shfl_test.h
fbusato 1b4798d
recover correct instructions.rst header
fbusato File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,136 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of libcu++, the C++ Standard Library for your entire system, | ||
| // under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #if !defined(_CUDA_PTX_SHFL_SYNC_H) | ||
| # define _CUDA_PTX_SHFL_SYNC_H | ||
|
|
||
| # include <cuda/std/detail/__config> | ||
|
|
||
| # if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
| # pragma GCC system_header | ||
| # elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
| # pragma clang system_header | ||
| # elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
| # pragma system_header | ||
| # endif // no system header | ||
|
|
||
| # if _CCCL_STD_VER >= 2017 | ||
|
|
||
| # include <cuda/__ptx/instructions/get_sreg.h> | ||
| # include <cuda/__ptx/ptx_dot_variants.h> | ||
| # include <cuda/std/__type_traits/is_integral.h> | ||
| # include <cuda/std/__type_traits/is_signed.h> | ||
| # include <cuda/std/cstdint> | ||
|
|
||
| # include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends | ||
|
|
||
| _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX | ||
|
|
||
| # if __cccl_ptx_isa >= 600 | ||
|
|
||
| struct shfl_return_values | ||
| { | ||
| _CUDA_VSTD::uint32_t __data; | ||
| _CUDA_VSTD::int32_t __pred; | ||
| }; | ||
|
|
||
| template <dot_shfl_mode _ShuffleMode> | ||
| _CCCL_NODISCARD _CCCL_DEVICE static inline shfl_return_values shfl_sync( | ||
| shfl_mode_t<_ShuffleMode> __shfl_mode, | ||
| _CUDA_VSTD::uint32_t __data, | ||
| _CUDA_VSTD::uint32_t __lane_idx_offset, | ||
| _CUDA_VSTD::uint32_t __clamp_segmask, | ||
| _CUDA_VSTD::uint32_t __lane_mask) noexcept | ||
| { | ||
| _CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size"); | ||
| _CCCL_ASSERT(__clamp_segmask <= 0b111111111111, "clamp value + segmentation mask must be less or equal than 12 bits"); | ||
| _CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask"); | ||
| # if defined(_CCCL_ENABLE_DEBUG_MODE) | ||
| auto __lane = get_sreg_laneid(); | ||
| auto __clamp = __clamp_segmask & 0b11111; | ||
| auto __segmask = __clamp_segmask >> 8; | ||
| auto __max_lane = (__lane & __segmask) | (__clamp & __segmask); | ||
| _CUDA_VSTD::uint32_t __dst = 0; | ||
| # endif | ||
| _CUDA_VSTD::int32_t __pred; | ||
| _CUDA_VSTD::uint32_t __ret; | ||
| if constexpr (__shfl_mode == shfl_mode_idx) | ||
| { | ||
| # if defined(_CCCL_ENABLE_DEBUG_MODE) | ||
| auto __min_lane = (__lane & __segmask); | ||
| auto __j = __min_lane | (__lane_idx_offset & ~__segmask); | ||
| __dst = __j <= __max_lane ? __j : __lane; | ||
| # endif | ||
| asm volatile( | ||
| "{ \n\t\t" | ||
| ".reg .pred p; \n\t\t" | ||
| "shfl_sync.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t" | ||
| "selp.s32 %1, 1, 0, p; \n\t" | ||
| "}" | ||
| : "=r"(__ret), "=r"(__pred) | ||
| : "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask)); | ||
| } | ||
| else if constexpr (__shfl_mode == shfl_mode_up) | ||
| { | ||
| # if defined(_CCCL_ENABLE_DEBUG_MODE) | ||
| auto __j = __lane - __lane_idx_offset; | ||
| __dst = __j >= __max_lane ? __j : __lane; | ||
| # endif | ||
fbusato marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| asm volatile( | ||
| "{ \n\t\t" | ||
| ".reg .pred p; \n\t\t" | ||
| "shfl_sync.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t" | ||
| "selp.s32 %1, 1, 0, p; \n\t" | ||
| "}" | ||
| : "=r"(__ret), "=r"(__pred) | ||
| : "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask)); | ||
| } | ||
| else if constexpr (__shfl_mode == shfl_mode_down) | ||
| { | ||
| # if defined(_CCCL_ENABLE_DEBUG_MODE) | ||
| auto __j = __lane + __lane_idx_offset; | ||
| __dst = __j <= __max_lane ? __j : __lane; | ||
| # endif | ||
| asm volatile( | ||
| "{ \n\t\t" | ||
| ".reg .pred p; \n\t\t" | ||
| "shfl_sync.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t" | ||
| "selp.s32 %1, 1, 0, p; \n\t" | ||
| "}" | ||
| : "=r"(__ret), "=r"(__pred) | ||
| : "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask)); | ||
| } | ||
| else | ||
| { | ||
| # if defined(_CCCL_ENABLE_DEBUG_MODE) | ||
| auto __j = __lane ^ __lane_idx_offset; | ||
| __dst = __j <= __max_lane ? __j : __lane; | ||
| # endif | ||
| asm volatile( | ||
| "{ \n\t\t" | ||
| ".reg .pred p; \n\t\t" | ||
| "shfl_sync.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t" | ||
| "selp.s32 %1, 1, 0, p; \n\t" | ||
| "}" | ||
| : "=r"(__ret), "=r"(__pred) | ||
| : "r"(__data), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask)); | ||
| } | ||
| # if defined(_CCCL_ENABLE_DEBUG_MODE) | ||
| _CCCL_ASSERT((1 << __dst) & __lane_mask, "the destination lane must be a member of the lane mask"); | ||
| # endif | ||
| return shfl_return_values{__ret, __pred}; | ||
| } | ||
|
|
||
| # endif // __cccl_ptx_isa >= 600 | ||
|
|
||
| _LIBCUDACXX_END_NAMESPACE_CUDA_PTX | ||
|
|
||
| # endif // _CCCL_STD_VER >= 2017 | ||
| #endif // _CUDA_PTX_SHFL_SYNC_H | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.