-
Notifications
You must be signed in to change notification settings - Fork 321
Implement new tuning API arch dispatching #7093
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+309
−0
Merged
Changes from 7 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
b177f64
Implement new tuning API arch dispatching
bernhardmgruber 9f839cd
Drop concept check again
bernhardmgruber 1a6ca10
Better SM list handling
bernhardmgruber b816c44
Avoid unused warning
bernhardmgruber dce44ff
Avoid warning:
bernhardmgruber 4f63258
fix clang CUDA
bernhardmgruber 86baa14
naming convention
bernhardmgruber 404497a
Rename to policy_selector
bernhardmgruber c312b24
Reviewer feedback
bernhardmgruber 21da579
Merge branch 'main' into dispatch_arch
bernhardmgruber 3b769e0
Merge branch 'main' into dispatch_arch
bernhardmgruber File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
|
||
| #pragma once | ||
|
|
||
| #include <cub/config.cuh> | ||
|
|
||
| #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
| # pragma GCC system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
| # pragma clang system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
| # pragma system_header | ||
| #endif // no system header | ||
|
|
||
| #include <cuda/__device/arch_id.h> | ||
| #include <cuda/std/__type_traits/is_empty.h> | ||
| #include <cuda/std/__utility/forward.h> | ||
| #include <cuda/std/__utility/integer_sequence.h> | ||
| #include <cuda/std/array> | ||
|
|
||
| CUB_NAMESPACE_BEGIN | ||
|
|
||
| namespace detail | ||
| { | ||
| #if !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) | ||
|
|
||
| # if _CCCL_STD_VER < 2020 | ||
| template <typename ArchPolicies, ::cuda::arch_id LowestArchId> | ||
| struct policy_getter_17 | ||
| { | ||
| ArchPolicies arch_policies; | ||
|
|
||
| _CCCL_API _CCCL_FORCEINLINE constexpr auto operator()() const | ||
| { | ||
| return arch_policies(LowestArchId); | ||
| } | ||
| }; | ||
|
|
||
| template <typename ArchPolicies, size_t N> | ||
| _CCCL_API constexpr auto find_lowest_arch_with_same_policy( | ||
| ArchPolicies arch_policies, size_t i, const ::cuda::std::array<::cuda::arch_id, N>& all_arches) -> ::cuda::arch_id | ||
| { | ||
| const auto policy = arch_policies(all_arches[i]); | ||
| while (i > 0 && arch_policies(all_arches[i - 1]) == policy) | ||
| { | ||
| --i; | ||
| } | ||
| return all_arches[i]; | ||
| } | ||
|
|
||
| template <int ArchMult, typename CudaArches, typename ArchPolicies, size_t... Is> | ||
| struct lowest_arch_resolver; | ||
|
|
||
| // we keep the compile-time build up of the mapping table outside a template parameterized by a user-provided callable | ||
| template <int ArchMult, int... CudaArches, typename ArchPolicies, size_t... Is> | ||
| struct lowest_arch_resolver<ArchMult, ::cuda::std::integer_sequence<int, CudaArches...>, ArchPolicies, Is...> | ||
| { | ||
| static_assert(::cuda::std::is_empty_v<ArchPolicies>); | ||
| static_assert(sizeof...(CudaArches) == sizeof...(Is)); | ||
|
|
||
| using policy_t = decltype(ArchPolicies{}(::cuda::arch_id{})); | ||
|
|
||
| static constexpr ::cuda::arch_id all_arches[sizeof...(Is)] = {::cuda::arch_id{(CudaArches * ArchMult) / 10}...}; | ||
| static constexpr policy_t all_policies[sizeof...(Is)] = {ArchPolicies{}(all_arches[Is])...}; | ||
|
|
||
| _CCCL_API static constexpr auto find_lowest(size_t i) -> ::cuda::arch_id | ||
| { | ||
| const auto& policy = all_policies[i]; | ||
| while (i > 0 && policy == all_policies[i - 1]) | ||
| { | ||
| --i; | ||
| } | ||
| return all_arches[i]; | ||
| } | ||
|
|
||
| static constexpr ::cuda::arch_id lowest_arch_with_same_policy[sizeof...(Is)] = {find_lowest(Is)...}; | ||
| }; | ||
| # endif // if _CCCL_STD_VER < 2020 | ||
|
|
||
| template <int ArchMult, int... CudaArches, typename ArchPolicies, typename FunctorT, size_t... Is> | ||
| CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_to_arch_list( | ||
| ArchPolicies arch_policies, ::cuda::arch_id device_arch, FunctorT&& f, ::cuda::std::index_sequence<Is...>) | ||
| { | ||
| _CCCL_ASSERT(((device_arch == ::cuda::arch_id{(CudaArches * ArchMult) / 10}) || ...), | ||
| "device_arch must appear in the list of architectures compiled for"); | ||
|
|
||
| using policy_t = decltype(arch_policies(::cuda::arch_id{})); | ||
|
|
||
| cudaError_t e = cudaErrorInvalidDeviceFunction; | ||
| # if _CCCL_STD_VER >= 2020 | ||
| // In C++20, we just create an integral_constant holding the policy, because policies are structural types in C++20. | ||
| // This causes f to be only instantiated for each distinct policy, since the same policy for different arches results | ||
| // in the same integral_constant type passed to f | ||
| (..., | ||
| (device_arch == ::cuda::arch_id{(CudaArches * ArchMult) / 10} | ||
| ? (e = f(::cuda::std::integral_constant<policy_t, arch_policies(::cuda::arch_id{(CudaArches * ArchMult) / 10})>{})) | ||
| : cudaSuccess)); | ||
| # else // if _CCCL_STD_VER >= 2020 | ||
| // In C++17, we have to collapse architectures with the same policies ourselves, so we instantiate call_for_arch once | ||
| // per policy on the lowest ArchId which produces the same policy | ||
| using resolver_t = | ||
| lowest_arch_resolver<ArchMult, ::cuda::std::integer_sequence<int, CudaArches...>, ArchPolicies, Is...>; | ||
| (..., | ||
| (device_arch == ::cuda::arch_id{(CudaArches * ArchMult) / 10} | ||
| ? (e = f(policy_getter_17<ArchPolicies, resolver_t::lowest_arch_with_same_policy[Is]>{arch_policies})) | ||
| : cudaSuccess)); | ||
|
|
||
| # endif // if _CCCL_STD_VER >= 2020 | ||
| return e; | ||
| } | ||
|
|
||
| template <typename ArchPolicies, typename FunctorT, size_t... Is> | ||
| CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch_all_arches_helper( | ||
| ArchPolicies arch_policies, ::cuda::arch_id device_arch, FunctorT&& f, ::cuda::std::index_sequence<Is...> seq) | ||
| { | ||
| static constexpr auto all_arches = ::cuda::__all_arch_ids(); | ||
| return dispatch_to_arch_list<10, static_cast<int>(all_arches[Is])...>(arch_policies, device_arch, f, seq); | ||
| } | ||
|
|
||
| //! Takes a policy hub and instantiates f with the minimum possible number of nullary functor types that return a policy | ||
| //! at compile-time (if possible), and then calls the appropriate instantiation based on a runtime GPU architecture. | ||
| //! Depending on the used compiler, C++ standard, and available macros, a different number of instantiations may be | ||
| //! produced. | ||
| template <typename ArchPolicies, typename F> | ||
| CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t | ||
| dispatch_arch(ArchPolicies arch_policies, ::cuda::arch_id device_arch, F&& f) | ||
| { | ||
| // if we have __CUDA_ARCH_LIST__ or NV_TARGET_SM_INTEGER_LIST, we only poll the policy hub for those arches. | ||
| # ifdef __CUDA_ARCH_LIST__ | ||
| [[maybe_unused]] static constexpr auto arch_seq = ::cuda::std::integer_sequence<int, __CUDA_ARCH_LIST__>{}; | ||
| return dispatch_to_arch_list<1, __CUDA_ARCH_LIST__>( | ||
| arch_policies, device_arch, ::cuda::std::forward<F>(f), ::cuda::std::make_index_sequence<arch_seq.size()>{}); | ||
| # elif defined(NV_TARGET_SM_INTEGER_LIST) | ||
| [[maybe_unused]] static constexpr auto arch_seq = ::cuda::std::integer_sequence<int, NV_TARGET_SM_INTEGER_LIST>{}; | ||
| return dispatch_to_arch_list<10, NV_TARGET_SM_INTEGER_LIST>( | ||
| arch_policies, device_arch, ::cuda::std::forward<F>(f), ::cuda::std::make_index_sequence<arch_seq.size()>{}); | ||
| # else | ||
| // some compilers don't tell us what arches we are compiling for, so we test all of them | ||
| return dispatch_all_arches_helper( | ||
| arch_policies, | ||
| device_arch, | ||
| ::cuda::std::forward<F>(f), | ||
| ::cuda::std::make_index_sequence<::cuda::__all_arch_ids().size()>{}); | ||
| # endif | ||
| } | ||
|
|
||
| #else // !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) | ||
|
|
||
| // if we are compiling CCCL.C with runtime policies, we cannot query the policy hub at compile time | ||
| _CCCL_EXEC_CHECK_DISABLE | ||
| template <typename ArchPolicies, typename F> | ||
| _CCCL_API _CCCL_FORCEINLINE cudaError_t dispatch_arch(ArchPolicies arch_policies, ::cuda::arch_id device_arch, F&& f) | ||
| { | ||
| return f([&] { | ||
| return arch_policies(device_arch); | ||
| }); | ||
| } | ||
| #endif // !defined(CUB_DEFINE_RUNTIME_POLICIES) && !_CCCL_COMPILER(NVRTC) | ||
| } // namespace detail | ||
|
|
||
| CUB_NAMESPACE_END |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,150 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
|
||
| #include <cub/detail/arch_dispatch.cuh> | ||
|
|
||
| #include <cuda/std/__algorithm/find_if.h> | ||
|
|
||
| #include <c2h/catch2_test_helper.h> | ||
|
|
||
| #ifdef __CUDA_ARCH_LIST__ | ||
| # define CUDA_SM_LIST __CUDA_ARCH_LIST__ | ||
| # define CUDA_SM_LIST_SCALE 1 | ||
| #elif defined(NV_TARGET_SM_INTEGER_LIST) | ||
| # define CUDA_SM_LIST NV_TARGET_SM_INTEGER_LIST | ||
| # define CUDA_SM_LIST_SCALE 10 | ||
| #endif | ||
|
|
||
| using cuda::arch_id; | ||
|
|
||
| struct arch_policy | ||
| { | ||
| arch_id value; | ||
|
|
||
| _CCCL_API constexpr bool operator==(const arch_policy& other) const noexcept | ||
| { | ||
| return value == other.value; | ||
| } | ||
|
|
||
| _CCCL_API constexpr bool operator!=(const arch_policy& other) const noexcept | ||
| { | ||
| return value != other.value; | ||
| } | ||
| }; | ||
|
|
||
| struct arch_policies_all | ||
| { | ||
| _CCCL_API constexpr auto operator()(arch_id id) const -> arch_policy | ||
| { | ||
| return arch_policy{id}; | ||
| } | ||
| }; | ||
|
|
||
| #ifdef CUDA_SM_LIST | ||
| // check that the selected policy exactly matches one of (scaled) arches we compile for | ||
| template <arch_id SelectedPolicyArch, int... ArchList> | ||
| struct check | ||
| { | ||
| static_assert(((SelectedPolicyArch == arch_id{ArchList * CUDA_SM_LIST_SCALE / 10}) || ...)); | ||
| using type = cudaError_t; | ||
| }; | ||
| #endif // CUDA_SM_LIST | ||
|
|
||
| struct closure_all | ||
| { | ||
| arch_id id; | ||
|
|
||
| template <typename PolicyGetter> | ||
| CUB_RUNTIME_FUNCTION auto operator()(PolicyGetter policy_getter) const -> | ||
| #ifdef CUDA_SM_LIST | ||
| typename check<PolicyGetter{}().value, CUDA_SM_LIST>::type | ||
| #else // CUDA_SM_LIST | ||
| cudaError_t | ||
| #endif // CUDA_SM_LIST | ||
| { | ||
| constexpr arch_policy active_policy = policy_getter(); | ||
| // since an individual policy is generated per architecture, we can do an exact comparison here | ||
| REQUIRE(active_policy.value == id); | ||
| return cudaSuccess; | ||
| } | ||
| }; | ||
|
|
||
| C2H_TEST("dispatch_arch prunes based on __CUDA_ARCH_LIST__/NV_TARGET_SM_INTEGER_LIST", "[util][dispatch]") | ||
| { | ||
| #ifdef CUDA_SM_LIST | ||
| for (const int sm_val : {CUDA_SM_LIST}) | ||
| { | ||
| const auto id = arch_id{sm_val * CUDA_SM_LIST_SCALE / 10}; | ||
| #else | ||
| for (const arch_id id : cuda::__all_arch_ids()) | ||
| { | ||
| #endif | ||
| CHECK(cub::detail::dispatch_arch(arch_policies_all{}, id, closure_all{id}) == cudaSuccess); | ||
| } | ||
| } | ||
|
|
||
| template <int NumPolicies> | ||
| struct check_policy_closure | ||
| { | ||
| arch_id id; | ||
| cuda::std::array<arch_id, NumPolicies> policy_ids; | ||
|
|
||
| template <typename PolicyGetter> | ||
| CUB_RUNTIME_FUNCTION cudaError_t operator()(PolicyGetter policy_getter) const | ||
| { | ||
| constexpr arch_policy active_policy = policy_getter(); | ||
| CAPTURE(id, policy_ids); | ||
| const auto policy_arch = *cuda::std::find_if(policy_ids.rbegin(), policy_ids.rend(), [&](arch_id policy_ver) { | ||
| return policy_ver <= id; | ||
| }); | ||
| REQUIRE(active_policy.value == policy_arch); | ||
| return cudaSuccess; | ||
| } | ||
| }; | ||
|
|
||
| // distinct policies for 60+, 80+ and 100+ | ||
| struct arch_policies_some | ||
| { | ||
| _CCCL_API constexpr auto operator()(arch_id id) const -> arch_policy | ||
| { | ||
| if (id >= arch_id::sm_100) | ||
| { | ||
| return arch_policy{arch_id::sm_100}; | ||
| } | ||
| if (id >= arch_id::sm_80) | ||
| { | ||
| return arch_policy{arch_id::sm_80}; | ||
| } | ||
| // default is policy 60 | ||
| return arch_policy{arch_id::sm_60}; | ||
| } | ||
| }; | ||
|
|
||
| // only a single policy | ||
| struct arch_policies_minimal | ||
| { | ||
| _CCCL_API constexpr auto operator()(arch_id) const -> arch_policy | ||
| { | ||
| // default is policy 60 | ||
| return arch_policy{arch_id::sm_60}; | ||
| } | ||
| }; | ||
|
|
||
| C2H_TEST("dispatch_arch invokes correct policy", "[util][dispatch]") | ||
| { | ||
| #ifdef CUDA_SM_LIST | ||
| for (const int sm_val : {CUDA_SM_LIST}) | ||
| { | ||
| const auto id = arch_id{sm_val * CUDA_SM_LIST_SCALE / 10}; | ||
| #else | ||
| for (const arch_id id : cuda::__all_arch_ids()) | ||
| { | ||
| #endif | ||
| const auto closure_some = | ||
| check_policy_closure<3>{id, cuda::std::array<arch_id, 3>{arch_id::sm_60, arch_id::sm_80, arch_id::sm_100}}; | ||
| CHECK(cub::detail::dispatch_arch(arch_policies_some{}, id, closure_some) == cudaSuccess); | ||
|
|
||
| const auto closure_minimal = check_policy_closure<1>{id, cuda::std::array<arch_id, 1>{arch_id::sm_60}}; | ||
| CHECK(cub::detail::dispatch_arch(arch_policies_minimal{}, id, closure_minimal) == cudaSuccess); | ||
| } | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.