-
Notifications
You must be signed in to change notification settings - Fork 321
Fixes for shuffle_iterator #7130
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
b0b19cd
337ac5e
70dcfb9
ea08bdf
df76e11
013af58
ff25c3b
013e0d3
6d6545e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,17 +37,11 @@ class __feistel_bijection | |
| private: | ||
| static constexpr uint32_t __num_rounds = 24; | ||
|
|
||
| uint64_t __right_side_bits{}; | ||
| uint64_t __left_side_bits{}; | ||
| uint64_t __right_side_mask{}; | ||
| uint64_t __left_side_mask{}; | ||
| uint32_t __keys[__num_rounds] = {}; | ||
|
|
||
| struct __decomposed | ||
| { | ||
| uint32_t __low; | ||
| uint32_t __high; | ||
| }; | ||
| uint64_t __R_bits_{}; | ||
| uint64_t __L_bits_{}; | ||
| uint64_t __R_mask_{}; | ||
| uint64_t __L_mask_{}; | ||
| uint32_t __keys_[__num_rounds] = {}; | ||
|
|
||
| public: | ||
| using index_type = uint64_t; | ||
|
|
@@ -57,44 +51,49 @@ class __feistel_bijection | |
| template <class _RNG> | ||
| _CCCL_API __feistel_bijection(uint64_t __num_elements, _RNG&& __gen) | ||
| { | ||
| const uint64_t __total_bits = (::cuda::std::max) (uint64_t{4}, ::cuda::std::bit_ceil(__num_elements)); | ||
| const uint64_t __total_bits = | ||
| (::cuda::std::max) (uint64_t{8}, static_cast<uint64_t>(::cuda::std::bit_width(__num_elements))); | ||
|
|
||
| // Half bits rounded down | ||
| __left_side_bits = __total_bits / 2; | ||
| __left_side_mask = (1ull << __left_side_bits) - 1; | ||
| __L_bits_ = __total_bits / 2; | ||
| __L_mask_ = (1ull << __L_bits_) - 1; | ||
| // Half the bits rounded up | ||
| __right_side_bits = __total_bits - __left_side_bits; | ||
| __right_side_mask = (1ull << __right_side_bits) - 1; | ||
| __R_bits_ = __total_bits - __L_bits_; | ||
| __R_mask_ = (1ull << __R_bits_) - 1; | ||
|
|
||
| ::cuda::std::uniform_int_distribution<uint32_t> dist{}; | ||
| ::cuda::std::uniform_int_distribution<uint32_t> __dist{}; | ||
| _CCCL_PRAGMA_UNROLL_FULL() | ||
| for (uint32_t i = 0; i < __num_rounds; i++) | ||
| { | ||
| __keys[i] = dist(__gen); | ||
| __keys_[i] = __dist(__gen); | ||
| } | ||
| } | ||
|
|
||
| [[nodiscard]] _CCCL_API constexpr uint64_t size() const noexcept | ||
| { | ||
| return 1ull << (__left_side_bits + __right_side_bits); | ||
| return 1ull << (__L_bits_ + __R_bits_); | ||
| } | ||
|
|
||
| [[nodiscard]] _CCCL_API constexpr uint64_t operator()(const uint64_t __val) const noexcept | ||
| { | ||
| __decomposed __state = {static_cast<uint32_t>(__val >> __right_side_bits), | ||
| static_cast<uint32_t>(__val & __right_side_mask)}; | ||
| for (uint32_t i = 0; i < __num_rounds; i++) | ||
| // Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1 | ||
| // (2022): 1-20. | ||
| uint32_t __L = __val >> __R_bits_; | ||
| uint32_t __R = __val & __R_mask_; | ||
|
||
| for (uint32_t __i = 0; __i < __num_rounds; __i++) | ||
| { | ||
| constexpr uint64_t __m0 = 0xD2B74407B1CE6E93; | ||
| const uint64_t __product = __m0 * __state.__high; | ||
| const uint32_t __high = static_cast<uint32_t>(__product >> 32); | ||
| uint32_t __low = static_cast<uint32_t>(__product); | ||
| __low = (__low << (__right_side_bits - __left_side_bits)) | __state.__low >> __left_side_bits; | ||
| __state.__high = ((__high ^ __keys[i]) ^ __state.__low) & __left_side_mask; | ||
| __state.__low = __low & __right_side_mask; | ||
| const uint64_t __product = __m0 * __L; | ||
| uint32_t __F_k = (__product >> 32) ^ __keys_[__i]; | ||
| uint32_t __B_k = static_cast<uint32_t>(__product); | ||
| uint32_t __L_prime = __F_k ^ __R; | ||
|
|
||
| uint32_t __R_prime = (__B_k << (__R_bits_ - __L_bits_)) | __R >> __L_bits_; | ||
| __L = __L_prime & __L_mask_; | ||
| __R = __R_prime & __R_mask_; | ||
| } | ||
| // Combine the left and right sides together to get result | ||
| return (static_cast<uint64_t>(__state.__high) << __right_side_bits) | static_cast<uint64_t>(__state.__low); | ||
| return (static_cast<uint64_t>(__L) << __R_bits_) | static_cast<uint64_t>(__R); | ||
| } | ||
| }; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should be fine now :) you can leave it as is