Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 27 additions & 29 deletions libcudacxx/include/cuda/__random/feistel_bijection.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,17 +37,11 @@ class __feistel_bijection
private:
static constexpr uint32_t __num_rounds = 24;

uint64_t __right_side_bits{};
uint64_t __left_side_bits{};
uint64_t __right_side_mask{};
uint64_t __left_side_mask{};
uint32_t __keys[__num_rounds] = {};

struct __decomposed
{
uint32_t __low;
uint32_t __high;
};
uint64_t __R_bits_{};
uint64_t __L_bits_{};
uint64_t __R_mask_{};
uint64_t __L_mask_{};
uint32_t __keys_[__num_rounds] = {};

public:
using index_type = uint64_t;
Expand All @@ -57,44 +51,48 @@ class __feistel_bijection
template <class _RNG>
_CCCL_API __feistel_bijection(uint64_t __num_elements, _RNG&& __gen)
{
const uint64_t __total_bits = static_cast<uint64_t>(::cuda::std::max(4, ::cuda::std::bit_width(__num_elements)));
const uint64_t __total_bits = static_cast<uint64_t>(::cuda::std::max(8, ::cuda::std::bit_width(__num_elements)));

// Half bits rounded down
__left_side_bits = __total_bits / 2;
__left_side_mask = (1ull << __left_side_bits) - 1;
__L_bits_ = __total_bits / 2;
__L_mask_ = (1ull << __L_bits_) - 1;
// Half the bits rounded up
__right_side_bits = __total_bits - __left_side_bits;
__right_side_mask = (1ull << __right_side_bits) - 1;
__R_bits_ = __total_bits - __L_bits_;
__R_mask_ = (1ull << __R_bits_) - 1;

::cuda::std::uniform_int_distribution<uint32_t> dist{};
::cuda::std::uniform_int_distribution<uint32_t> __dist{};
_CCCL_PRAGMA_UNROLL_FULL()
for (uint32_t i = 0; i < __num_rounds; i++)
{
__keys[i] = dist(__gen);
__keys_[i] = __dist(__gen);
}
}

[[nodiscard]] _CCCL_API constexpr uint64_t size() const noexcept
{
return 1ull << (__left_side_bits + __right_side_bits);
return 1ull << (__L_bits_ + __R_bits_);
}

[[nodiscard]] _CCCL_API constexpr uint64_t operator()(const uint64_t __val) const noexcept
{
__decomposed __state = {static_cast<uint32_t>(__val >> __right_side_bits),
static_cast<uint32_t>(__val & __right_side_mask)};
for (uint32_t i = 0; i < __num_rounds; i++)
// Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1
// (2022): 1-20.
uint32_t __L = static_cast<uint32_t>(__val >> __R_bits_);
uint32_t __R = static_cast<uint32_t>(__val & __R_mask_);
for (uint32_t __i = 0; __i < __num_rounds; __i++)
{
constexpr uint64_t __m0 = 0xD2B74407B1CE6E93;
const uint64_t __product = __m0 * __state.__high;
const uint32_t __high = static_cast<uint32_t>(__product >> 32);
uint32_t __low = static_cast<uint32_t>(__product);
__low = (__low << (__right_side_bits - __left_side_bits)) | __state.__low >> __left_side_bits;
__state.__high = ((__high ^ __keys[i]) ^ __state.__low) & __left_side_mask;
__state.__low = __low & __right_side_mask;
const uint64_t __product = __m0 * __L;
uint32_t __F_k = (__product >> 32) ^ __keys_[__i];
uint32_t __B_k = static_cast<uint32_t>(__product);
uint32_t __L_prime = __F_k ^ __R;

uint32_t __R_prime = (__B_k << (__R_bits_ - __L_bits_)) | __R >> __L_bits_;
__L = __L_prime & __L_mask_;
__R = __R_prime & __R_mask_;
}
// Combine the left and right sides together to get result
return (static_cast<uint64_t>(__state.__high) << __right_side_bits) | static_cast<uint64_t>(__state.__low);
return (static_cast<uint64_t>(__L) << __R_bits_) | static_cast<uint64_t>(__R);
}
};

Expand Down
Loading