57 changes: 28 additions & 29 deletions libcudacxx/include/cuda/__random/feistel_bijection.h
@@ -37,17 +37,11 @@ class __feistel_bijection
private:
static constexpr uint32_t __num_rounds = 24;

uint64_t __right_side_bits{};
uint64_t __left_side_bits{};
uint64_t __right_side_mask{};
uint64_t __left_side_mask{};
uint32_t __keys[__num_rounds] = {};

struct __decomposed
{
uint32_t __low;
uint32_t __high;
};
uint64_t __R_bits_{};
uint64_t __L_bits_{};
uint64_t __R_mask_{};
uint64_t __L_mask_{};
uint32_t __keys_[__num_rounds] = {};

public:
using index_type = uint64_t;
@@ -57,44 +51,49 @@ class __feistel_bijection
template <class _RNG>
_CCCL_API __feistel_bijection(uint64_t __num_elements, _RNG&& __gen)
{
const uint64_t __total_bits = (::cuda::std::max) (uint64_t{4}, ::cuda::std::bit_ceil(__num_elements));
const uint64_t __total_bits =
(::cuda::std::max) (uint64_t{8}, static_cast<uint64_t>(::cuda::std::bit_width(__num_elements)));
Contributor:

Suggested change
-    (::cuda::std::max) (uint64_t{8}, static_cast<uint64_t>(::cuda::std::bit_width(__num_elements)));
+    ::cuda::std::max(uint64_t{8}, static_cast<uint64_t>(::cuda::std::bit_width(__num_elements)));

Should be fine now :) you can leave it as is
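
For background on why the call is spelled (::cuda::std::max) (...) at all: parenthesizing a function name stops a function-like macro of the same name from expanding, the classic offender being the min/max macros from <windows.h> when NOMINMAX is not defined. A minimal illustration follows (plain C++, not taken from this PR; the macro below is a stand-in):

// Illustrative only, not part of this PR: a function-like macro named max,
// e.g. from <windows.h> without NOMINMAX, breaks the unparenthesized call.
#include <algorithm>

#define max(a, b) ((a) > (b) ? (a) : (b)) // stand-in for the problematic macro

int main()
{
  int ok = (std::max)(3, 7); // parentheses suppress macro expansion; this calls std::max
  // int bad = std::max(3, 7); // the macro would expand here and the line would not compile
  return ok == 7 ? 0 : 1;
}

Either spelling calls the same function; the parentheses only matter when such a macro is in scope.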


// Half bits rounded down
__left_side_bits = __total_bits / 2;
__left_side_mask = (1ull << __left_side_bits) - 1;
__L_bits_ = __total_bits / 2;
__L_mask_ = (1ull << __L_bits_) - 1;
// Half the bits rounded up
__right_side_bits = __total_bits - __left_side_bits;
__right_side_mask = (1ull << __right_side_bits) - 1;
__R_bits_ = __total_bits - __L_bits_;
__R_mask_ = (1ull << __R_bits_) - 1;

::cuda::std::uniform_int_distribution<uint32_t> dist{};
::cuda::std::uniform_int_distribution<uint32_t> __dist{};
_CCCL_PRAGMA_UNROLL_FULL()
for (uint32_t i = 0; i < __num_rounds; i++)
{
__keys[i] = dist(__gen);
__keys_[i] = __dist(__gen);
}
}

[[nodiscard]] _CCCL_API constexpr uint64_t size() const noexcept
{
return 1ull << (__left_side_bits + __right_side_bits);
return 1ull << (__L_bits_ + __R_bits_);
}

[[nodiscard]] _CCCL_API constexpr uint64_t operator()(const uint64_t __val) const noexcept
{
__decomposed __state = {static_cast<uint32_t>(__val >> __right_side_bits),
static_cast<uint32_t>(__val & __right_side_mask)};
for (uint32_t i = 0; i < __num_rounds; i++)
// Mitchell, Rory, et al. "Bandwidth-optimal random shuffling for GPUs." ACM Transactions on Parallel Computing 9.1
// (2022): 1-20.
uint32_t __L = __val >> __R_bits_;
uint32_t __R = __val & __R_mask_;
Member:

Q: Is this part the actual fix, that we swap the definitions of the original high and low? Otherwise the rest looks the same to me.

Contributor Author:

Actually there was a significant bug. The high and low values were being initialised swapped from what they should have been. This meant that with an odd number of bits one of the bits was getting dropped; as a consequence the tests would run forever because the bijection was incorrect (i.e. values collided).

I wrote it from scratch again to find the bug as I couldn't find it at first! I relabeled everything with the notation from the paper, which is less confusing to me.

(A standalone sketch of the corrected round function, with a brute-force bijectivity check, follows the diff below.)

for (uint32_t __i = 0; __i < __num_rounds; __i++)
{
constexpr uint64_t __m0 = 0xD2B74407B1CE6E93;
const uint64_t __product = __m0 * __state.__high;
const uint32_t __high = static_cast<uint32_t>(__product >> 32);
uint32_t __low = static_cast<uint32_t>(__product);
__low = (__low << (__right_side_bits - __left_side_bits)) | __state.__low >> __left_side_bits;
__state.__high = ((__high ^ __keys[i]) ^ __state.__low) & __left_side_mask;
__state.__low = __low & __right_side_mask;
const uint64_t __product = __m0 * __L;
uint32_t __F_k = (__product >> 32) ^ __keys_[__i];
uint32_t __B_k = static_cast<uint32_t>(__product);
uint32_t __L_prime = __F_k ^ __R;

uint32_t __R_prime = (__B_k << (__R_bits_ - __L_bits_)) | __R >> __L_bits_;
__L = __L_prime & __L_mask_;
__R = __R_prime & __R_mask_;
}
// Combine the left and right sides together to get result
return (static_cast<uint64_t>(__state.__high) << __right_side_bits) | static_cast<uint64_t>(__state.__low);
return (static_cast<uint64_t>(__L) << __R_bits_) | static_cast<uint64_t>(__R);
}
};
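
To make the bug discussion in the review thread above concrete, below is a standalone, host-only sketch of the same balanced Feistel construction, reassembled from the new code in this diff. It is plain C++20 (std::bit_width needs <bit>), the names toy_feistel, num_rounds and so on are illustrative rather than the library's, and it is a sketch of the idea, not the libcu++ implementation. It uses the corrected split, where L takes the high (narrower) half and R takes the low (wider) half, and then brute-forces a domain with an odd total bit count to confirm that no two inputs map to the same output.

// Standalone sketch (plain C++20, no CUDA headers) of the Feistel bijection above.
// Names are illustrative; this is not the library implementation.
#include <algorithm>
#include <bit>
#include <cstdint>
#include <iostream>
#include <random>
#include <set>

struct toy_feistel
{
  static constexpr std::uint32_t num_rounds = 24;

  std::uint64_t L_bits, R_bits, L_mask, R_mask;
  std::uint32_t keys[num_rounds];

  toy_feistel(std::uint64_t num_elements, std::mt19937& gen)
  {
    const std::uint64_t total_bits =
      std::max<std::uint64_t>(8, std::bit_width(num_elements));
    L_bits = total_bits / 2;      // half the bits, rounded down
    L_mask = (1ull << L_bits) - 1;
    R_bits = total_bits - L_bits; // half the bits, rounded up
    R_mask = (1ull << R_bits) - 1;
    std::uniform_int_distribution<std::uint32_t> dist;
    for (auto& k : keys)
    {
      k = dist(gen);
    }
  }

  std::uint64_t size() const
  {
    return 1ull << (L_bits + R_bits);
  }

  std::uint64_t operator()(std::uint64_t val) const
  {
    // Corrected split: L is the (possibly narrower) high half, R the low half.
    // Initialising these the other way round silently loses a bit when total_bits is odd.
    std::uint32_t L = static_cast<std::uint32_t>(val >> R_bits);
    std::uint32_t R = static_cast<std::uint32_t>(val & R_mask);
    for (std::uint32_t i = 0; i < num_rounds; ++i)
    {
      constexpr std::uint64_t m0 = 0xD2B74407B1CE6E93ull;
      const std::uint64_t product = m0 * L;
      const std::uint32_t F_k     = static_cast<std::uint32_t>(product >> 32) ^ keys[i]; // keyed round function of L
      const std::uint32_t B_k     = static_cast<std::uint32_t>(product);
      const std::uint32_t L_next  = F_k ^ R;                                    // round function mixed into R
      const std::uint32_t R_next  = (B_k << (R_bits - L_bits)) | (R >> L_bits); // rotate in low bits of the product
      L = L_next & L_mask;
      R = R_next & R_mask;
    }
    return (static_cast<std::uint64_t>(L) << R_bits) | R;
  }
};

int main()
{
  std::mt19937 gen{42};
  toy_feistel f(300, gen); // bit_width(300) == 9, so total_bits is odd: L_bits == 4, R_bits == 5
  std::set<std::uint64_t> outputs;
  for (std::uint64_t i = 0; i < f.size(); ++i)
  {
    outputs.insert(f(i)); // a bijection over [0, size()) must never repeat a value
  }
  std::cout << (outputs.size() == f.size() ? "bijective\n" : "collision found\n");
}

Swapping the two initialisations (L = val & R_mask; R = val >> R_bits), which is effectively what the old code did, should make the check report a collision for this odd-width case, matching the behaviour the author describes.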
