Commit: added comments for review
DiamonDinoia committed Jul 26, 2024
1 parent b3237f7 commit db80aad
Showing 5 changed files with 48 additions and 22 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
@@ -47,6 +47,7 @@ V 2.3.0beta (7/21/24)
precision denormals to 0 and using fma where possible.
* cuFINUFFT using intrinsics in foldrescale and other places to increase
performance
+ * cuFINUFFT using SM90 float2 vector atomicAdd where supported

V 2.2.0 (12/12/23)

1 change: 1 addition & 0 deletions include/cufinufft/impl.h
@@ -60,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
Variables and arrays inside the plan struct are set and allocated.
Melody Shih 07/25/19. User-facing moved to markdown, Barnett 2/16/21.
+ Marco Barbone 07/26/24. Use SM when the available shared memory is sufficient.
*/
int ier;
cuDoubleComplex *d_a = nullptr; // fseries temp data
24 changes: 17 additions & 7 deletions include/cufinufft/spreadinterp.h
@@ -12,32 +12,42 @@ namespace spreadinterp {
template<typename T>
static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) {
if constexpr (std::is_same_v<T, float>) {
+ // fused multiply-add, round to nearest even
return __fmaf_rn(a, b, c);
} else if constexpr (std::is_same_v<T, double>) {
+ // fused multiply-add, round to nearest even
return __fma_rn(a, b, c);
}
static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
"Only float and double are supported.");
return T{0};
}

- template<typename T, typename V>
- constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) {
+ template<typename T>
+ constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) {
constexpr auto x2pi = T(0.159154943091895345554011992339482617);
constexpr auto half = T(0.5);
#if defined(__CUDA_ARCH__)
if constexpr (std::is_same_v<T, float>) {
- const auto result = fma(x, x2pi, half);
- return (result - floorf(result)) * static_cast<T>(N);
+ // fused multiply-add, round to nearest even
+ auto result = __fmaf_rn(x, x2pi, half);
+ // subtract, round down
+ result = __fsub_rd(result, floorf(result));
+ // multiply, round down
+ return __fmul_rd(result, static_cast<T>(N));
} else if constexpr (std::is_same_v<T, double>) {
- const auto result = fma(x, x2pi, half);
- return (result - floor(result)) * static_cast<T>(N);
+ // fused multiply-add, round to nearest even
+ auto result = __fma_rn(x, x2pi, half);
+ // subtract, round down
+ result = __dsub_rd(result, floor(result));
+ // multiply, round down
+ return __dmul_rd(result, static_cast<T>(N));
} else {
static_assert(std::is_same_v<T, float> || std::is_same_v<T, double>,
"Only float and double are supported.");
}
#else
- const auto result = fma(x, x2pi, half);
+ const auto result = std::fma(x, x2pi, half);
return (result - std::floor(result)) * static_cast<T>(N);
#endif
}
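For intuition, fold_rescale treats x as 2*pi-periodic and maps it into the grid range [0, N): -pi and pi both land on 0, and 0 lands on N/2. A minimal host-side sketch of the same formula (fold_rescale_host is a hypothetical name, illustrative only):

#include <cmath>
#include <cstdio>

template<typename T> T fold_rescale_host(T x, int N) {
  constexpr T x2pi = T(0.159154943091895345554011992339482617); // 1/(2*pi)
  const T result = std::fma(x, x2pi, T(0.5));
  return (result - std::floor(result)) * static_cast<T>(N);
}

int main() {
  const double pi = std::acos(-1.0);
  // prints 0 50 0: -pi -> 0, 0 -> N/2, and 3*pi wraps to 0 (3*pi = pi mod 2*pi)
  std::printf("%g %g %g\n", fold_rescale_host(-pi, 100),
              fold_rescale_host(0.0, 100), fold_rescale_host(3 * pi, 100));
}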
25 changes: 20 additions & 5 deletions include/cufinufft/utils.h
@@ -74,11 +74,14 @@ template<typename T> T infnorm(int n, std::complex<T> *a) {

#ifdef __CUDA_ARCH__
__forceinline__ __device__ auto interval(const int ns, const float x) {
+ // float to int round up and fused multiply-add to round up
const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x));
- const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x));
+ // float to int round down and fused multiply-add to round down
+ const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x));
return int2{xstart, xend};
}
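For intuition, the directed rounding yields the inclusive range of grid indices within half a kernel width of x. A host-side sketch of the same bounds (illustrative only, outside CUDA):

#include <cmath>
#include <cstdio>

int main() {
  const int ns = 7;      // kernel width in grid points
  const float x = 10.3f; // nonuniform point location, in grid units
  const int xstart = static_cast<int>(std::ceil(x - 0.5f * ns));  // 7
  const int xend = static_cast<int>(std::floor(x + 0.5f * ns));   // 13
  std::printf("grid indices %d..%d (%d points)\n", xstart, xend, xend - xstart + 1);
}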
__forceinline__ __device__ auto interval(const int ns, const double x) {
+ // same as above
const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x));
const auto xend = __double2int_rd(__fma_rd(ns, .5, x));
return int2{xstart, xend};
@@ -107,17 +110,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x) {
#define COMPUTE_CAPABILITY_90_OR_HIGHER 0
#endif

+ /**
+  * does a complex atomic add on a shared memory address
+  * it adds the real and imaginary parts separately,
+  * as CUDA does not directly support atomic operations
+  * on complex numbers in shared memory
+  */

template<typename T>
- static __forceinline__ __device__ void atomicAddComplexShared(
- cuda_complex<T> *address, cuda_complex<T> res) {
+ static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex<T> *address,
+ cuda_complex<T> res) {
const auto raw_address = reinterpret_cast<T *>(address);
atomicAdd(raw_address, res.x);
atomicAdd(raw_address + 1, res.y);
}

+ /**
+  * does a complex atomic add on a global memory address;
+  * since compute capability 9.0 (SM90), vector atomicAdd on float2
+  * in global memory is supported, so we leverage it where available
+  */
template<typename T>
- static __forceinline__ __device__ void atomicAddComplexGlobal(
- cuda_complex<T> *address, cuda_complex<T> res) {
+ static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex<T> *address,
+ cuda_complex<T> res) {
if constexpr (
std::is_same_v<cuda_complex<T>, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) {
atomicAdd(address, res);
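The tail of this function is collapsed in the diff. A sketch of what the full version presumably looks like, assuming the non-SM90 path falls back to the same two scalar atomicAdds as the shared-memory variant (the _sketch suffix marks it as hypothetical):

template<typename T>
static __forceinline__ __device__ void atomicAddComplexGlobal_sketch(
    cuda_complex<T> *address, cuda_complex<T> res) {
  if constexpr (std::is_same_v<cuda_complex<T>, float2> &&
                COMPUTE_CAPABILITY_90_OR_HIGHER) {
    // SM90+ exposes a single vectorized float2 atomicAdd on global memory
    atomicAdd(address, res);
  } else {
    // fallback: two scalar atomics on the real and imaginary parts
    const auto raw_address = reinterpret_cast<T *>(address);
    atomicAdd(raw_address, res.x);
    atomicAdd(raw_address + 1, res.y);
  }
}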
19 changes: 9 additions & 10 deletions src/cuda/common.cu
@@ -202,8 +202,7 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex<double
template<typename T>
std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y,
int bin_size_z) {
// printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns,
// bin_size_x, bin_size_y, bin_size_z);
// Helper to compute the shared memory required for the spreader when using SM
int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2;

if (dim == 1) {
@@ -221,17 +220,18 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size
return adjusted_ns * sizeof(cuda_complex<T>);
}
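As a worked 1D example of the padding formula above: ns = 7 gives ((7 + 1) / 2) * 2 = 8 padding points, so bin_size_x = 32 yields adjusted_ns = 40, and the 1D case needs 40 * sizeof(cuda_complex<T>) bytes, i.e. 320 bytes in single precision.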

- // Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize
- template<typename T> int find_bin_size(std::size_t MemSize, int dim, int ns) {
+ // Function to find bin_size_x == bin_size_y
+ // where bin_size_x * bin_size_y * bin_size_z < mem_size
+ // TODO: this can be done without a loop by using a direct formula
+ template<typename T> int find_bin_size(std::size_t mem_size, int dim, int ns) {
int binsize = 1; // Start with the smallest possible bin size

while (true) {
- // Calculate the shared memory required for the current bin_size_x and bin_size_y
std::size_t required_memory =
shared_memory_required<T>(dim, ns, binsize, binsize, binsize);

// Check if the required memory is less than the available memory
- if (required_memory > MemSize) {
+ if (required_memory > mem_size) {
// If the condition is met, return the current bin_size_x
return binsize - 1;
}
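The TODO above hints at a closed-form replacement for this loop. A rough sketch, assuming the shared-memory cost generalizes to (binsize + padding)^dim complex values (the 2D/3D branches of shared_memory_required are collapsed above, so that scaling is an assumption; find_bin_size_direct is a hypothetical name):

#include <cmath>
#include <cstddef>

template<typename T>
int find_bin_size_direct(std::size_t mem_size, int dim, int ns) {
  const int padding = ((ns + 1) / 2) * 2;
  const double max_elems =
      static_cast<double>(mem_size) / sizeof(cuda_complex<T>);
  // largest binsize such that (binsize + padding)^dim complex values fit
  return static_cast<int>(std::floor(std::pow(max_elems, 1.0 / dim))) - padding;
}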
@@ -243,6 +243,9 @@ template<typename T> int find_bin_size(std::size_t MemSize, int dim, int ns) {

template<typename T>
void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
+ // Marco Barbone 07/26/24. Using the shared memory available on the device to
+ // determine the optimal binsize for the spreader.
+ // TODO: This can still be improved; some sizes are hardcoded.
int shared_mem_per_block{}, device_id{};
switch (dim) {
case 1: {
@@ -290,10 +293,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) {
} break;
}
}
- // const auto shared_mem_required = shared_memory_required<T>(
- // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez);
- // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n",
- // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required);
opts->gpu_binsizez = 1;
} break;
case 3: {
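For reference, the per-device limit that the comment in cufinufft_setup_binsize refers to can be queried through the CUDA runtime; whether the plan uses exactly this attribute is an assumption. A minimal sketch:

#include <cuda_runtime.h>

int query_shared_mem_per_block() {
  int shared_mem_per_block{}, device_id{};
  cudaGetDevice(&device_id);
  // opt-in per-block shared memory limit, in bytes
  cudaDeviceGetAttribute(&shared_mem_per_block,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
  return shared_mem_per_block;
}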
