Skip to content

Commit

Permalink
#5 Added preliminary GPU version, which has branch-free critical loop
Browse files Browse the repository at this point in the history
  • Loading branch information
carljohnsen committed Mar 7, 2024
1 parent d324edc commit 87b9da0
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 1 deletion.
109 changes: 109 additions & 0 deletions src/lib/cpp/gpu/diffusion.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#include "diffusion.hh"

namespace gpu {

void diffusion_core(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int64_t dim, const int64_t radius, const int64_t padding) {
#pragma omp target teams distribute parallel for collapse(3) device(omp_get_thread_num() % n_devices)
for (int64_t i = 0; i < N.z+padding; i++) {
for (int64_t j = 0; j < N.y+padding; j++) {
for (int64_t k = 0; k < N.x+padding; k++) {
const int64_t
X[3] = {i, j, k},
stride[3] = {(N.y+padding)*(N.x+padding), N.x+padding, 1},
Ns[3] = {N.z+padding, N.y+padding, N.x+padding},
ranges[2] = {
-std::min(radius, X[dim]), std::min(radius, Ns[dim]-X[dim]-1)
},
output_index = i*stride[0] + j*stride[1] + k*stride[2];

float sum = 0.0f;

for (int64_t r = -radius; r <= radius; r++) {
const int64_t input_index = output_index + r*stride[dim];
// If the loop ranges guards the accesses
//float val = input[input_index];
//sum += val * kernel[radius+r];

// Branch free version
bool cond = r >= ranges[0] && r <= ranges[1];
int32_t mask = -cond;
mask &= mask >> 31;
int64_t index = (mask & input_index) | (~mask & output_index);
float val = input[index];

// Branch free version - pointer casting
int32_t *vali = (int32_t*) &val;
*vali &= mask;

// Branch free version - memcpy
//int32_t vali;
//std::memcpy(&vali, &val, sizeof(val));
//vali &= mask;
//std::memcpy(&val, &vali, sizeof(val));

// Branch free version - union
//raw32 val32;
//val32.f = val;
//val32.i &= mask;
//val = val32.f;

sum += val * kernel[radius+r];

// Original:
//float val = cond ? input[input_index] : 0.0f;
//sum += val * kernel[radius+r];
}

output[output_index] = sum;
}
}
}
}

void convert_float_to_uint8(const float *__restrict__ src, uint8_t *__restrict__ dst, const int64_t total_flat_size) {
#pragma omp target teams distribute parallel for device(omp_get_thread_num() % n_devices)
for (int64_t i = 0; i < total_flat_size; i++) {
dst[i] = (uint8_t) std::floor(src[i] * 255.0f);
}
}

void convert_uint8_to_float(const uint8_t *__restrict__ src, float *__restrict__ dst, const int64_t total_flat_size) {
#pragma omp target teams distribute parallel for device(omp_get_thread_num() % n_devices)
for (int64_t i = 0; i < total_flat_size; i++) {
dst[i] = src[i] > 0 ? 1.0f : 0.0f;
}
}

void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const int64_t local_flat_size) {
#pragma omp target teams distribute parallel for device(omp_get_thread_num() % n_devices)
for (int64_t i = 0; i < local_flat_size; i++) {
if (mask[i] > 0) {
output[i] = 1.0f;
}
}
}

void diffusion_in_memory(const uint8_t *__restrict__ voxels, const shape_t &N, const float *__restrict__ kernel, const int64_t kernel_size, const int64_t repititions, uint8_t *__restrict__ output) {
const int64_t
total_size = N.z*N.y*N.x,
radius = kernel_size / 2;
float **buffers = new float*[2];
buffers[0] = new float[total_size];
buffers[1] = new float[total_size];

convert_uint8_to_float(voxels, buffers[0], total_size);
for (int64_t rep = 0; rep < repititions; rep++) {
for (int64_t dim = 0; dim < 3; dim++) {
diffusion_core(buffers[0], kernel, buffers[1], N, dim, radius, 0);
std::swap(buffers[0], buffers[1]);
}
illuminate(voxels, buffers[0], total_size);
}
convert_float_to_uint8(buffers[0], output, total_size);

delete[] buffers[0];
delete[] buffers[1];
delete[] buffers;
}

}
11 changes: 10 additions & 1 deletion src/lib/cpp/include/datatypes.hh
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,17 @@ typedef std::array<real_t,4> vector4;
typedef std::array<real_t,9> matrix3x3;
typedef std::array<real_t,3> vector3;

// Struct for holding the shape of a 3D array
typedef struct {
int64_t z, y, x;
int64_t z, y, x;
} shape_t;

// Struct for accessing the raw bits of a 32-bit float
typedef struct {
union {
float f;
int32_t i;
};
} raw32_t;

#endif

0 comments on commit 87b9da0

Please sign in to comment.