reduce page access violations and apparent Gamma_LU min_max failures
PDoakORNL committed Apr 10, 2024
1 parent 1347b39 commit 9f01b16
Showing 4 changed files with 57 additions and 1 deletion.
3 changes: 3 additions & 0 deletions cmake/dca_hip.cmake
@@ -74,6 +74,9 @@ if (CMAKE_HIP_COMPILER)
set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
set(CMAKE_HIP_STANDARD 17)
list(APPEND HIP_HIPCC_FLAGS "-fPIC")
list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")

# doesn't appear to work
set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
message("Enabled HIP as a language")
49 changes: 48 additions & 1 deletion include/dca/linalg/util/atomic_add_cuda.cu.hpp
@@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

#elif defined(DCA_HAVE_HIP)
// HIP's native floating-point atomics appear unreliable under concurrent updates;
// fall back to CAS-based implementations.
__device__ double inline atomicAddImpl(double* address, const double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}

__device__ float inline atomicAddImpl(float* address, const float val) {
int* address_as_int = (int*)address;
int old = *address_as_int, assumed;
do {
assumed = old;
old = atomicCAS(address_as_int, assumed,
__float_as_int(val + __int_as_float(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __int_as_float(old);
}

__device__ void inline atomicAdd(float* address, const float val) {
atomicAddImpl(address, val);
}

__device__ void inline atomicAdd(double* address, const double val) {
atomicAddImpl(address, val);
}

__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
double* a_d = reinterpret_cast<double*>(address);
atomicAddImpl(a_d, val.x);
atomicAddImpl(a_d + 1, val.y);
}

__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
float* a_f = reinterpret_cast<float*>(address);
atomicAddImpl(a_f, val.x);
atomicAddImpl(a_f + 1, val.y);
}

#else
__device__ void inline atomicAdd(double* address, double val) {
::atomicAdd(address, val);
@@ -62,8 +108,9 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
atomicAdd(a_d, val.x);
atomicAdd(a_d + 1, val.y);
}
#endif // __CUDA_ARCH__
#endif // atomic operation help


} // linalg
} // dca
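Taken together, the HIP branch above routes float, double, and complex accumulation through CAS loops instead of HIP's native floating-point atomics. A hypothetical usage sketch, assuming this header is included from a .cu/.hip translation unit (accumulate_bins and the bin count are illustrative, not part of this commit):

// Many threads contend on a few bins; correctness relies on the CAS-based
// dca::linalg::atomicAdd overloads above rather than hardware FP atomics.
__global__ void accumulate_bins(cuDoubleComplex* bins, const double* w, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    cuDoubleComplex v = make_cuDoubleComplex(w[i], -w[i]);
    dca::linalg::atomicAdd(&bins[i % 8], v);
  }
}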

2 changes: 2 additions & 0 deletions include/dca/math/nfft/dnfft_1d_gpu.hpp
@@ -224,6 +224,8 @@ void Dnfft1DGpu<Scalar, WDmn, RDmn, oversampling, CUBIC>::accumulate(
config_left_dev_.setAsync(config_left_, stream_);
times_dev_.setAsync(times_, stream_);

//hipStreamSynchronize(stream_.streamActually());

details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, Scalar, Real>(
M.ptr(), M.leadingDimension(), factor, accumulation_matrix_.ptr(),
accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
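The commented-out hipStreamSynchronize records the ordering question this hunk raises: setAsync enqueues the host-to-device copies on stream_, and accumulateOnDevice launches on that same stream, so single-stream FIFO ordering already guarantees the kernel sees the copied data; a host-side synchronize would only be needed if the CPU had to read the results. A self-contained sketch of that single-stream ordering argument (use_config and run are illustrative names):

#include <hip/hip_runtime.h>

__global__ void use_config(const int* cfg, int* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = 2 * cfg[i];  // safe: the copy below completed first on this stream
}

void run(const int* cfg_host, int* cfg_dev, int* out_dev, int n, hipStream_t s) {
  // Work on one stream executes in the order it was enqueued, so the kernel
  // cannot observe cfg_dev before the async copy into it finishes.
  hipMemcpyAsync(cfg_dev, cfg_host, n * sizeof(int), hipMemcpyHostToDevice, s);
  use_config<<<(n + 255) / 256, 256, 0, s>>>(cfg_dev, out_dev, n);
}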
4 changes: 4 additions & 0 deletions
@@ -1012,11 +1012,15 @@ void CtauxWalker<device_t, Parameters, Data>::read_Gamma_matrices(e_spin_states
case e_DN:
CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
// There is no guarantee the kernel above finishes before the async copy of Gamma starts; sync the stream.
linalg::util::syncStream(thread_id, stream_id);
break;

case e_UP:
CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
// There is no guarantee the kernel above finishes before the async copy of Gamma starts; sync the stream.
linalg::util::syncStream(thread_id, stream_id);
break;

default:
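The added syncStream calls close a cross-stream race: compute_Gamma launches on the walker's (thread_id, stream_id) stream, and nothing otherwise stops a later copy of Gamma, enqueued on a different stream, from starting while the kernel is still writing. A hypothetical reduction of the pattern (compute_gamma and copy_out are illustrative names; the real syncStream amounts to the hipStreamSynchronize below):

#include <hip/hip_runtime.h>

__global__ void compute_gamma(double* gamma, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    gamma[i] = 1.0 / (1.0 + i);  // stand-in for the real Gamma computation
}

void copy_out(double* gamma_host, double* gamma_dev, int n,
              hipStream_t compute_stream, hipStream_t copy_stream) {
  compute_gamma<<<(n + 255) / 256, 256, 0, compute_stream>>>(gamma_dev, n);
  // Without this, the copy on copy_stream may read gamma_dev mid-update.
  hipStreamSynchronize(compute_stream);
  hipMemcpyAsync(gamma_host, gamma_dev, n * sizeof(double),
                 hipMemcpyDeviceToHost, copy_stream);
}

A hipStreamWaitEvent on copy_stream would order the two streams without blocking the host, at the cost of extra event bookkeeping.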
