diff --git a/cmake/dca_hip.cmake b/cmake/dca_hip.cmake
index 2d3e0ebb3..4ba855a03 100644
--- a/cmake/dca_hip.cmake
+++ b/cmake/dca_hip.cmake
@@ -74,6 +74,9 @@ if (CMAKE_HIP_COMPILER)
   set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
   set(CMAKE_HIP_STANDARD 17)
   list(APPEND HIP_HIPCC_FLAGS "-fPIC")
+  list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
+  list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
+  # the per-thread default stream flag doesn't appear to take effect
   set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
   message("Enabled HIP as a language")

diff --git a/include/dca/linalg/util/atomic_add_cuda.cu.hpp b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
index da7b35dc3..f0bd9c134 100644
--- a/include/dca/linalg/util/atomic_add_cuda.cu.hpp
+++ b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
@@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
   atomicAddImpl(address, val);
 }

+#elif defined(DCA_HAVE_HIP)
+// HIP's native floating-point atomicAdd appears to misbehave under concurrent updates,
+// so fall back to a compare-and-swap loop.
+__device__ double inline atomicAddImpl(double* address, const double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+    // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN).
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
+
+__device__ float inline atomicAddImpl(float* address, const float val) {
+  int* address_as_int = (int*)address;
+  int old = *address_as_int, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_int, assumed,
+                    __float_as_int(val + __int_as_float(assumed)));
+    // Note: uses integer comparison to avoid a hang in case of NaN (since NaN != NaN).
+  } while (assumed != old);
+  return __int_as_float(old);
+}
+
+__device__ void inline atomicAdd(float* address, const float val) {
+  atomicAddImpl(address, val);
+}
+
+__device__ void inline atomicAdd(double* address, const double val) {
+  atomicAddImpl(address, val);
+}
+
+__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
+  double* a_d = reinterpret_cast<double*>(address);
+  atomicAddImpl(a_d, val.x);
+  atomicAddImpl(a_d + 1, val.y);
+}
+
+__device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
+  float* a_f = reinterpret_cast<float*>(address);
+  atomicAddImpl(a_f, val.x);
+  atomicAddImpl(a_f + 1, val.y);
+}
+
 #else
 __device__ void inline atomicAdd(double* address, double val) {
   ::atomicAdd(address, val);
@@ -62,8 +108,9 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
   atomicAdd(a_d, val.x);
   atomicAdd(a_d + 1, val.y);
 }
-#endif  // __CUDA_ARCH__
+#endif  // atomicAdd implementations
+
 } // linalg
 } // dca

diff --git a/include/dca/math/nfft/dnfft_1d_gpu.hpp b/include/dca/math/nfft/dnfft_1d_gpu.hpp
index 0cae41164..587416459 100644
--- a/include/dca/math/nfft/dnfft_1d_gpu.hpp
+++ b/include/dca/math/nfft/dnfft_1d_gpu.hpp
@@ -224,6 +224,8 @@ void Dnfft1DGpu::accumulate(
   config_left_dev_.setAsync(config_left_, stream_);
   times_dev_.setAsync(times_, stream_);

+  //hipStreamSynchronize(stream_.streamActually());
+
   details::accumulateOnDevice(
       M.ptr(), M.leadingDimension(), factor, accumulation_matrix_.ptr(),
       accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
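For context on the HIP branch added in `atomic_add_cuda.cu.hpp`: the loops above are the standard emulation of a floating-point `atomicAdd` via `atomicCAS`, which stays correct even when the native FP-atomic path is in doubt (the same concern behind the `-mno-unsafe-fp-atomics` flag above). The following is a minimal standalone HIP sketch of the double-precision variant plus a toy kernel that exercises it; the function, kernel, and test names are illustrative only and are not part of this patch.

```cpp
// cas_atomic_add_demo.hip.cpp -- hypothetical standalone test, not part of the patch.
#include <hip/hip_runtime.h>
#include <cstdio>

// CAS-based double atomicAdd, same pattern as the DCA_HAVE_HIP branch above.
__device__ double casAtomicAdd(double* address, const double val) {
  auto* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
  unsigned long long int old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Integer comparison avoids an infinite loop when the stored value is NaN (NaN != NaN).
  } while (assumed != old);
  return __longlong_as_double(old);
}

__global__ void accumulate(double* sum, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    casAtomicAdd(sum, 1.0);  // every thread adds 1; the final sum should equal n.
}

int main() {
  double* sum;
  hipMalloc(&sum, sizeof(double));
  hipMemset(sum, 0, sizeof(double));

  const int n = 256 * 256;
  accumulate<<<256, 256>>>(sum, n);

  double host = 0.;
  hipMemcpy(&host, sum, sizeof(double), hipMemcpyDeviceToHost);
  std::printf("sum = %f (expected %d)\n", host, n);

  hipFree(sum);
  return 0;
}
```

If every concurrent add is observed, the printed sum matches `n` exactly (the value is integer-valued, so no rounding ambiguity); a lost update from a misbehaving native atomic would show up as a smaller result.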
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
index 3e681495b..1f5ab6d8d 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
@@ -1012,11 +1012,15 @@ void CtauxWalker::read_Gamma_matrices(e_spin_states
     case e_DN:
       CT_AUX_WALKER_TOOLS::compute_Gamma(
           Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
+      // The kernel above may not finish before the async copy starts; synchronize the stream.
+      linalg::util::syncStream(thread_id, stream_id);
       break;

     case e_UP:
       CT_AUX_WALKER_TOOLS::compute_Gamma(
           Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
+      // The kernel above may not finish before the async copy starts; synchronize the stream.
+      linalg::util::syncStream(thread_id, stream_id);
       break;

     default:
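As a side note on the synchronization added here: `linalg::util::syncStream(thread_id, stream_id)` is presumably DCA++'s wrapper for waiting on the stream identified by that thread/stream pair. The standalone HIP sketch below, with plain API calls and hypothetical names (`fillKernel`, `compute_stream`, `copy_stream`), illustrates the underlying hazard: a kernel launched on one stream is not ordered with respect to an asynchronous copy issued on another stream unless something makes it so.

```cpp
// stream_ordering_demo.hip.cpp -- illustrative sketch, not taken from DCA++.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

__global__ void fillKernel(float* data, int n, float value) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = value;
}

int main() {
  constexpr int n = 1 << 20;
  float* dev_data;
  hipMalloc(&dev_data, n * sizeof(float));

  hipStream_t compute_stream, copy_stream;
  hipStreamCreate(&compute_stream);
  hipStreamCreate(&copy_stream);

  // Produce the data on the compute stream.
  fillKernel<<<(n + 255) / 256, 256, 0, compute_stream>>>(dev_data, n, 1.f);

  // Without this wait, the copy below could read dev_data before fillKernel has finished,
  // because the two streams are not ordered with respect to each other.
  hipStreamSynchronize(compute_stream);

  // Consume the data via an async copy on a different stream.
  std::vector<float> host(n);
  hipMemcpyAsync(host.data(), dev_data, n * sizeof(float), hipMemcpyDeviceToHost, copy_stream);
  hipStreamSynchronize(copy_stream);

  std::printf("host[0] = %f (expected 1.0)\n", host[0]);

  hipStreamDestroy(compute_stream);
  hipStreamDestroy(copy_stream);
  hipFree(dev_data);
  return 0;
}
```

Where the extra host-side block matters for throughput, a lighter alternative is to record a `hipEvent_t` on the compute stream and have the copy stream wait on it with `hipStreamWaitEvent`, which orders the two streams without stalling the host thread.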