From 9f01b166f32d277e16be75609a3c5cc3650ba7ba Mon Sep 17 00:00:00 2001
From: Peter Doak <doakpw@ornl.gov>
Date: Wed, 10 Apr 2024 14:48:30 -0400
Subject: [PATCH] reduce page access violations and seemingly Gamma_LU min_max
 fails

---
 cmake/dca_hip.cmake                           |  3 ++
 .../dca/linalg/util/atomic_add_cuda.cu.hpp    | 49 ++++++++++++++++++-
 include/dca/math/nfft/dnfft_1d_gpu.hpp        |  2 +
 .../cluster_solver/ctaux/ctaux_walker.hpp     |  4 ++
 4 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/cmake/dca_hip.cmake b/cmake/dca_hip.cmake
index 2d3e0ebb3..4ba855a03 100644
--- a/cmake/dca_hip.cmake
+++ b/cmake/dca_hip.cmake
@@ -74,6 +74,9 @@ if (CMAKE_HIP_COMPILER)
   set(DCA_HIP_PROPERTIES "CMAKE_HIP_ARCHITECTURES gfx908,gfx90a")
   set(CMAKE_HIP_STANDARD 17)
   list(APPEND HIP_HIPCC_FLAGS "-fPIC")
+  list(APPEND HIP_HIPCC_FLAGS "-mno-unsafe-fp-atomics")
+  list(APPEND HIP_HIPCC_FLAGS "-fgpu-default-stream=per-thread")
+
   # doesn't appear to work
   set(CMAKE_HIP_SOURCE_FILE_EXTENSIONS cu)
   message("Enabled HIP as a language")
diff --git a/include/dca/linalg/util/atomic_add_cuda.cu.hpp b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
index da7b35dc3..f0bd9c134 100644
--- a/include/dca/linalg/util/atomic_add_cuda.cu.hpp
+++ b/include/dca/linalg/util/atomic_add_cuda.cu.hpp
@@ -42,6 +42,52 @@ __device__ void inline atomicAdd(double* address, const double val) {
   atomicAddImpl(address, val);
 }
 
+#elif defined(DCA_HAVE_HIP)
+// HIP seems to have some horrible problem with concurrent atomic operations.
+__device__ double inline atomicAddImpl(double* address, const double val) {
+  unsigned long long int* address_as_ull = (unsigned long long int*)address;
+  unsigned long long int old = *address_as_ull, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) }
+  } while (assumed != old);
+  return __longlong_as_double(old);
+}
+
+__device__ double inline atomicAddImpl(float* address, const float val) {
+  unsigned long int* address_as_int = (unsigned long int*)address;
+  unsigned long int old = *address_as_int, assumed;
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_int, assumed,
+                    __float_as_int(val + __int_as_float(assumed)));
+    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN) }
+  } while (assumed != old);
+  return __int_as_float(old);
+}
+  
+__device__ void inline atomicAdd(float* address, const float val) {
+  atomicAddImpl(address, val);
+}
+
+__device__ void inline atomicAdd(double* address, const double val) {
+  atomicAddImpl(address, val);
+}
+
+__device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val) {
+  double* a_d = reinterpret_cast<double*>(address);
+  atomicAddImpl(a_d, val.x);
+  atomicAddImpl(a_d + 1, val.y);
+}
+
+  __device__ void inline atomicAdd(magmaFloatComplex* const address, magmaFloatComplex val) {
+  double* a_d = reinterpret_cast<double*>(address);
+  atomicAddImpl(a_d, val.x);
+  atomicAddImpl(a_d + 1, val.y);
+}
+
 #else
 __device__ void inline atomicAdd(double* address, double val) {
   ::atomicAdd(address, val);
@@ -62,8 +108,9 @@ __device__ void inline atomicAdd(cuDoubleComplex* address, cuDoubleComplex val)
   atomicAdd(a_d, val.x);
   atomicAdd(a_d + 1, val.y);
 }
-#endif  // __CUDA_ARCH__
+#endif  // atomic operation help
 
+  
 }  // linalg
 }  // dca
 
diff --git a/include/dca/math/nfft/dnfft_1d_gpu.hpp b/include/dca/math/nfft/dnfft_1d_gpu.hpp
index 0cae41164..587416459 100644
--- a/include/dca/math/nfft/dnfft_1d_gpu.hpp
+++ b/include/dca/math/nfft/dnfft_1d_gpu.hpp
@@ -224,6 +224,8 @@ void Dnfft1DGpu<Scalar, WDmn, RDmn, oversampling, CUBIC>::accumulate(
   config_left_dev_.setAsync(config_left_, stream_);
   times_dev_.setAsync(times_, stream_);
 
+  //hipStreamSynchronize(stream_.streamActually());
+  
   details::accumulateOnDevice<oversampling, BaseClass::window_sampling_, Scalar, Real>(
       M.ptr(), M.leadingDimension(), factor, accumulation_matrix_.ptr(),
       accumulation_matrix_sqr_.ptr(), accumulation_matrix_.leadingDimension(), config_left_dev_.ptr(),
diff --git a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
index 3e681495b..1f5ab6d8d 100644
--- a/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
+++ b/include/dca/phys/dca_step/cluster_solver/ctaux/ctaux_walker.hpp
@@ -1012,11 +1012,15 @@ void CtauxWalker<device_t, Parameters, Data>::read_Gamma_matrices(e_spin_states
     case e_DN:
       CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
           Gamma_dn, N_dn, G_dn, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
+      // assume we've no guarantee this will be allowed to finish before the async copy starts
+      linalg::util::syncStream(thread_id, stream_id);
       break;
 
     case e_UP:
       CT_AUX_WALKER_TOOLS<device_t, Scalar>::compute_Gamma(
           Gamma_up, N_up, G_up, vertex_indixes, exp_V, exp_delta_V, thread_id, stream_id);
+      // assume we've no guarantee this will be allowed to finish before the async copy starts
+      linalg::util::syncStream(thread_id, stream_id);
       break;
 
     default: