Skip to content

Commit eb650bb

Browse files
committed
fast weight kernel, remove extra syncthreads
1 parent daac780 commit eb650bb

File tree

3 files changed

+3
-15
lines changed

3 files changed

+3
-15
lines changed

algorithmic/fast_weight/fast_weight_cuda.cu

+1-5
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,14 @@ __global__ void fast_weight_forward_kernel(
107107
// get old value
108108
v_old = shared_kv[threadIdx.x + sub * blockDim.x] *
109109
shared_keys[t*E_block + e];
110-
__syncthreads();
111110

112111
atomicAdd(
113112
&shared_values_old[m],
114113
v_old
115114
);
116-
__syncthreads();
117115
}
118116
}
117+
__syncthreads();
119118

120119
// compute new value to be inserted
121120
if (threadIdx.x < M) {
@@ -132,7 +131,6 @@ __global__ void fast_weight_forward_kernel(
132131
if (e < E) {
133132
shared_kv[threadIdx.x + sub * blockDim.x] +=
134133
shared_keys[t*E_block + e] * shared_values_insert[m];
135-
__syncthreads();
136134

137135
res = shared_queries[t*E_block + e]
138136
* shared_kv[threadIdx.x + sub * blockDim.x];
@@ -497,15 +495,13 @@ __global__ void fast_weight_backward_value_beta_kernel(
497495
if (e < E) {
498496
shared_kv[threadIdx.x + sub * blockDim.x] +=
499497
shared_queries[t*E_block + e] * shared_gradout[t*M + m];
500-
__syncthreads();
501498

502499
float res = shared_keys[t*E_block + e]
503500
* shared_kv[threadIdx.x + sub * blockDim.x];
504501
atomicAdd(
505502
&shared_results[m],
506503
res
507504
);
508-
__syncthreads();
509505
}
510506
}
511507
__syncthreads();

language_modeling/src/utils/fast_fast_weight/fast_weight_cuda.cu

+1-5
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,14 @@ __global__ void fast_weight_forward_kernel(
112112
if (e < E) {
113113
// get old value
114114
v_old = shared_kv[kv_idx] * shared_keys[e_abs];
115-
__syncthreads();
116115

117116
atomicAdd(
118117
&shared_v_old[m],
119118
v_old
120119
);
121-
__syncthreads();
122120
}
123121
}
122+
__syncthreads();
124123

125124
// compute new value to be inserted
126125
if (threadIdx.x < M) {
@@ -138,7 +137,6 @@ __global__ void fast_weight_forward_kernel(
138137
kv_idx = threadIdx.x + sub * blockDim.x;
139138
if (e < E) {
140139
shared_kv[kv_idx] += shared_keys[e_abs] * shared_v_insert[m];
141-
__syncthreads();
142140
res = shared_queries[e_abs] * shared_kv[kv_idx];
143141
atomicAdd(
144142
&shared_results[m],
@@ -512,14 +510,12 @@ __global__ void fast_weight_backward_value_beta_kernel(
512510
if (e < E) {
513511
shared_kv[kv_idx] +=
514512
shared_queries[e_abs] * shared_gradout[m_abs];
515-
__syncthreads();
516513

517514
float res = shared_keys[e_abs] * shared_kv[kv_idx];
518515
atomicAdd(
519516
&shared_results[m],
520517
res
521518
);
522-
__syncthreads();
523519
}
524520
}
525521
__syncthreads();

reinforcement_learning/torchbeast/fast_weight/fast_weight_cuda.cu

+1-5
Original file line numberDiff line numberDiff line change
@@ -107,15 +107,14 @@ __global__ void fast_weight_forward_kernel(
107107
// get old value
108108
v_old = shared_kv[threadIdx.x + sub * blockDim.x] *
109109
shared_keys[t*E_block + e];
110-
__syncthreads();
111110

112111
atomicAdd(
113112
&shared_values_old[m],
114113
v_old
115114
);
116-
__syncthreads();
117115
}
118116
}
117+
__syncthreads();
119118

120119
// compute new value to be inserted
121120
if (threadIdx.x < M) {
@@ -132,7 +131,6 @@ __global__ void fast_weight_forward_kernel(
132131
if (e < E) {
133132
shared_kv[threadIdx.x + sub * blockDim.x] +=
134133
shared_keys[t*E_block + e] * shared_values_insert[m];
135-
__syncthreads();
136134

137135
res = shared_queries[t*E_block + e]
138136
* shared_kv[threadIdx.x + sub * blockDim.x];
@@ -497,15 +495,13 @@ __global__ void fast_weight_backward_value_beta_kernel(
497495
if (e < E) {
498496
shared_kv[threadIdx.x + sub * blockDim.x] +=
499497
shared_queries[t*E_block + e] * shared_gradout[t*M + m];
500-
__syncthreads();
501498

502499
float res = shared_keys[t*E_block + e]
503500
* shared_kv[threadIdx.x + sub * blockDim.x];
504501
atomicAdd(
505502
&shared_results[m],
506503
res
507504
);
508-
__syncthreads();
509505
}
510506
}
511507
__syncthreads();

0 commit comments

Comments (0)