From ae84cabb3e0f6416eb228c7e31f2d59d48f74e69 Mon Sep 17 00:00:00 2001 From: DavidDiazGuerra Date: Wed, 19 Jan 2022 14:09:33 +0100 Subject: [PATCH] Reduce the shared memory consumption of calcAmpTau_kernel for omnidirectional sources and receivers --- examples/time_vs_nbRIRs.py | 2 +- src/gpuRIR_cuda.cu | 67 ++++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/examples/time_vs_nbRIRs.py b/examples/time_vs_nbRIRs.py index f9bae08..e02d8e5 100644 --- a/examples/time_vs_nbRIRs.py +++ b/examples/time_vs_nbRIRs.py @@ -12,7 +12,7 @@ gpuRIR.activateMixedPrecision(False) gpuRIR.activateLUT(False) -nb_src_vec = np.concatenate([2**np.arange(11), [2047]]) # Number of RIRs to measure +nb_src_vec = np.concatenate([2**np.arange(12), [4095]]) # Number of RIRs to measure nb_test_per_point = 10 # Number of simulations per T60 to average the runtime nb_rcv = 1 # Number of receivers diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu index 8f9cf4f..1c804e2 100644 --- a/src/gpuRIR_cuda.cu +++ b/src/gpuRIR_cuda.cu @@ -299,23 +299,26 @@ __global__ void calcAmpTau_kernel(float* g_amp /*[M_src]M_rcv][nb_img_x][nb_img_ } } - // Copy g_orV_rcv to shared memory - float* sh_orV_rcv = &sh_pos_rcv[M_rcv*3]; - if (threadIdx.x==0 && threadIdx.y==0) { - for (int m=threadIdx.z; m -1 * orV_src. If rflx_id = 0 => 1 * orV_src - im_src_to_rcv[d] = -vec[d]; // change vector direction (mirror through origin) + // For speaker directivity + if (spkr_pattern != DIR_OMNI) orV_src[d] = (1-2*rflx_idx[d]) * sh_orV_src[m_src*3+d]; // If rflx_id=1 =>-1 *orV_src. If rflx_id= 0 => 1 * orV_src + im_src_to_rcv[d] = -vec[d]; //change vector direction (mirror through origin) } dist = sqrtf(dist); float amp = rflx_att / (4*PI*dist); - amp *= directivity(vec, &sh_orV_rcv[m_rcv], mic_pattern); // apply microphone directivity dampening - amp *= directivity(im_src_to_rcv, orV_src, spkr_pattern); // apply speaker directivity dampening + if (spkr_pattern != DIR_OMNI) amp *= directivity(im_src_to_rcv, orV_src, spkr_pattern); // Apply speaker directivity dampening + if (mic_pattern != DIR_OMNI) amp *= directivity(vec, &sh_orV_rcv[3*m_rcv], mic_pattern); // Apply microphone directivity dampening g_amp[m_src*M_rcv*prodN + m_rcv*prodN + n_idx] = amp; g_tau[m_src*M_rcv*prodN + m_rcv*prodN + n_idx] = dist / c * Fs; @@ -793,16 +796,18 @@ float* gpuRIR_cuda::cuda_simulateRIR(float room_sz[3], float beta[6], float* h_p gpuErrchk( cudaMalloc(&orV_rcv, M_rcv*3*sizeof(float)) ); gpuErrchk( cudaMemcpy(pos_src, h_pos_src, M_src*3*sizeof(float), cudaMemcpyHostToDevice ) ); gpuErrchk( cudaMemcpy(pos_rcv, h_pos_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice ) ); - gpuErrchk( cudaMemcpy(orV_src, h_orV_src, M_src*3*sizeof(float), cudaMemcpyHostToDevice ) ); - gpuErrchk( cudaMemcpy(orV_rcv, h_orV_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice ) ); - + if (spkr_pattern != DIR_OMNI) gpuErrchk( cudaMemcpy(orV_src, h_orV_src, M_src*3*sizeof(float), cudaMemcpyHostToDevice) ); + if (mic_pattern != DIR_OMNI) gpuErrchk( cudaMemcpy(orV_rcv, h_orV_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice) ); // Use the ISM to calculate the amplitude and delay of each image dim3 threadsPerBlockISM(nThreadsISM_x, nThreadsISM_y, nThreadsISM_z); dim3 numBlocksISM(ceil((float)nb_img[0] / nThreadsISM_x), ceil((float)nb_img[1] / nThreadsISM_y), ceil((float)nb_img[2] / nThreadsISM_z)); - int shMemISM = (2*M_src + 2*M_rcv) * 3 * sizeof(float); + int shMemISM = 0; + shMemISM += (spkr_pattern == DIR_OMNI)? M_src : 2*M_src; + shMemISM += (mic_pattern == DIR_OMNI)? M_rcv : 2*M_rcv; + shMemISM *= 3 * sizeof(float); // printf("shMemISM: %d\n", shMemISM); float* amp; // Amplitude with which the signals from each image source of each source arrive to each receiver