diff --git a/examples/time_vs_nbRIRs.py b/examples/time_vs_nbRIRs.py index c43f96f..f9bae08 100644 --- a/examples/time_vs_nbRIRs.py +++ b/examples/time_vs_nbRIRs.py @@ -12,7 +12,7 @@ gpuRIR.activateMixedPrecision(False) gpuRIR.activateLUT(False) -nb_src_vec = np.concatenate([2**np.arange(12), [4094]]) # Number of RIRs to measure +nb_src_vec = np.concatenate([2**np.arange(11), [2047]]) # Number of RIRs to measure nb_test_per_point = 10 # Number of simulations per T60 to average the runtime nb_rcv = 1 # Number of receivers diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu index 22328d3..8f9cf4f 100644 --- a/src/gpuRIR_cuda.cu +++ b/src/gpuRIR_cuda.cu @@ -1,1066 +1,1067 @@ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include "gpuRIR_cuda.h" - -#if CUDART_VERSION < 9000 -#define __h2div h2div -#endif - -// Image Source Method -static const int nThreadsISM_x = 4; -static const int nThreadsISM_y = 4; -static const int nThreadsISM_z = 4; - -// RIR computation -static const int initialReductionMin = 512; -static const int lut_oversamp = 16; -static const int nThreadsGen_t = 32; -static const int nThreadsGen_m = 4; -static const int nThreadsGen_n = 1; // Don't change it -static const int nThreadsRed = 128; - -// Power envelope prediction -static const int nThreadsEnvPred_x = 4; -static const int nThreadsEnvPred_y = 4; -static const int nThreadsEnvPred_z = 1; // Don't change it - -// Generate diffuse reverberation -static const int nThreadsDiff_t = 16; -static const int nThreadsDiff_src = 4; -static const int nThreadsDiff_rcv = 2; - -// RIR filtering onvolution -static const int nThreadsConv_x = 256; -static const int nThreadsConv_y = 1; -static const int nThreadsConv_z = 1; - -#if __CUDA_ARCH__ >= 530 -#define h2zeros __float2half2_rn(0.0) -#define h2ones __float2half2_rn(1.0) -#define h2pi __float2half2_rn(PI) -#endif - -// To hide the cuRAND generator in the header and don't need to include the cuda headers there -struct cuRandGeneratorWrapper_t -{ - curandGenerator_t gen; -}; -cuRandGeneratorWrapper_t gpuRIR_cuda::cuRandGenWrap; - -// CUDA architecture in format xy0 -int cuda_arch; - - -/***************************/ -/* Auxiliar host functions */ -/***************************/ - -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { - if (code != cudaSuccess) - { - fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(curandStatus_t code, const char *file, int line, bool abort=true) { - if (code != CURAND_STATUS_SUCCESS) - { - fprintf(stderr,"cuRAND: %d %s %d\n", code, file, line); - if (abort) exit(code); - } -} - -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cufftResult_t code, const char *file, int line, bool abort=true) { - if (code != CUFFT_SUCCESS) - { - fprintf(stderr,"cuFFT error: %d %s %d\n", code, file, line); - if (abort) exit(code); - } -} - -inline unsigned int pow2roundup (unsigned int x) { - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return x+1; -} - -/*****************************/ -/* Auxiliar device functions */ -/*****************************/ - -__device__ __forceinline__ float hanning_window(float t, float Tw) { - return 0.5f * (1.0f + __cosf(2.0f*PI*t/Tw)); -} - -__device__ 
__forceinline__ float sinc(float x) { - return (x==0)? 1 : sinf(x)/x; -} - -__device__ __forceinline__ float image_sample(float amp, float tau, float t, float Tw) { - float t_tau = t - tau; - return (abs(t_tau)0.0f)? 1.0f : 0.0f; - case DIR_CARD: return 0.5f + 0.5f*cosTheta; - case DIR_HYPCARD: return 0.25f + 0.75f*cosTheta; - case DIR_SUBCARD: return 0.75f + 0.25f*cosTheta; - case DIR_BIDIR: return cosTheta; - default: printf("Invalid polar pattern.\n"); return 0.0f; - } -} - -// cufftComplex scale -__device__ __forceinline__ cufftComplex ComplexScale(cufftComplex a, float s) { - cufftComplex c; - c.x = s * a.x; - c.y = s * a.y; - return c; -} - -// cufftComplex multiplication -__device__ __forceinline__ cufftComplex ComplexMul(cufftComplex a, cufftComplex b) { - cufftComplex c; - c.x = a.x * b.x - a.y * b.y; - c.y = a.x * b.y + a.y * b.x; - return c; -} - -/*********************************************/ -/* Mixed precision auxiliar device functions */ -/*********************************************/ - -#if __CUDA_ARCH__ >= 530 - -__device__ __forceinline__ half2 h2abs(half2 x) { - uint32_t i = *reinterpret_cast(&x) & 0x7FFF7FFF; - return *reinterpret_cast( &i ); -} - -__device__ __forceinline__ half2 my_h2sinpi(half2 x) { - // Argument reduction to [-0.5, 0.5] - half2 i = h2rint(x); - half2 r = __hsub2(x, i); - - // sin(pi*x) polinomial approximation for x in [-0.5,0.5] - half2 r2 = __hmul2(r, r); - half2 s = __float2half2_rn(+2.31786431325108f); - s = __hfma2(r2, s, __float2half2_rn(-5.14167814230801f)); - s = __hfma2(r2, s, __float2half2_rn(+3.14087446786993f)); - s = __hmul2(s, r); - - half2 i_2 = __hmul2(i, __float2half2_rn(0.5f)); - half2 sgn = __hfma2(__float2half2_rn(-2.0f), - __hne2(__hsub2(i_2, h2rint(i_2)), h2zeros), - h2ones); // 1 if i is even, else -1: -2 * ((i/2-round(i/2))!=0) + 1 - s = __hmul2(s, sgn); - - return s; -} - -__device__ __forceinline__ half2 my_h2cospi(half2 x) { - // It is always on [-0.5, 0.5], so we do not need argument reduction - - // cos(pi*x) polinomial approximation for x in [-0.5,0.5] - half2 x2 = __hmul2(x, x); - half2 c = __float2half2_rn(-1.229339658587166f); - c = __hfma2(x2, c, __float2half2_rn(+4.043619929856572f)); - c = __hfma2(x2, c, __float2half2_rn(-4.934120365987677f)); - c = __hfma2(x2, c, __float2half2_rn(+0.999995282317910f)); - - return c; -} - -__device__ __forceinline__ half2 hanning_window_mp(half2 t, half2 Tw_inv) { - half2 c = my_h2cospi(__hmul2(Tw_inv, t)); - return __hmul2(c, c); -} - -__device__ __forceinline__ half2 my_h2sinc(half2 x) { - x = __hfma2(__heq2(x, h2zeros), __float2half2_rn(1e-7f), x); - return __h2div(my_h2sinpi(x), __hmul2(h2pi, x)); - -} - -__device__ __forceinline__ half2 image_sample_mp(half2 amp, float tau, float t1, float t2, float Tw_2, half2 Tw_inv) { - float t1_tau = t1-tau; - float t2_tau = t2-tau; - half2 t_tau = __floats2half2_rn(t1_tau, t2_tau); - if (abs(t1_tau)(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f; -} - -/***********/ -/* KERNELS */ -/***********/ - -__global__ void calcAmpTau_kernel(float* g_amp /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, - float* g_tau /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, - float* g_tau_dp /*[M_src]M_rcv]*/, - float* g_pos_src/*[M_src][3]*/, float* g_pos_rcv/*[M_rcv][3]*/, float* g_orV_src/*[M_src][3]*/, float* g_orV_rcv/*[M_rcv][3]*/, - polarPattern spkr_pattern, polarPattern mic_pattern, float room_sz_x, float room_sz_y, float room_sz_z, - float beta_x1, float beta_x2, float beta_y1, float beta_y2, float beta_z1, float beta_z2, - int 
nb_img_x, int nb_img_y, int nb_img_z, - int M_src, int M_rcv, float c, float Fs) { - - extern __shared__ float sdata[]; - - int n[3]; - n[0] = blockIdx.x * blockDim.x + threadIdx.x; - n[1] = blockIdx.y * blockDim.y + threadIdx.y; - n[2] = blockIdx.z * blockDim.z + threadIdx.z; - - int N[3]; - N[0] = nb_img_x; - N[1] = nb_img_y; - N[2] = nb_img_z; - - float room_sz[3]; - room_sz[0] = room_sz_x; - room_sz[1] = room_sz_y; - room_sz[2] = room_sz_z; - - float beta[6]; - beta[0] = - beta_x1; - beta[1] = - beta_x2; - beta[2] = - beta_y1; - beta[3] = - beta_y2; - beta[4] = - beta_z1; - beta[5] = - beta_z2; - - int prodN = N[0]*N[1]*N[2]; - int n_idx = n[0]*N[1]*N[2] + n[1]*N[2] + n[2]; - - // Copy g_pos_src to shared memory - float* sh_pos_src = (float*) sdata; - if (threadIdx.y==0 && threadIdx.z==0) { - for (int m=threadIdx.x; m -1 * orV_src. If rflx_id = 0 => 1 * orV_src - im_src_to_rcv[d] = -vec[d]; // change vector direction (mirror through origin) - } - dist = sqrtf(dist); - float amp = rflx_att / (4*PI*dist); - amp *= directivity(vec, &sh_orV_rcv[m_rcv], mic_pattern); // apply microphone directivity dampening - amp *= directivity(im_src_to_rcv, orV_src, spkr_pattern); // apply speaker directivity dampening - - g_amp[m_src*M_rcv*prodN + m_rcv*prodN + n_idx] = amp; - g_tau[m_src*M_rcv*prodN + m_rcv*prodN + n_idx] = dist / c * Fs; - - if (direct_path){ - g_tau_dp[m_src*M_rcv + m_rcv] = dist / c * Fs; - // printf("%f,",amp); // For creating polar pattern diagrams - } - } - } - } -} - -__global__ void generateRIR_kernel(float* initialRIR, float* amp, float* tau, int T, int M, int N, int iniRIR_N, int ini_red, float Tw) { - int t = blockIdx.x * blockDim.x + threadIdx.x; - int m = blockIdx.y * blockDim.y + threadIdx.y; - int n_ini = blockIdx.z * ini_red; - int n_max = fminf(n_ini + ini_red, N); - - if (m0; s>>=1) { - if (tid < s) sdata[tid] += sdata[tid+s]; - __syncthreads(); - } - - if (tid==0) { - intermediateRIR[m*T*intRIR_N + t*intRIR_N + blockIdx.x] = sdata[0]; - } -} - -__global__ void envPred_kernel(float* A /*[M_src]M_rcv]*/, float* alpha /*[M_src]M_rcv]*/, - float* RIRs_early /*[M_src][M_rcv][nSamples]*/, float* tau_dp, /*[M_src]M_rcv]*/ - int M_src, int M_rcv, int nSamples, float Fs, - float room_sz_x, float room_sz_y, float room_sz_z, - float beta_x1, float beta_x2, float beta_y1, float beta_y2, float beta_z1, float beta_z2) { - - float w_sz = 10e-3f * Fs; // Maximum window size (samples) to compute the final power of the early RIRs_early - - int m_src = blockIdx.x * blockDim.x + threadIdx.x; - int m_rcv = blockIdx.y * blockDim.y + threadIdx.y; - - if (m_src= 530 - int t = blockIdx.x * blockDim.x + threadIdx.x; - int m = blockIdx.y * blockDim.y + threadIdx.y; - int n_ini = blockIdx.z * ini_red; - int n_max = fminf(n_ini + ini_red, N); - - if (m= 530 - int tid = threadIdx.x; - int n = blockIdx.x*(blockDim.x*2) + threadIdx.x; - int t = blockIdx.y; - int m = blockIdx.z; - - if (n+blockDim.x < N) sdata_mp[tid] = __hadd2(initialRIR[m*T*N + t*N + n], initialRIR[m*T*N + t*N + n+blockDim.x]); - else if (n0; s>>=1) { - if (tid < s) sdata_mp[tid] = __hadd2(sdata_mp[tid], sdata_mp[tid+s]); - __syncthreads(); - } - - if (tid==0) { - intermediateRIR[m*T*intRIR_N + t*intRIR_N + blockIdx.x] = sdata_mp[0]; - } - #else - printf("Mixed precision requires Pascal GPU architecture or higher.\n"); - #endif -} - -__global__ void h2RIR_to_floatRIR_kernel(half2* h2RIR, float* floatRIR, int M, int T) { - #if __CUDA_ARCH__ >= 530 - int t = blockIdx.x * blockDim.x + threadIdx.x; - int m = blockIdx.y * blockDim.y + 
threadIdx.y; - - if (t 1e9) initialReduction *= 2; - - int iniRIR_N = ceil((float)N/initialReduction); - dim3 threadsPerBlockIni(nThreadsGen_t, nThreadsGen_m, nThreadsGen_n); - dim3 numBlocksIni(ceil((float)T/threadsPerBlockIni.x), ceil((float)M/threadsPerBlockIni.y), iniRIR_N); - - float* initialRIR; - gpuErrchk( cudaMalloc(&initialRIR, M*T*iniRIR_N*sizeof(float)) ); - - int Tw = (int) round(8e-3f * Fs); // Window duration [samples] - - if (lookup_table) { - int lut_len = Tw * lut_oversamp; - lut_len += ((lut_len%2)? 0 : 1); // Must be odd - cudaArray* cuArrayLut; - cudaTextureObject_t sinc_lut = create_sinc_texture_lut(&cuArrayLut, Tw, lut_len); - - generateRIR_kernel_lut<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw/2, sinc_lut, lut_len/2+0.5 ); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - cudaDestroyTextureObject(sinc_lut); - cudaFreeArray(cuArrayLut); - } else { - generateRIR_kernel<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw ); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - } - - - dim3 threadsPerBlockRed(nThreadsRed, 1, 1); - float* intermediateRIR; - int intRIR_N; - while (iniRIR_N > 2*nThreadsRed) { - intRIR_N = ceil((float)iniRIR_N / (2*nThreadsRed)); - gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T * M * sizeof(float)) ); - - dim3 numBlocksRed(intRIR_N, T, M); - reduceRIR_kernel<<>>( - initialRIR, intermediateRIR, M, T, iniRIR_N, intRIR_N); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - gpuErrchk( cudaFree(initialRIR) ); - initialRIR = intermediateRIR; - iniRIR_N = intRIR_N; - } - - dim3 numBlocksEnd(1, T, M); - reduceRIR_kernel<<>>( - initialRIR, rir, M, T, iniRIR_N, 1); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaFree(initialRIR) ); -} - -void cuda_rirGenerator_mp(float* rir, float* amp, float* tau, int M, int N, int T, float Fs) { - if (cuda_arch >= 530) { - int initialReduction = initialReductionMin; - while (M * T/2 * ceil((float)N/initialReduction) > 1e9) initialReduction *= 2; - - int iniRIR_N = ceil((float)N/initialReduction); - dim3 threadsPerBlockIni(nThreadsGen_t, nThreadsGen_m, nThreadsGen_n); - dim3 numBlocksIni(ceil((float)T/2/threadsPerBlockIni.x), ceil((float)M/threadsPerBlockIni.y), iniRIR_N); - - half2* initialRIR; - gpuErrchk( cudaMalloc(&initialRIR, M*(T/2)*iniRIR_N*sizeof(half2)) ); - - float Tw_2 = 8e-3f * Fs / 2; - #if CUDART_VERSION < 9020 - // For CUDA versions older than 9.2 it is nos possible to call from host code __float2half2_rn, - // but doing it in the kernel is slower - float Tw_inv = 1.0f / (8e-3f * Fs); - #else - half2 Tw_inv = __float2half2_rn(1.0f / (8e-3f * Fs)); - #endif - generateRIR_mp_kernel<<>>( initialRIR, amp, tau, T/2, M, N, iniRIR_N, initialReduction, Fs, Tw_2, Tw_inv ); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - dim3 threadsPerBlockRed(nThreadsRed, 1, 1); - half2* intermediateRIR; - int intRIR_N; - while (iniRIR_N > 2*nThreadsRed) { - intRIR_N = ceil((float)iniRIR_N / (2*nThreadsRed)); - gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T/2 * M * sizeof(half2)) ); - - dim3 numBlocksRed(intRIR_N, T/2, M); - reduceRIR_mp_kernel<<>>( - initialRIR, intermediateRIR, M, T/2, iniRIR_N, intRIR_N); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - gpuErrchk( cudaFree(initialRIR) ); - initialRIR = intermediateRIR; - iniRIR_N = intRIR_N; - } - - gpuErrchk( 
cudaMalloc(&intermediateRIR, M * T/2 * sizeof(half2)) ); - dim3 numBlocksEnd(1, T/2, M); - reduceRIR_mp_kernel<<>>( - initialRIR, intermediateRIR, M, T/2, iniRIR_N, 1); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaFree(initialRIR) ); - - dim3 numBlocks(ceil((float)(T/2)/128), M, 1); - dim3 threadsPerBlock(128, 1, 1); - h2RIR_to_floatRIR_kernel<<>>(intermediateRIR, rir, M, T/2); - - gpuErrchk( cudaFree(intermediateRIR) ); - } else { - printf("Mixed precision requires Pascal GPU architecture or higher.\n"); - } -} - -int gpuRIR_cuda::PadData(float *signal, float **padded_signal, int segment_len, - float *RIR, float **padded_RIR, int M_src, int M_rcv, int RIR_len) { - - int N_fft = pow2roundup(segment_len + RIR_len - 1); - - // Pad signal - float *new_data = (float *)malloc(sizeof(float) * M_src * (N_fft+2)); - for (int m=0; m>> ( - amp, tau, tau_dp, - pos_src, pos_rcv, orV_src, orV_rcv, spkr_pattern, mic_pattern, - room_sz[0], room_sz[1], room_sz[2], - beta[0], beta[1], beta[2], beta[3], beta[4], beta[5], - nb_img[0], nb_img[1], nb_img[2], - M_src, M_rcv, c, Fs - ); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - int nSamplesISM = ceil(Tdiff*Fs); - nSamplesISM += nSamplesISM%2; // nSamplesISM must be even - int nSamples = ceil(Tmax*Fs); - nSamples += nSamples%2; // nSamples must be even - int nSamplesDiff = nSamples - nSamplesISM; - - // Compute the RIRs as a sum of sincs - int M = M_src * M_rcv; - int N = nb_img[0] * nb_img[1] * nb_img[2]; - float* rirISM; - gpuErrchk( cudaMalloc(&rirISM, M*nSamplesISM*sizeof(float)) ); - if (mixed_precision) { - if (cuda_arch >= 530) { - cuda_rirGenerator_mp(rirISM, amp, tau, M, N, nSamplesISM, Fs); - } else { - printf("The mixed precision requires Pascal GPU architecture or higher.\n"); - } - } else { - cuda_rirGenerator(rirISM, amp, tau, M, N, nSamplesISM, Fs); - } - - // Compute the exponential power envelope parammeters of each RIR - dim3 threadsPerBlockEnvPred(nThreadsEnvPred_x, nThreadsEnvPred_y, nThreadsEnvPred_z); - dim3 numBlocksEnvPred(ceil((float)M_src / nThreadsEnvPred_x), - ceil((float)M_rcv / nThreadsEnvPred_y), 1); - - float* A; // pow_env = A * exp(alpha * (t-tau_dp)) - gpuErrchk( cudaMalloc(&A, M_src*M_rcv*sizeof(float)) ); - float* alpha; - gpuErrchk( cudaMalloc(&alpha, M_src*M_rcv*sizeof(float)) ); - - envPred_kernel<<>>( - A, alpha, rirISM, tau_dp, M_src, M_rcv, nSamplesISM, Fs, - room_sz[0], room_sz[1], room_sz[2], beta[0], beta[1], beta[2], beta[3], beta[4], beta[5]); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - // Generate diffuse reverberation - float* rirDiff; - gpuErrchk( cudaMalloc(&rirDiff, M_src*M_rcv*nSamplesDiff*sizeof(float)) ); - - if (nSamplesDiff != 0) { - // Fill rirDiff with random numbers with uniform distribution - gpuErrchk( curandGenerateUniform(cuRandGenWrap.gen, rirDiff, M_src*M_rcv*nSamplesDiff) ); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - dim3 threadsPerBlockDiff(nThreadsDiff_t, nThreadsDiff_src, nThreadsDiff_rcv); - dim3 numBlocksDiff(ceil((float)nSamplesDiff / nThreadsDiff_t), - ceil((float)M_src / nThreadsDiff_src), - ceil((float)M_rcv / nThreadsDiff_rcv)); - diffRev_kernel<<>>( - rirDiff, A, alpha, tau_dp, M_src, M_rcv, nSamplesISM, nSamplesDiff); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - } - - // Copy GPU memory to host - int rirSizeISM = M_src * M_rcv * nSamplesISM * sizeof(float); - int rirSizeDiff = M_src * 
M_rcv * nSamplesDiff * sizeof(float); - float* h_rir = (float*) malloc(rirSizeISM+rirSizeDiff); - - cudaPitchedPtr h_rir_pitchedPtr = make_cudaPitchedPtr( (void*) h_rir, - (nSamplesISM+nSamplesDiff)*sizeof(float), nSamplesISM+nSamplesDiff, M_rcv ); - cudaPitchedPtr rirISM_pitchedPtr = make_cudaPitchedPtr( (void*) rirISM, - nSamplesISM*sizeof(float), nSamplesISM, M_rcv ); - cudaPitchedPtr rirDiff_pitchedPtr = make_cudaPitchedPtr( (void*) rirDiff, - nSamplesDiff*sizeof(float), nSamplesDiff, M_rcv ); - - cudaMemcpy3DParms parmsISM = {0}; - parmsISM.srcPtr = rirISM_pitchedPtr; - parmsISM.dstPtr = h_rir_pitchedPtr; - parmsISM.extent = make_cudaExtent(nSamplesISM*sizeof(float), M_rcv, M_src); - parmsISM.kind = cudaMemcpyDeviceToHost; - gpuErrchk( cudaMemcpy3D(&parmsISM) ); - - if (nSamplesDiff > 0) { - cudaMemcpy3DParms parmsDiff = {0}; - parmsDiff.srcPtr = rirDiff_pitchedPtr; - parmsDiff.dstPtr = h_rir_pitchedPtr; - parmsDiff.dstPos = make_cudaPos(nSamplesISM*sizeof(float), 0, 0); - parmsDiff.extent = make_cudaExtent(nSamplesDiff*sizeof(float), M_rcv, M_src); - parmsDiff.kind = cudaMemcpyDeviceToHost; - gpuErrchk( cudaMemcpy3D(&parmsDiff) ); - } - - // Free memory - gpuErrchk( cudaFree(pos_src) ); - gpuErrchk( cudaFree(pos_rcv) ); - gpuErrchk( cudaFree(orV_rcv) ); - gpuErrchk( cudaFree(amp) ); - gpuErrchk( cudaFree(tau) ); - gpuErrchk( cudaFree(tau_dp) ); - gpuErrchk( cudaFree(rirISM) ); - gpuErrchk( cudaFree(A) ); - gpuErrchk( cudaFree(alpha) ); - gpuErrchk( cudaFree(rirDiff) ); - - return h_rir; -} - -float* gpuRIR_cuda::cuda_convolutions(float* source_segments, int M_src, int segment_len, - float* RIR, int M_rcv, int RIR_len) { - // function float* cuda_filterRIR(float* source_segments, int M_src, int segments_len, - // float* RIR, int M_rcv, int RIR_len); - // Input parameters: - // float* source_segments : Source signal segment for each trajectory point - // int M_src : Number of trajectory points - // int segment_len : Length of the segments [samples] - // float* RIR : 3D array with the RIR from each point of the trajectory to each receiver - // int M_rcv : Number of receivers - // int RIR_len : Length of the RIRs [samples] - - // Size of the FFT needed to avoid circular convolution effects - int N_fft = pow2roundup(segment_len + RIR_len - 1); - - // Copy the signal segments with zero padding - int mem_size_signal = sizeof(float) * M_src * (N_fft+2); - cufftComplex *d_signal; - gpuErrchk( cudaMalloc((void **)&d_signal, mem_size_signal) ); - gpuErrchk( cudaMemcpy2D((void *)d_signal, (N_fft+2)*sizeof(float), - (void *)source_segments, segment_len*sizeof(float), - segment_len*sizeof(float), M_src, cudaMemcpyHostToDevice) ); - gpuErrchk( cudaMemset2D((void *)((float *)d_signal + segment_len), (N_fft+2)*sizeof(float), - 0, (N_fft+2-segment_len)*sizeof(float), M_src ) ); - - // Copy the RIRs with zero padding - cudaPitchedPtr h_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) RIR, - RIR_len*sizeof(float), RIR_len, M_rcv ); - int mem_size_RIR = sizeof(float) * M_src * M_rcv * (N_fft+2); - cufftComplex *d_RIR; - gpuErrchk( cudaMalloc((void **)&d_RIR, mem_size_RIR) ); - cudaPitchedPtr d_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR, - (N_fft+2)*sizeof(float), (N_fft+2), M_rcv ); - cudaMemcpy3DParms parmsCopySignal = {0}; - parmsCopySignal.srcPtr = h_RIR_pitchedPtr; - parmsCopySignal.dstPtr = d_RIR_pitchedPtr; - parmsCopySignal.extent = make_cudaExtent(RIR_len*sizeof(float), M_rcv, M_src); - parmsCopySignal.kind = cudaMemcpyHostToDevice; - gpuErrchk( cudaMemcpy3D(&parmsCopySignal) ); - 
gpuErrchk( cudaMemset2D((void *)((float *)d_RIR + RIR_len), (N_fft+2)*sizeof(float), - 0, (N_fft+2-RIR_len)*sizeof(float), M_rcv*M_src ) ); - - // CUFFT plans - cufftHandle plan_signal, plan_RIR, plan_RIR_inv; - gpuErrchk( cufftPlan1d(&plan_signal, N_fft, CUFFT_R2C, M_src) ); - gpuErrchk( cufftPlan1d(&plan_RIR, N_fft, CUFFT_R2C, M_src * M_rcv) ); - gpuErrchk( cufftPlan1d(&plan_RIR_inv, N_fft, CUFFT_C2R, M_src * M_rcv) ); - - // Transform signal and RIR - gpuErrchk( cufftExecR2C(plan_signal, (cufftReal *)d_signal, (cufftComplex *)d_signal) ); - gpuErrchk( cufftExecR2C(plan_RIR, (cufftReal *)d_RIR, (cufftComplex *)d_RIR ) ); - - // Multiply the coefficients together and normalize the result - dim3 threadsPerBlock(nThreadsConv_x, nThreadsConv_y, nThreadsConv_z); - int numBlocks_x = (int) ceil((float)(N_fft/2+1)/nThreadsConv_x); - int numBlocks_y = (int) ceil((float)M_rcv/nThreadsConv_y); - int numBlocks_z = (int) ceil((float)M_src/nThreadsConv_z); - dim3 numBlocks(numBlocks_x, numBlocks_y, numBlocks_z); - complexPointwiseMulAndScale<<>> - (d_signal, d_RIR, (N_fft/2+1), M_rcv, M_src, 1.0f/N_fft); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); - - // Transform signal back - gpuErrchk( cufftExecC2R(plan_RIR_inv, (cufftComplex *)d_RIR, (cufftReal *)d_RIR) ); - - // Copy device memory to host - int conv_len = segment_len + RIR_len - 1; - float *convolved_segments = (float *)malloc(sizeof(float)*M_src*M_rcv*conv_len); - cudaPitchedPtr d_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR, - (N_fft+2)*sizeof(float), conv_len, M_rcv ); - cudaPitchedPtr h_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) convolved_segments, - conv_len*sizeof(float), conv_len, M_rcv ); - cudaMemcpy3DParms parmsCopy = {0}; - parmsCopy.srcPtr = d_convolved_segments_pitchedPtr; - parmsCopy.dstPtr = h_convolved_segments_pitchedPtr; - parmsCopy.extent = make_cudaExtent(conv_len*sizeof(float), M_rcv, M_src); - parmsCopy.kind = cudaMemcpyDeviceToHost; - gpuErrchk( cudaMemcpy3D(&parmsCopy) ); - - //Destroy CUFFT context - gpuErrchk( cufftDestroy(plan_signal) ); - gpuErrchk( cufftDestroy(plan_RIR) ); - gpuErrchk( cufftDestroy(plan_RIR_inv) ); - - // cleanup memory - gpuErrchk( cudaFree(d_signal) ); - gpuErrchk( cudaFree(d_RIR) ); - - return convolved_segments; -} - -gpuRIR_cuda::gpuRIR_cuda(bool mPrecision, bool lut) { - // Get CUDA architecture - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, 0); - cuda_arch = prop.major*100 + prop.minor*10; - - // Activate mixed precision and lut if selected - activate_mixed_precision(mPrecision); - activate_lut(lut); - - // Initiate CUDA runtime API - float* memPtr_warmup; - gpuErrchk( cudaMalloc(&memPtr_warmup, 1*sizeof(float)) ); - gpuErrchk( cudaFree(memPtr_warmup) ); - - // Initiate cuFFT library - cufftHandle plan_warmup; - gpuErrchk( cufftPlan1d(&plan_warmup, 1024, CUFFT_R2C, 1) ); - gpuErrchk( cufftDestroy(plan_warmup) ); - - // Initialize cuRAND generator - gpuErrchk( curandCreateGenerator(&cuRandGenWrap.gen, CURAND_RNG_PSEUDO_DEFAULT) ); - gpuErrchk( curandSetPseudoRandomGeneratorSeed(cuRandGenWrap.gen, 1234ULL) ); -} - -bool gpuRIR_cuda::activate_mixed_precision(bool activate) { - if (cuda_arch >= 530) { - if (activate && lookup_table) { - printf("The mixed precision implementation is not compatible with the lookup table.\n"); - printf("Disabling the lookup table.\n"); - lookup_table = false; - } - mixed_precision = activate; - } else { - if (activate) printf("This feature requires Pascal GPU architecture or 
higher.\n"); - mixed_precision = false; - } - return mixed_precision; -} - -bool gpuRIR_cuda::activate_lut(bool activate) { - if (activate && mixed_precision) { - printf("The lookup table is not compatible with the mixed precision implementation.\n"); - printf("Disabling the mixed precision implementation.\n"); - mixed_precision = false; - } - lookup_table = activate; - return lookup_table; -} + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "gpuRIR_cuda.h" + +#if CUDART_VERSION < 9000 +#define __h2div h2div +#endif + +// Image Source Method +static const int nThreadsISM_x = 4; +static const int nThreadsISM_y = 4; +static const int nThreadsISM_z = 4; + +// RIR computation +static const int initialReductionMin = 512; +static const int lut_oversamp = 16; +static const int nThreadsGen_t = 32; +static const int nThreadsGen_m = 4; +static const int nThreadsGen_n = 1; // Don't change it +static const int nThreadsRed = 128; + +// Power envelope prediction +static const int nThreadsEnvPred_x = 4; +static const int nThreadsEnvPred_y = 4; +static const int nThreadsEnvPred_z = 1; // Don't change it + +// Generate diffuse reverberation +static const int nThreadsDiff_t = 16; +static const int nThreadsDiff_src = 4; +static const int nThreadsDiff_rcv = 2; + +// RIR filtering onvolution +static const int nThreadsConv_x = 256; +static const int nThreadsConv_y = 1; +static const int nThreadsConv_z = 1; + +#if __CUDA_ARCH__ >= 530 +#define h2zeros __float2half2_rn(0.0) +#define h2ones __float2half2_rn(1.0) +#define h2pi __float2half2_rn(PI) +#endif + +// To hide the cuRAND generator in the header and don't need to include the cuda headers there +struct cuRandGeneratorWrapper_t +{ + curandGenerator_t gen; +}; +cuRandGeneratorWrapper_t gpuRIR_cuda::cuRandGenWrap; + +// CUDA architecture in format xy0 +int cuda_arch; + + +/***************************/ +/* Auxiliar host functions */ +/***************************/ + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) { + if (code != cudaSuccess) + { + fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) exit(code); + } +} + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(curandStatus_t code, const char *file, int line, bool abort=true) { + if (code != CURAND_STATUS_SUCCESS) + { + fprintf(stderr,"cuRAND: %d %s %d\n", code, file, line); + if (abort) exit(code); + } +} + +#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } +inline void gpuAssert(cufftResult_t code, const char *file, int line, bool abort=true) { + if (code != CUFFT_SUCCESS) + { + fprintf(stderr,"cuFFT error: %d %s %d\n", code, file, line); + if (abort) exit(code); + } +} + +inline unsigned int pow2roundup (unsigned int x) { + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return x+1; +} + +/*****************************/ +/* Auxiliar device functions */ +/*****************************/ + +__device__ __forceinline__ float hanning_window(float t, float Tw) { + return 0.5f * (1.0f + __cosf(2.0f*PI*t/Tw)); +} + +__device__ __forceinline__ float sinc(float x) { + return (x==0)? 1 : sinf(x)/x; +} + +__device__ __forceinline__ float image_sample(float amp, float tau, float t, float Tw) { + float t_tau = t - tau; + return (abs(t_tau)0.0f)? 
1.0f : 0.0f; + case DIR_CARD: return 0.5f + 0.5f*cosTheta; + case DIR_HYPCARD: return 0.25f + 0.75f*cosTheta; + case DIR_SUBCARD: return 0.75f + 0.25f*cosTheta; + case DIR_BIDIR: return cosTheta; + default: printf("Invalid polar pattern.\n"); return 0.0f; + } +} + +// cufftComplex scale +__device__ __forceinline__ cufftComplex ComplexScale(cufftComplex a, float s) { + cufftComplex c; + c.x = s * a.x; + c.y = s * a.y; + return c; +} + +// cufftComplex multiplication +__device__ __forceinline__ cufftComplex ComplexMul(cufftComplex a, cufftComplex b) { + cufftComplex c; + c.x = a.x * b.x - a.y * b.y; + c.y = a.x * b.y + a.y * b.x; + return c; +} + +/*********************************************/ +/* Mixed precision auxiliar device functions */ +/*********************************************/ + +#if __CUDA_ARCH__ >= 530 + +__device__ __forceinline__ half2 h2abs(half2 x) { + uint32_t i = *reinterpret_cast(&x) & 0x7FFF7FFF; + return *reinterpret_cast( &i ); +} + +__device__ __forceinline__ half2 my_h2sinpi(half2 x) { + // Argument reduction to [-0.5, 0.5] + half2 i = h2rint(x); + half2 r = __hsub2(x, i); + + // sin(pi*x) polinomial approximation for x in [-0.5,0.5] + half2 r2 = __hmul2(r, r); + half2 s = __float2half2_rn(+2.31786431325108f); + s = __hfma2(r2, s, __float2half2_rn(-5.14167814230801f)); + s = __hfma2(r2, s, __float2half2_rn(+3.14087446786993f)); + s = __hmul2(s, r); + + half2 i_2 = __hmul2(i, __float2half2_rn(0.5f)); + half2 sgn = __hfma2(__float2half2_rn(-2.0f), + __hne2(__hsub2(i_2, h2rint(i_2)), h2zeros), + h2ones); // 1 if i is even, else -1: -2 * ((i/2-round(i/2))!=0) + 1 + s = __hmul2(s, sgn); + + return s; +} + +__device__ __forceinline__ half2 my_h2cospi(half2 x) { + // It is always on [-0.5, 0.5], so we do not need argument reduction + + // cos(pi*x) polinomial approximation for x in [-0.5,0.5] + half2 x2 = __hmul2(x, x); + half2 c = __float2half2_rn(-1.229339658587166f); + c = __hfma2(x2, c, __float2half2_rn(+4.043619929856572f)); + c = __hfma2(x2, c, __float2half2_rn(-4.934120365987677f)); + c = __hfma2(x2, c, __float2half2_rn(+0.999995282317910f)); + + return c; +} + +__device__ __forceinline__ half2 hanning_window_mp(half2 t, half2 Tw_inv) { + half2 c = my_h2cospi(__hmul2(Tw_inv, t)); + return __hmul2(c, c); +} + +__device__ __forceinline__ half2 my_h2sinc(half2 x) { + x = __hfma2(__heq2(x, h2zeros), __float2half2_rn(1e-7f), x); + return __h2div(my_h2sinpi(x), __hmul2(h2pi, x)); + +} + +__device__ __forceinline__ half2 image_sample_mp(half2 amp, float tau, float t1, float t2, float Tw_2, half2 Tw_inv) { + float t1_tau = t1-tau; + float t2_tau = t2-tau; + half2 t_tau = __floats2half2_rn(t1_tau, t2_tau); + if (abs(t1_tau)(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f; +} + +/***********/ +/* KERNELS */ +/***********/ + +__global__ void calcAmpTau_kernel(float* g_amp /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, + float* g_tau /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, + float* g_tau_dp /*[M_src]M_rcv]*/, + float* g_pos_src/*[M_src][3]*/, float* g_pos_rcv/*[M_rcv][3]*/, float* g_orV_src/*[M_src][3]*/, float* g_orV_rcv/*[M_rcv][3]*/, + polarPattern spkr_pattern, polarPattern mic_pattern, float room_sz_x, float room_sz_y, float room_sz_z, + float beta_x1, float beta_x2, float beta_y1, float beta_y2, float beta_z1, float beta_z2, + int nb_img_x, int nb_img_y, int nb_img_z, + int M_src, int M_rcv, float c, float Fs) { + + extern __shared__ float sdata[]; + + int n[3]; + n[0] = blockIdx.x * blockDim.x + threadIdx.x; + n[1] = blockIdx.y * blockDim.y + 
threadIdx.y; + n[2] = blockIdx.z * blockDim.z + threadIdx.z; + + int N[3]; + N[0] = nb_img_x; + N[1] = nb_img_y; + N[2] = nb_img_z; + + float room_sz[3]; + room_sz[0] = room_sz_x; + room_sz[1] = room_sz_y; + room_sz[2] = room_sz_z; + + float beta[6]; + beta[0] = - beta_x1; + beta[1] = - beta_x2; + beta[2] = - beta_y1; + beta[3] = - beta_y2; + beta[4] = - beta_z1; + beta[5] = - beta_z2; + + int prodN = N[0]*N[1]*N[2]; + int n_idx = n[0]*N[1]*N[2] + n[1]*N[2] + n[2]; + + // Copy g_pos_src to shared memory + float* sh_pos_src = (float*) sdata; + if (threadIdx.y==0 && threadIdx.z==0) { + for (int m=threadIdx.x; m -1 * orV_src. If rflx_id = 0 => 1 * orV_src + im_src_to_rcv[d] = -vec[d]; // change vector direction (mirror through origin) + } + dist = sqrtf(dist); + float amp = rflx_att / (4*PI*dist); + amp *= directivity(vec, &sh_orV_rcv[m_rcv], mic_pattern); // apply microphone directivity dampening + amp *= directivity(im_src_to_rcv, orV_src, spkr_pattern); // apply speaker directivity dampening + + g_amp[m_src*M_rcv*prodN + m_rcv*prodN + n_idx] = amp; + g_tau[m_src*M_rcv*prodN + m_rcv*prodN + n_idx] = dist / c * Fs; + + if (direct_path){ + g_tau_dp[m_src*M_rcv + m_rcv] = dist / c * Fs; + // printf("%f,",amp); // For creating polar pattern diagrams + } + } + } + } +} + +__global__ void generateRIR_kernel(float* initialRIR, float* amp, float* tau, int T, int M, int N, int iniRIR_N, int ini_red, float Tw) { + int t = blockIdx.x * blockDim.x + threadIdx.x; + int m = blockIdx.y * blockDim.y + threadIdx.y; + int n_ini = blockIdx.z * ini_red; + int n_max = fminf(n_ini + ini_red, N); + + if (m0; s>>=1) { + if (tid < s) sdata[tid] += sdata[tid+s]; + __syncthreads(); + } + + if (tid==0) { + intermediateRIR[m*T*intRIR_N + t*intRIR_N + blockIdx.x] = sdata[0]; + } +} + +__global__ void envPred_kernel(float* A /*[M_src]M_rcv]*/, float* alpha /*[M_src]M_rcv]*/, + float* RIRs_early /*[M_src][M_rcv][nSamples]*/, float* tau_dp, /*[M_src]M_rcv]*/ + int M_src, int M_rcv, int nSamples, float Fs, + float room_sz_x, float room_sz_y, float room_sz_z, + float beta_x1, float beta_x2, float beta_y1, float beta_y2, float beta_z1, float beta_z2) { + + float w_sz = 10e-3f * Fs; // Maximum window size (samples) to compute the final power of the early RIRs_early + + int m_src = blockIdx.x * blockDim.x + threadIdx.x; + int m_rcv = blockIdx.y * blockDim.y + threadIdx.y; + + if (m_src= 530 + int t = blockIdx.x * blockDim.x + threadIdx.x; + int m = blockIdx.y * blockDim.y + threadIdx.y; + int n_ini = blockIdx.z * ini_red; + int n_max = fminf(n_ini + ini_red, N); + + if (m= 530 + int tid = threadIdx.x; + int n = blockIdx.x*(blockDim.x*2) + threadIdx.x; + int t = blockIdx.y; + int m = blockIdx.z; + + if (n+blockDim.x < N) sdata_mp[tid] = __hadd2(initialRIR[m*T*N + t*N + n], initialRIR[m*T*N + t*N + n+blockDim.x]); + else if (n0; s>>=1) { + if (tid < s) sdata_mp[tid] = __hadd2(sdata_mp[tid], sdata_mp[tid+s]); + __syncthreads(); + } + + if (tid==0) { + intermediateRIR[m*T*intRIR_N + t*intRIR_N + blockIdx.x] = sdata_mp[0]; + } + #else + printf("Mixed precision requires Pascal GPU architecture or higher.\n"); + #endif +} + +__global__ void h2RIR_to_floatRIR_kernel(half2* h2RIR, float* floatRIR, int M, int T) { + #if __CUDA_ARCH__ >= 530 + int t = blockIdx.x * blockDim.x + threadIdx.x; + int m = blockIdx.y * blockDim.y + threadIdx.y; + + if (t 1e9) initialReduction *= 2; + + int iniRIR_N = ceil((float)N/initialReduction); + dim3 threadsPerBlockIni(nThreadsGen_t, nThreadsGen_m, nThreadsGen_n); + dim3 
numBlocksIni(ceil((float)T/threadsPerBlockIni.x), ceil((float)M/threadsPerBlockIni.y), iniRIR_N); + + float* initialRIR; + gpuErrchk( cudaMalloc(&initialRIR, M*T*iniRIR_N*sizeof(float)) ); + + int Tw = (int) round(8e-3f * Fs); // Window duration [samples] + + if (lookup_table) { + int lut_len = Tw * lut_oversamp; + lut_len += ((lut_len%2)? 0 : 1); // Must be odd + cudaArray* cuArrayLut; + cudaTextureObject_t sinc_lut = create_sinc_texture_lut(&cuArrayLut, Tw, lut_len); + + generateRIR_kernel_lut<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw/2, sinc_lut, lut_len/2+0.5 ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + cudaDestroyTextureObject(sinc_lut); + cudaFreeArray(cuArrayLut); + } else { + generateRIR_kernel<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + } + + + dim3 threadsPerBlockRed(nThreadsRed, 1, 1); + float* intermediateRIR; + int intRIR_N; + while (iniRIR_N > 2*nThreadsRed) { + intRIR_N = ceil((float)iniRIR_N / (2*nThreadsRed)); + gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T * M * sizeof(float)) ); + + dim3 numBlocksRed(intRIR_N, T, M); + reduceRIR_kernel<<>>( + initialRIR, intermediateRIR, M, T, iniRIR_N, intRIR_N); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + gpuErrchk( cudaFree(initialRIR) ); + initialRIR = intermediateRIR; + iniRIR_N = intRIR_N; + } + + dim3 numBlocksEnd(1, T, M); + reduceRIR_kernel<<>>( + initialRIR, rir, M, T, iniRIR_N, 1); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaFree(initialRIR) ); +} + +void cuda_rirGenerator_mp(float* rir, float* amp, float* tau, int M, int N, int T, float Fs) { + if (cuda_arch >= 530) { + int initialReduction = initialReductionMin; + while (M * T/2 * ceil((float)N/initialReduction) > 1e9) initialReduction *= 2; + + int iniRIR_N = ceil((float)N/initialReduction); + dim3 threadsPerBlockIni(nThreadsGen_t, nThreadsGen_m, nThreadsGen_n); + dim3 numBlocksIni(ceil((float)T/2/threadsPerBlockIni.x), ceil((float)M/threadsPerBlockIni.y), iniRIR_N); + + half2* initialRIR; + gpuErrchk( cudaMalloc(&initialRIR, M*(T/2)*iniRIR_N*sizeof(half2)) ); + + float Tw_2 = 8e-3f * Fs / 2; + #if CUDART_VERSION < 9020 + // For CUDA versions older than 9.2 it is nos possible to call from host code __float2half2_rn, + // but doing it in the kernel is slower + float Tw_inv = 1.0f / (8e-3f * Fs); + #else + half2 Tw_inv = __float2half2_rn(1.0f / (8e-3f * Fs)); + #endif + generateRIR_mp_kernel<<>>( initialRIR, amp, tau, T/2, M, N, iniRIR_N, initialReduction, Fs, Tw_2, Tw_inv ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + dim3 threadsPerBlockRed(nThreadsRed, 1, 1); + half2* intermediateRIR; + int intRIR_N; + while (iniRIR_N > 2*nThreadsRed) { + intRIR_N = ceil((float)iniRIR_N / (2*nThreadsRed)); + gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T/2 * M * sizeof(half2)) ); + + dim3 numBlocksRed(intRIR_N, T/2, M); + reduceRIR_mp_kernel<<>>( + initialRIR, intermediateRIR, M, T/2, iniRIR_N, intRIR_N); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + gpuErrchk( cudaFree(initialRIR) ); + initialRIR = intermediateRIR; + iniRIR_N = intRIR_N; + } + + gpuErrchk( cudaMalloc(&intermediateRIR, M * T/2 * sizeof(half2)) ); + dim3 numBlocksEnd(1, T/2, M); + reduceRIR_mp_kernel<<>>( + initialRIR, intermediateRIR, M, T/2, iniRIR_N, 1); + gpuErrchk( 
cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + gpuErrchk( cudaFree(initialRIR) ); + + dim3 numBlocks(ceil((float)(T/2)/128), M, 1); + dim3 threadsPerBlock(128, 1, 1); + h2RIR_to_floatRIR_kernel<<>>(intermediateRIR, rir, M, T/2); + + gpuErrchk( cudaFree(intermediateRIR) ); + } else { + printf("Mixed precision requires Pascal GPU architecture or higher.\n"); + } +} + +int gpuRIR_cuda::PadData(float *signal, float **padded_signal, int segment_len, + float *RIR, float **padded_RIR, int M_src, int M_rcv, int RIR_len) { + + int N_fft = pow2roundup(segment_len + RIR_len - 1); + + // Pad signal + float *new_data = (float *)malloc(sizeof(float) * M_src * (N_fft+2)); + for (int m=0; m>> ( + amp, tau, tau_dp, + pos_src, pos_rcv, orV_src, orV_rcv, spkr_pattern, mic_pattern, + room_sz[0], room_sz[1], room_sz[2], + beta[0], beta[1], beta[2], beta[3], beta[4], beta[5], + nb_img[0], nb_img[1], nb_img[2], + M_src, M_rcv, c, Fs + ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + int nSamplesISM = ceil(Tdiff*Fs); + nSamplesISM += nSamplesISM%2; // nSamplesISM must be even + int nSamples = ceil(Tmax*Fs); + nSamples += nSamples%2; // nSamples must be even + int nSamplesDiff = nSamples - nSamplesISM; + + // Compute the RIRs as a sum of sincs + int M = M_src * M_rcv; + int N = nb_img[0] * nb_img[1] * nb_img[2]; + float* rirISM; + gpuErrchk( cudaMalloc(&rirISM, M*nSamplesISM*sizeof(float)) ); + if (mixed_precision) { + if (cuda_arch >= 530) { + cuda_rirGenerator_mp(rirISM, amp, tau, M, N, nSamplesISM, Fs); + } else { + printf("The mixed precision requires Pascal GPU architecture or higher.\n"); + } + } else { + cuda_rirGenerator(rirISM, amp, tau, M, N, nSamplesISM, Fs); + } + + // Compute the exponential power envelope parammeters of each RIR + dim3 threadsPerBlockEnvPred(nThreadsEnvPred_x, nThreadsEnvPred_y, nThreadsEnvPred_z); + dim3 numBlocksEnvPred(ceil((float)M_src / nThreadsEnvPred_x), + ceil((float)M_rcv / nThreadsEnvPred_y), 1); + + float* A; // pow_env = A * exp(alpha * (t-tau_dp)) + gpuErrchk( cudaMalloc(&A, M_src*M_rcv*sizeof(float)) ); + float* alpha; + gpuErrchk( cudaMalloc(&alpha, M_src*M_rcv*sizeof(float)) ); + + envPred_kernel<<>>( + A, alpha, rirISM, tau_dp, M_src, M_rcv, nSamplesISM, Fs, + room_sz[0], room_sz[1], room_sz[2], beta[0], beta[1], beta[2], beta[3], beta[4], beta[5]); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + // Generate diffuse reverberation + float* rirDiff; + gpuErrchk( cudaMalloc(&rirDiff, M_src*M_rcv*nSamplesDiff*sizeof(float)) ); + + if (nSamplesDiff != 0) { + // Fill rirDiff with random numbers with uniform distribution + gpuErrchk( curandGenerateUniform(cuRandGenWrap.gen, rirDiff, M_src*M_rcv*nSamplesDiff) ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + dim3 threadsPerBlockDiff(nThreadsDiff_t, nThreadsDiff_src, nThreadsDiff_rcv); + dim3 numBlocksDiff(ceil((float)nSamplesDiff / nThreadsDiff_t), + ceil((float)M_src / nThreadsDiff_src), + ceil((float)M_rcv / nThreadsDiff_rcv)); + diffRev_kernel<<>>( + rirDiff, A, alpha, tau_dp, M_src, M_rcv, nSamplesISM, nSamplesDiff); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + } + + // Copy GPU memory to host + int rirSizeISM = M_src * M_rcv * nSamplesISM * sizeof(float); + int rirSizeDiff = M_src * M_rcv * nSamplesDiff * sizeof(float); + float* h_rir = (float*) malloc(rirSizeISM+rirSizeDiff); + + cudaPitchedPtr h_rir_pitchedPtr = make_cudaPitchedPtr( (void*) h_rir, + 
(nSamplesISM+nSamplesDiff)*sizeof(float), nSamplesISM+nSamplesDiff, M_rcv ); + cudaPitchedPtr rirISM_pitchedPtr = make_cudaPitchedPtr( (void*) rirISM, + nSamplesISM*sizeof(float), nSamplesISM, M_rcv ); + cudaPitchedPtr rirDiff_pitchedPtr = make_cudaPitchedPtr( (void*) rirDiff, + nSamplesDiff*sizeof(float), nSamplesDiff, M_rcv ); + + cudaMemcpy3DParms parmsISM = {0}; + parmsISM.srcPtr = rirISM_pitchedPtr; + parmsISM.dstPtr = h_rir_pitchedPtr; + parmsISM.extent = make_cudaExtent(nSamplesISM*sizeof(float), M_rcv, M_src); + parmsISM.kind = cudaMemcpyDeviceToHost; + gpuErrchk( cudaMemcpy3D(&parmsISM) ); + + if (nSamplesDiff > 0) { + cudaMemcpy3DParms parmsDiff = {0}; + parmsDiff.srcPtr = rirDiff_pitchedPtr; + parmsDiff.dstPtr = h_rir_pitchedPtr; + parmsDiff.dstPos = make_cudaPos(nSamplesISM*sizeof(float), 0, 0); + parmsDiff.extent = make_cudaExtent(nSamplesDiff*sizeof(float), M_rcv, M_src); + parmsDiff.kind = cudaMemcpyDeviceToHost; + gpuErrchk( cudaMemcpy3D(&parmsDiff) ); + } + + // Free memory + gpuErrchk( cudaFree(pos_src) ); + gpuErrchk( cudaFree(pos_rcv) ); + gpuErrchk( cudaFree(orV_rcv) ); + gpuErrchk( cudaFree(amp) ); + gpuErrchk( cudaFree(tau) ); + gpuErrchk( cudaFree(tau_dp) ); + gpuErrchk( cudaFree(rirISM) ); + gpuErrchk( cudaFree(A) ); + gpuErrchk( cudaFree(alpha) ); + gpuErrchk( cudaFree(rirDiff) ); + + return h_rir; +} + +float* gpuRIR_cuda::cuda_convolutions(float* source_segments, int M_src, int segment_len, + float* RIR, int M_rcv, int RIR_len) { + // function float* cuda_filterRIR(float* source_segments, int M_src, int segments_len, + // float* RIR, int M_rcv, int RIR_len); + // Input parameters: + // float* source_segments : Source signal segment for each trajectory point + // int M_src : Number of trajectory points + // int segment_len : Length of the segments [samples] + // float* RIR : 3D array with the RIR from each point of the trajectory to each receiver + // int M_rcv : Number of receivers + // int RIR_len : Length of the RIRs [samples] + + // Size of the FFT needed to avoid circular convolution effects + int N_fft = pow2roundup(segment_len + RIR_len - 1); + + // Copy the signal segments with zero padding + int mem_size_signal = sizeof(float) * M_src * (N_fft+2); + cufftComplex *d_signal; + gpuErrchk( cudaMalloc((void **)&d_signal, mem_size_signal) ); + gpuErrchk( cudaMemcpy2D((void *)d_signal, (N_fft+2)*sizeof(float), + (void *)source_segments, segment_len*sizeof(float), + segment_len*sizeof(float), M_src, cudaMemcpyHostToDevice) ); + gpuErrchk( cudaMemset2D((void *)((float *)d_signal + segment_len), (N_fft+2)*sizeof(float), + 0, (N_fft+2-segment_len)*sizeof(float), M_src ) ); + + // Copy the RIRs with zero padding + cudaPitchedPtr h_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) RIR, + RIR_len*sizeof(float), RIR_len, M_rcv ); + int mem_size_RIR = sizeof(float) * M_src * M_rcv * (N_fft+2); + cufftComplex *d_RIR; + gpuErrchk( cudaMalloc((void **)&d_RIR, mem_size_RIR) ); + cudaPitchedPtr d_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR, + (N_fft+2)*sizeof(float), (N_fft+2), M_rcv ); + cudaMemcpy3DParms parmsCopySignal = {0}; + parmsCopySignal.srcPtr = h_RIR_pitchedPtr; + parmsCopySignal.dstPtr = d_RIR_pitchedPtr; + parmsCopySignal.extent = make_cudaExtent(RIR_len*sizeof(float), M_rcv, M_src); + parmsCopySignal.kind = cudaMemcpyHostToDevice; + gpuErrchk( cudaMemcpy3D(&parmsCopySignal) ); + gpuErrchk( cudaMemset2D((void *)((float *)d_RIR + RIR_len), (N_fft+2)*sizeof(float), + 0, (N_fft+2-RIR_len)*sizeof(float), M_rcv*M_src ) ); + + // CUFFT plans + cufftHandle 
plan_signal, plan_RIR, plan_RIR_inv; + gpuErrchk( cufftPlan1d(&plan_signal, N_fft, CUFFT_R2C, M_src) ); + gpuErrchk( cufftPlan1d(&plan_RIR, N_fft, CUFFT_R2C, M_src * M_rcv) ); + gpuErrchk( cufftPlan1d(&plan_RIR_inv, N_fft, CUFFT_C2R, M_src * M_rcv) ); + + // Transform signal and RIR + gpuErrchk( cufftExecR2C(plan_signal, (cufftReal *)d_signal, (cufftComplex *)d_signal) ); + gpuErrchk( cufftExecR2C(plan_RIR, (cufftReal *)d_RIR, (cufftComplex *)d_RIR ) ); + + // Multiply the coefficients together and normalize the result + dim3 threadsPerBlock(nThreadsConv_x, nThreadsConv_y, nThreadsConv_z); + int numBlocks_x = (int) ceil((float)(N_fft/2+1)/nThreadsConv_x); + int numBlocks_y = (int) ceil((float)M_rcv/nThreadsConv_y); + int numBlocks_z = (int) ceil((float)M_src/nThreadsConv_z); + dim3 numBlocks(numBlocks_x, numBlocks_y, numBlocks_z); + complexPointwiseMulAndScale<<>> + (d_signal, d_RIR, (N_fft/2+1), M_rcv, M_src, 1.0f/N_fft); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + // Transform signal back + gpuErrchk( cufftExecC2R(plan_RIR_inv, (cufftComplex *)d_RIR, (cufftReal *)d_RIR) ); + + // Copy device memory to host + int conv_len = segment_len + RIR_len - 1; + float *convolved_segments = (float *)malloc(sizeof(float)*M_src*M_rcv*conv_len); + cudaPitchedPtr d_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR, + (N_fft+2)*sizeof(float), conv_len, M_rcv ); + cudaPitchedPtr h_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) convolved_segments, + conv_len*sizeof(float), conv_len, M_rcv ); + cudaMemcpy3DParms parmsCopy = {0}; + parmsCopy.srcPtr = d_convolved_segments_pitchedPtr; + parmsCopy.dstPtr = h_convolved_segments_pitchedPtr; + parmsCopy.extent = make_cudaExtent(conv_len*sizeof(float), M_rcv, M_src); + parmsCopy.kind = cudaMemcpyDeviceToHost; + gpuErrchk( cudaMemcpy3D(&parmsCopy) ); + + //Destroy CUFFT context + gpuErrchk( cufftDestroy(plan_signal) ); + gpuErrchk( cufftDestroy(plan_RIR) ); + gpuErrchk( cufftDestroy(plan_RIR_inv) ); + + // cleanup memory + gpuErrchk( cudaFree(d_signal) ); + gpuErrchk( cudaFree(d_RIR) ); + + return convolved_segments; +} + +gpuRIR_cuda::gpuRIR_cuda(bool mPrecision, bool lut) { + // Get CUDA architecture + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, 0); + cuda_arch = prop.major*100 + prop.minor*10; + + // Activate mixed precision and lut if selected + activate_mixed_precision(mPrecision); + activate_lut(lut); + + // Initiate CUDA runtime API + float* memPtr_warmup; + gpuErrchk( cudaMalloc(&memPtr_warmup, 1*sizeof(float)) ); + gpuErrchk( cudaFree(memPtr_warmup) ); + + // Initiate cuFFT library + cufftHandle plan_warmup; + gpuErrchk( cufftPlan1d(&plan_warmup, 1024, CUFFT_R2C, 1) ); + gpuErrchk( cufftDestroy(plan_warmup) ); + + // Initialize cuRAND generator + gpuErrchk( curandCreateGenerator(&cuRandGenWrap.gen, CURAND_RNG_PSEUDO_DEFAULT) ); + gpuErrchk( curandSetPseudoRandomGeneratorSeed(cuRandGenWrap.gen, 1234ULL) ); +} + +bool gpuRIR_cuda::activate_mixed_precision(bool activate) { + if (cuda_arch >= 530) { + if (activate && lookup_table) { + printf("The mixed precision implementation is not compatible with the lookup table.\n"); + printf("Disabling the lookup table.\n"); + lookup_table = false; + } + mixed_precision = activate; + } else { + if (activate) printf("This feature requires Pascal GPU architecture or higher.\n"); + mixed_precision = false; + } + return mixed_precision; +} + +bool gpuRIR_cuda::activate_lut(bool activate) { + if (activate && mixed_precision) { + printf("The lookup 
table is not compatible with the mixed precision implementation.\n"); + printf("Disabling the mixed precision implementation.\n"); + mixed_precision = false; + } + lookup_table = activate; + return lookup_table; +}
diff --git a/src/python_bind.cpp b/src/python_bind.cpp
index 09bd25d..cec72f3 100644
--- a/src/python_bind.cpp
+++ b/src/python_bind.cpp
@@ -58,6 +58,7 @@ py::array gpuRIR_bind::simulateRIR_bind(std::vector<float> room_sz, // Size of t
 	assert(info_pos_rcv.shape[1] == 3);
 	assert(info_orV_src.shape[1] == 3);
 	assert(info_orV_rcv.shape[1] == 3);
+	assert(info_pos_src.shape[0] == info_orV_src.shape[0]);
 	assert(info_pos_rcv.shape[0] == info_orV_rcv.shape[0]);
 	int M_src = info_pos_src.shape[0];
 	int M_rcv = info_pos_rcv.shape[0];
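
The new assert in simulateRIR_bind requires pos_src and orV_src to have the same number of rows (one orientation vector per source), mirroring the existing check on pos_rcv and orV_rcv. The following is a minimal Python sketch of a call that satisfies the new check; the helper names (beta_SabineEstimation, att2t_SabineEstimator, t2n) and the orV_src / spkr_pattern keyword arguments are assumed from the library's public Python API and may differ between gpuRIR versions.

    import numpy as np
    import gpuRIR

    room_sz = [3.0, 4.0, 2.5]                      # Room size [m]
    pos_src = np.array([[1.0, 1.0, 1.5],
                        [2.0, 3.0, 1.5]])          # (M_src, 3) source positions
    orV_src = np.array([[0.0, 1.0, 0.0],
                        [-1.0, 0.0, 0.0]])         # (M_src, 3) source orientations, one row per source
    pos_rcv = np.array([[1.5, 2.0, 1.0]])          # (M_rcv, 3) receiver positions
    orV_rcv = np.array([[1.0, 0.0, 0.0]])          # (M_rcv, 3) receiver orientations

    T60 = 0.6                                      # Reverberation time [s]
    fs = 16000.0                                   # Sampling frequency [Hz]
    beta = gpuRIR.beta_SabineEstimation(room_sz, T60)    # Reflection coefficients
    Tdiff = gpuRIR.att2t_SabineEstimator(15.0, T60)      # ISM until -15 dB, diffuse model afterwards
    Tmax = gpuRIR.att2t_SabineEstimator(60.0, T60)       # RIR length [s]
    nb_img = gpuRIR.t2n(Tdiff, room_sz)                  # Number of image sources per dimension

    # pos_src.shape[0] == orV_src.shape[0] (and likewise for the receivers),
    # so the shape assert added in python_bind.cpp passes.
    rirs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs,
                              Tdiff=Tdiff, orV_src=orV_src, orV_rcv=orV_rcv,
                              spkr_pattern="card", mic_pattern="card")

A call where pos_src has two rows but orV_src has only one would previously rely on undefined behavior in the binding; with this change it fails the assert instead.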