diff --git a/README.md b/README.md
index a8e3170..1844e6a 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
 * [`simulateRIR`](#simulaterir)
 * [`simulateTrajectory`](#simulatetrajectory)
 * [`activateMixedPrecision`](#activateMixedPrecision)
+* [`activateLUT`](#activateLUT)
 * [`beta_SabineEstimation`](#beta_sabineestimation)
 * [`att2t_SabineEstimator`](#att2t_sabineestimator)
 * [`t2n`](#t2n)
@@ -110,7 +111,16 @@ Activate the mixed precision mode, only for Pascal GPU architecture or superior.
 
 * **activate** : *bool, optional.*
   True for activate and Flase for deactivate. True by default.
-
+
+### `activateLUT`
+
+Activate the lookup table for the sinc computations.
+
+#### Parameters
+
+* **activate** : *bool, optional.*
+  True to activate, False to deactivate. True by default.
+
 ### `beta_SabineEstimation`
 
 Estimation of the reflection coefficients needed to have the desired reverberation time.
diff --git a/examples/example.py b/examples/example.py
index 8b7a297..5b2aaa9 100755
--- a/examples/example.py
+++ b/examples/example.py
@@ -10,6 +10,7 @@
 
 import gpuRIR
 gpuRIR.activateMixedPrecision(False)
+gpuRIR.activateLUT(True)
 
 room_sz = [3,3,2.5] # Size of the room [m]
 nb_src = 2 # Number of sources
diff --git a/examples/time_vs_T60.py b/examples/time_vs_T60.py
index 5c1eaa7..2f06592 100644
--- a/examples/time_vs_T60.py
+++ b/examples/time_vs_T60.py
@@ -10,6 +10,7 @@
 
 import gpuRIR
 gpuRIR.activateMixedPrecision(False)
+gpuRIR.activateLUT(False)
 
 T60_vec = np.arange(0.1, 2.2, 0.2) # Reverberation times to measure
 nb_test_per_point = 10 # Number of simulations per T60 to average the runtime
diff --git a/examples/time_vs_nbRIRs.py b/examples/time_vs_nbRIRs.py
index 7b950ab..c43f96f 100644
--- a/examples/time_vs_nbRIRs.py
+++ b/examples/time_vs_nbRIRs.py
@@ -10,6 +10,7 @@
 
 import gpuRIR
 gpuRIR.activateMixedPrecision(False)
+gpuRIR.activateLUT(False)
 
 nb_src_vec = np.concatenate([2**np.arange(12), [4094]]) # Number of RIRs to measure
 nb_test_per_point = 10 # Number of simulations per T60 to average the runtime
diff --git a/gpuRIR/__init__.py b/gpuRIR/__init__.py
index cc5a2da..d2616bb 100644
--- a/gpuRIR/__init__.py
+++ b/gpuRIR/__init__.py
@@ -8,7 +8,7 @@
 
 from gpuRIR_bind import gpuRIR_bind
 
-__all__ = ["mic_patterns", "beta_SabineEstimation", "att2t_SabineEstimator", "t2n", "simulateRIR", "simulateTrajectory", "activate_mixed_precision"]
+__all__ = ["mic_patterns", "beta_SabineEstimation", "att2t_SabineEstimator", "t2n", "simulateRIR", "simulateTrajectory", "activate_mixed_precision", "activate_lut"]
 
 mic_patterns = {
 	"omni": 0,
@@ -212,5 +212,16 @@ def activateMixedPrecision(activate=True):
 	'''
 	gpuRIR_bind_simulator.activate_mixed_precision_bind(activate)
 
+def activateLUT(activate=True):
+	''' Activate the lookup table for the sinc computations.
+
+	Parameters
+	----------
+	activate : bool, optional
+		True to activate, False to deactivate. True by default.
+ + ''' + gpuRIR_bind_simulator.activate_lut_bind(activate) + # Create the simulator object when the module is loaded gpuRIR_bind_simulator = gpuRIR_bind() diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu index 332233f..779dc8c 100644 --- a/src/gpuRIR_cuda.cu +++ b/src/gpuRIR_cuda.cu @@ -23,6 +23,7 @@ static const int nThreadsISM_z = 4; // RIR computation static const int initialReductionMin = 512; +static const int lut_oversamp = 16; static const int nThreadsGen_t = 32; static const int nThreadsGen_m = 4; static const int nThreadsGen_n = 1; // Don't change it @@ -106,32 +107,32 @@ inline unsigned int pow2roundup (unsigned int x) { /* Auxiliar device functions */ /*****************************/ -__device__ __forceinline__ scalar_t hanning_window(scalar_t t, scalar_t Tw) { +__device__ __forceinline__ float hanning_window(float t, float Tw) { return 0.5f * (1.0f + __cosf(2.0f*PI*t/Tw)); } -__device__ __forceinline__ scalar_t sinc(scalar_t x) { +__device__ __forceinline__ float sinc(float x) { return (x==0)? 1 : sinf(x)/x; } -__device__ __forceinline__ scalar_t image_sample(scalar_t amp, scalar_t tau, scalar_t t, scalar_t Tw) { - scalar_t t_tau = t - tau; +__device__ __forceinline__ float image_sample(float amp, float tau, float t, float Tw) { + float t_tau = t - tau; return (abs(t_tau)= 530 __device__ __forceinline__ half2 h2abs(half2 x) { @@ -216,9 +218,9 @@ __device__ __forceinline__ half2 my_h2sinc(half2 x) { } -__device__ __forceinline__ half2 image_sample_mp(half2 amp, scalar_t tau, scalar_t t1, scalar_t t2, scalar_t Tw_2, half2 Tw_inv) { - scalar_t t1_tau = t1-tau; - scalar_t t2_tau = t2-tau; +__device__ __forceinline__ half2 image_sample_mp(half2 amp, float tau, float t1, float t2, float Tw_2, half2 Tw_inv) { + float t1_tau = t1-tau; + float t2_tau = t2-tau; half2 t_tau = __floats2half2_rn(t1_tau, t2_tau); if (abs(t1_tau)(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f; +} + /***********/ /* KERNELS */ /***********/ -__global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, - scalar_t* g_tau /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, - scalar_t* g_tau_dp /*[M_src]M_rcv]*/, - scalar_t* g_pos_src/*[M_src][3]*/, scalar_t* g_pos_rcv/*[M_rcv][3]*/, scalar_t* g_orV_rcv/*[M_rcv][3]*/, - micPattern mic_pattern, scalar_t room_sz_x, scalar_t room_sz_y, scalar_t room_sz_z, - scalar_t beta_x1, scalar_t beta_x2, scalar_t beta_y1, scalar_t beta_y2, scalar_t beta_z1, scalar_t beta_z2, +__global__ void calcAmpTau_kernel(float* g_amp /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, + float* g_tau /*[M_src]M_rcv][nb_img_x][nb_img_y][nb_img_z]*/, + float* g_tau_dp /*[M_src]M_rcv]*/, + float* g_pos_src/*[M_src][3]*/, float* g_pos_rcv/*[M_rcv][3]*/, float* g_orV_rcv/*[M_rcv][3]*/, + micPattern mic_pattern, float room_sz_x, float room_sz_y, float room_sz_z, + float beta_x1, float beta_x2, float beta_y1, float beta_y2, float beta_z1, float beta_z2, int nb_img_x, int nb_img_y, int nb_img_z, - int M_src, int M_rcv, scalar_t c, scalar_t Fs) { + int M_src, int M_rcv, float c, float Fs) { - extern __shared__ scalar_t sdata[]; + extern __shared__ float sdata[]; int n[3]; n[0] = blockIdx.x * blockDim.x + threadIdx.x; @@ -252,12 +263,12 @@ __global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src]M_rcv][nb_img_x][nb_i N[1] = nb_img_y; N[2] = nb_img_z; - scalar_t room_sz[3]; + float room_sz[3]; room_sz[0] = room_sz_x; room_sz[1] = room_sz_y; room_sz[2] = room_sz_z; - scalar_t beta[6]; + float beta[6]; beta[0] = - beta_x1; beta[1] = - beta_x2; beta[2] = - 
beta_y1; @@ -269,7 +280,7 @@ __global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src]M_rcv][nb_img_x][nb_i int n_idx = n[0]*N[1]*N[2] + n[1]*N[2] + n[2]; // Copy g_pos_src to shared memory - scalar_t* sh_pos_src = (scalar_t*) sdata; + float* sh_pos_src = (float*) sdata; if (threadIdx.y==0 && threadIdx.z==0) { for (int m=threadIdx.x; m= 530 int t = blockIdx.x * blockDim.x + threadIdx.x; @@ -463,8 +474,8 @@ __global__ void generateRIR_mp_kernel(half2* initialRIR, scalar_t* amp, scalar_t if (m= 530 int t = blockIdx.x * blockDim.x + threadIdx.x; int m = blockIdx.y * blockDim.y + threadIdx.y; @@ -516,11 +527,70 @@ __global__ void h2RIR_to_floatRIR_kernel(half2* h2RIR, scalar_t* floatRIR, int M #endif } +/************************/ +/* Lookup table KERNELS */ +/************************/ + +__global__ void generateRIR_kernel_lut(float* initialRIR, float* amp, float* tau, int T, int M, int N, int iniRIR_N, int ini_red, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) { + int t = blockIdx.x * blockDim.x + threadIdx.x; + int m = blockIdx.y * blockDim.y + threadIdx.y; + int n_ini = blockIdx.z * ini_red; + int n_max = fminf(n_ini + ini_red, N); + + if (m 1e9) initialReduction *= 2; @@ -528,23 +598,39 @@ void gpuRIR_cuda::cuda_rirGenerator(scalar_t* rir, scalar_t* amp, scalar_t* tau, dim3 threadsPerBlockIni(nThreadsGen_t, nThreadsGen_m, nThreadsGen_n); dim3 numBlocksIni(ceil((float)T/threadsPerBlockIni.x), ceil((float)M/threadsPerBlockIni.y), iniRIR_N); - scalar_t* initialRIR; - gpuErrchk( cudaMalloc(&initialRIR, M*T*iniRIR_N*sizeof(scalar_t)) ); + float* initialRIR; + gpuErrchk( cudaMalloc(&initialRIR, M*T*iniRIR_N*sizeof(float)) ); - scalar_t Tw = 8e-3f * Fs; // Window duration [samples] - generateRIR_kernel<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw ); - gpuErrchk( cudaDeviceSynchronize() ); - gpuErrchk( cudaPeekAtLastError() ); + int Tw = (int) round(8e-3f * Fs); // Window duration [samples] + + if (lookup_table) { + int lut_len = Tw * lut_oversamp; + lut_len += ((lut_len%2)? 
0 : 1); // Must be odd + cudaArray* cuArrayLut; + cudaTextureObject_t sinc_lut = create_sinc_texture_lut(&cuArrayLut, Tw, lut_len); + + generateRIR_kernel_lut<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw/2, sinc_lut, lut_len/2+0.5 ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + + cudaDestroyTextureObject(sinc_lut); + cudaFreeArray(cuArrayLut); + } else { + generateRIR_kernel<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw ); + gpuErrchk( cudaDeviceSynchronize() ); + gpuErrchk( cudaPeekAtLastError() ); + } + dim3 threadsPerBlockRed(nThreadsRed, 1, 1); - scalar_t* intermediateRIR; + float* intermediateRIR; int intRIR_N; while (iniRIR_N > 2*nThreadsRed) { intRIR_N = ceil((float)iniRIR_N / (2*nThreadsRed)); - gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T * M * sizeof(scalar_t)) ); + gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T * M * sizeof(float)) ); dim3 numBlocksRed(intRIR_N, T, M); - reduceRIR_kernel<<>>( + reduceRIR_kernel<<>>( initialRIR, intermediateRIR, M, T, iniRIR_N, intRIR_N); gpuErrchk( cudaDeviceSynchronize() ); gpuErrchk( cudaPeekAtLastError() ); @@ -555,14 +641,14 @@ void gpuRIR_cuda::cuda_rirGenerator(scalar_t* rir, scalar_t* amp, scalar_t* tau, } dim3 numBlocksEnd(1, T, M); - reduceRIR_kernel<<>>( + reduceRIR_kernel<<>>( initialRIR, rir, M, T, iniRIR_N, 1); gpuErrchk( cudaDeviceSynchronize() ); gpuErrchk( cudaPeekAtLastError() ); gpuErrchk( cudaFree(initialRIR) ); } -void cuda_rirGenerator_mp(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, int N, int T, scalar_t Fs) { +void cuda_rirGenerator_mp(float* rir, float* amp, float* tau, int M, int N, int T, float Fs) { if (cuda_arch >= 530) { int initialReduction = initialReductionMin; while (M * T/2 * ceil((float)N/initialReduction) > 1e9) initialReduction *= 2; @@ -574,11 +660,11 @@ void cuda_rirGenerator_mp(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, in half2* initialRIR; gpuErrchk( cudaMalloc(&initialRIR, M*(T/2)*iniRIR_N*sizeof(half2)) ); - scalar_t Tw_2 = 8e-3f * Fs / 2; + float Tw_2 = 8e-3f * Fs / 2; #if CUDART_VERSION < 9020 // For CUDA versions older than 9.2 it is nos possible to call from host code __float2half2_rn, // but doing it in the kernel is slower - scalar_t Tw_inv = 1.0f / (8e-3f * Fs); + float Tw_inv = 1.0f / (8e-3f * Fs); #else half2 Tw_inv = __float2half2_rn(1.0f / (8e-3f * Fs)); #endif @@ -622,8 +708,8 @@ void cuda_rirGenerator_mp(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, in } } -int gpuRIR_cuda::PadData(scalar_t *signal, scalar_t **padded_signal, int segment_len, - scalar_t *RIR, scalar_t **padded_RIR, int M_src, int M_rcv, int RIR_len) { +int gpuRIR_cuda::PadData(float *signal, float **padded_signal, int segment_len, + float *RIR, float **padded_RIR, int M_src, int M_rcv, int RIR_len) { int N_fft = pow2roundup(segment_len + RIR_len - 1); @@ -652,35 +738,35 @@ int gpuRIR_cuda::PadData(scalar_t *signal, scalar_t **padded_signal, int segment /* Principal functions */ /***********************/ -scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], scalar_t* h_pos_src, int M_src, - scalar_t* h_pos_rcv, scalar_t* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3], - scalar_t Tdiff, scalar_t Tmax, scalar_t Fs, scalar_t c) { - // function scalar_t* cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], scalar_t* h_pos_src, int M_src, - // scalar_t* h_pos_rcv, scalar_t* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3], - // scalar_t Tdiff, scalar_t Tmax, 
scalar_t Fs, scalar_t c);
+float* gpuRIR_cuda::cuda_simulateRIR(float room_sz[3], float beta[6], float* h_pos_src, int M_src,
+		float* h_pos_rcv, float* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3],
+		float Tdiff, float Tmax, float Fs, float c) {
+	// function float* cuda_simulateRIR(float room_sz[3], float beta[6], float* h_pos_src, int M_src,
+	//		float* h_pos_rcv, float* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3],
+	//		float Tdiff, float Tmax, float Fs, float c);
 	// Input parameters:
-	//	scalar_t room_sz[3] : Size of the room [m]
-	//	scalar_t beta[6] : Reflection coefficients [beta_x1 beta_x2 beta_y1 beta_y2 beta_z1 beta_z2]
-	//	scalar_t* h_pos_src : M_src x 3 matrix with the positions of the sources [m]
+	//	float room_sz[3] : Size of the room [m]
+	//	float beta[6] : Reflection coefficients [beta_x1 beta_x2 beta_y1 beta_y2 beta_z1 beta_z2]
+	//	float* h_pos_src : M_src x 3 matrix with the positions of the sources [m]
 	//	int M_src : Number of sources
-	//	scalar_t* h_pos_rcv : M_rcv x 3 matrix with the positions of the receivers [m]
-	//	scalar_t* h_orV_rcv : M_rcv x 3 matrix with vectors pointing in the same direction than the receivers
+	//	float* h_pos_rcv : M_rcv x 3 matrix with the positions of the receivers [m]
+	//	float* h_orV_rcv : M_rcv x 3 matrix with vectors pointing in the same direction as the receivers
 	//	micPattern mic_pattern : Polar pattern of the receivers (see gpuRIR_cuda.h)
 	//	int M_rcv : Number of receivers
 	//	int nb_img[3] : Number of sources in each dimension
-	//	scalar_t Tdiff : Time when the ISM is replaced by a diffusse reverberation model [s]
-	//	scalar_t Tmax : RIRs length [s]
-	//	scalar_t Fs : Sampling frequency [Hz]
-	//	scalar_t c : Speed of sound [m/s]
+	//	float Tdiff : Time when the ISM is replaced by a diffuse reverberation model [s]
+	//	float Tmax : RIRs length [s]
+	//	float Fs : Sampling frequency [Hz]
+	//	float c : Speed of sound [m/s]
 
 	// Copy host memory to GPU
-	scalar_t *pos_src, *pos_rcv, *orV_rcv;
-	gpuErrchk( cudaMalloc(&pos_src, M_src*3*sizeof(scalar_t)) );
-	gpuErrchk( cudaMalloc(&pos_rcv, M_rcv*3*sizeof(scalar_t)) );
-	gpuErrchk( cudaMalloc(&orV_rcv, M_rcv*3*sizeof(scalar_t)) );
-	gpuErrchk( cudaMemcpy(pos_src, h_pos_src, M_src*3*sizeof(scalar_t), cudaMemcpyHostToDevice ) );
-	gpuErrchk( cudaMemcpy(pos_rcv, h_pos_rcv, M_rcv*3*sizeof(scalar_t), cudaMemcpyHostToDevice ) );
-	gpuErrchk( cudaMemcpy(orV_rcv, h_orV_rcv, M_rcv*3*sizeof(scalar_t), cudaMemcpyHostToDevice ) );
+	float *pos_src, *pos_rcv, *orV_rcv;
+	gpuErrchk( cudaMalloc(&pos_src, M_src*3*sizeof(float)) );
+	gpuErrchk( cudaMalloc(&pos_rcv, M_rcv*3*sizeof(float)) );
+	gpuErrchk( cudaMalloc(&orV_rcv, M_rcv*3*sizeof(float)) );
+	gpuErrchk( cudaMemcpy(pos_src, h_pos_src, M_src*3*sizeof(float), cudaMemcpyHostToDevice ) );
+	gpuErrchk( cudaMemcpy(pos_rcv, h_pos_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice ) );
+	gpuErrchk( cudaMemcpy(orV_rcv, h_orV_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice ) );
 
 	// Use the ISM to calculate the amplitude and delay of each image
@@ -688,14 +774,14 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
 	dim3 numBlocksISM(ceil((float)nb_img[0] / nThreadsISM_x), ceil((float)nb_img[1] / nThreadsISM_y), ceil((float)nb_img[2] / nThreadsISM_z));
-	int shMemISM = (M_src + 2*M_rcv) * 3 * sizeof(scalar_t);
+	int shMemISM = (M_src + 2*M_rcv) * 3 * sizeof(float);
 
-	scalar_t* amp; // Amplitude with which the signals from each image source of each source arrive to each receiver
-	gpuErrchk( cudaMalloc(&amp, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(scalar_t)) );
-	scalar_t* tau; // Delay with which the signals from each image source of each source arrive to each receiver
-	gpuErrchk( cudaMalloc(&tau, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(scalar_t)) );
-	scalar_t* tau_dp; // Direct path delay
-	gpuErrchk( cudaMalloc(&tau_dp, M_src*M_rcv*sizeof(scalar_t)) );
+	float* amp; // Amplitude with which the signals from each image source of each source arrive to each receiver
+	gpuErrchk( cudaMalloc(&amp, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(float)) );
+	float* tau; // Delay with which the signals from each image source of each source arrive to each receiver
+	gpuErrchk( cudaMalloc(&tau, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(float)) );
+	float* tau_dp; // Direct path delay
+	gpuErrchk( cudaMalloc(&tau_dp, M_src*M_rcv*sizeof(float)) );
 
 	calcAmpTau_kernel<<<numBlocksISM, threadsPerBlockISM, shMemISM>>> (
 			amp, tau, tau_dp,
@@ -717,8 +803,8 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
 	// Compute the RIRs as a sum of sincs
 	int M = M_src * M_rcv;
 	int N = nb_img[0] * nb_img[1] * nb_img[2];
-	scalar_t* rirISM;
-	gpuErrchk( cudaMalloc(&rirISM, M*nSamplesISM*sizeof(scalar_t)) );
+	float* rirISM;
+	gpuErrchk( cudaMalloc(&rirISM, M*nSamplesISM*sizeof(float)) );
 	if (mixed_precision) {
 		if (cuda_arch >= 530) {
 			cuda_rirGenerator_mp(rirISM, amp, tau, M, N, nSamplesISM, Fs);
@@ -734,10 +820,10 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
 	dim3 numBlocksEnvPred(ceil((float)M_src / nThreadsEnvPred_x), ceil((float)M_rcv / nThreadsEnvPred_y), 1);
 
-	scalar_t* A; // pow_env = A * exp(alpha * (t-tau_dp))
-	gpuErrchk( cudaMalloc(&A, M_src*M_rcv*sizeof(scalar_t)) );
-	scalar_t* alpha;
-	gpuErrchk( cudaMalloc(&alpha, M_src*M_rcv*sizeof(scalar_t)) );
+	float* A; // pow_env = A * exp(alpha * (t-tau_dp))
+	gpuErrchk( cudaMalloc(&A, M_src*M_rcv*sizeof(float)) );
+	float* alpha;
+	gpuErrchk( cudaMalloc(&alpha, M_src*M_rcv*sizeof(float)) );
 
 	envPred_kernel<<<numBlocksEnvPred, threadsPerBlockEnvPred>>>(
 			A, alpha, rirISM, tau_dp, M_src, M_rcv, nSamplesISM, Fs,
@@ -746,8 +832,8 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
 	gpuErrchk( cudaPeekAtLastError() );
 
 	// Generate diffuse reverberation
-	scalar_t* rirDiff;
-	gpuErrchk( cudaMalloc(&rirDiff, M_src*M_rcv*nSamplesDiff*sizeof(scalar_t)) );
+	float* rirDiff;
+	gpuErrchk( cudaMalloc(&rirDiff, M_src*M_rcv*nSamplesDiff*sizeof(float)) );
 
 	if (nSamplesDiff != 0) {
 		// Fill rirDiff with random numbers with uniform distribution
@@ -766,21 +852,21 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
 	}
 
 	// Copy GPU memory to host
-	int rirSizeISM = M_src * M_rcv * nSamplesISM * sizeof(scalar_t);
-	int rirSizeDiff = M_src * M_rcv * nSamplesDiff * sizeof(scalar_t);
-	scalar_t* h_rir = (scalar_t*) malloc(rirSizeISM+rirSizeDiff);
+	int rirSizeISM = M_src * M_rcv * nSamplesISM * sizeof(float);
+	int rirSizeDiff = M_src * M_rcv * nSamplesDiff * sizeof(float);
+	float* h_rir = (float*) malloc(rirSizeISM+rirSizeDiff);
 
 	cudaPitchedPtr h_rir_pitchedPtr = make_cudaPitchedPtr( (void*) h_rir,
-		(nSamplesISM+nSamplesDiff)*sizeof(scalar_t), nSamplesISM+nSamplesDiff, M_rcv );
+		(nSamplesISM+nSamplesDiff)*sizeof(float), nSamplesISM+nSamplesDiff, M_rcv );
 	cudaPitchedPtr rirISM_pitchedPtr = make_cudaPitchedPtr( (void*) rirISM,
-		nSamplesISM*sizeof(scalar_t), nSamplesISM, M_rcv );
+		nSamplesISM*sizeof(float), nSamplesISM, M_rcv );
 	cudaPitchedPtr rirDiff_pitchedPtr = make_cudaPitchedPtr( (void*)
rirDiff, - nSamplesDiff*sizeof(scalar_t), nSamplesDiff, M_rcv ); + nSamplesDiff*sizeof(float), nSamplesDiff, M_rcv ); cudaMemcpy3DParms parmsISM = {0}; parmsISM.srcPtr = rirISM_pitchedPtr; parmsISM.dstPtr = h_rir_pitchedPtr; - parmsISM.extent = make_cudaExtent(nSamplesISM*sizeof(scalar_t), M_rcv, M_src); + parmsISM.extent = make_cudaExtent(nSamplesISM*sizeof(float), M_rcv, M_src); parmsISM.kind = cudaMemcpyDeviceToHost; gpuErrchk( cudaMemcpy3D(&parmsISM) ); @@ -788,8 +874,8 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s cudaMemcpy3DParms parmsDiff = {0}; parmsDiff.srcPtr = rirDiff_pitchedPtr; parmsDiff.dstPtr = h_rir_pitchedPtr; - parmsDiff.dstPos = make_cudaPos(nSamplesISM*sizeof(scalar_t), 0, 0); - parmsDiff.extent = make_cudaExtent(nSamplesDiff*sizeof(scalar_t), M_rcv, M_src); + parmsDiff.dstPos = make_cudaPos(nSamplesISM*sizeof(float), 0, 0); + parmsDiff.extent = make_cudaExtent(nSamplesDiff*sizeof(float), M_rcv, M_src); parmsDiff.kind = cudaMemcpyDeviceToHost; gpuErrchk( cudaMemcpy3D(&parmsDiff) ); } @@ -809,15 +895,15 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s return h_rir; } -scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, int segment_len, - scalar_t* RIR, int M_rcv, int RIR_len) { - // function scalar_t* cuda_filterRIR(scalar_t* source_segments, int M_src, int segments_len, - // scalar_t* RIR, int M_rcv, int RIR_len); +float* gpuRIR_cuda::cuda_convolutions(float* source_segments, int M_src, int segment_len, + float* RIR, int M_rcv, int RIR_len) { + // function float* cuda_filterRIR(float* source_segments, int M_src, int segments_len, + // float* RIR, int M_rcv, int RIR_len); // Input parameters: - // scalar_t* source_segments : Source signal segment for each trajectory point + // float* source_segments : Source signal segment for each trajectory point // int M_src : Number of trajectory points // int segment_len : Length of the segments [samples] - // scalar_t* RIR : 3D array with the RIR from each point of the trajectory to each receiver + // float* RIR : 3D array with the RIR from each point of the trajectory to each receiver // int M_rcv : Number of receivers // int RIR_len : Length of the RIRs [samples] @@ -825,31 +911,31 @@ scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, i int N_fft = pow2roundup(segment_len + RIR_len - 1); // Copy the signal segments with zero padding - int mem_size_signal = sizeof(scalar_t) * M_src * (N_fft+2); + int mem_size_signal = sizeof(float) * M_src * (N_fft+2); cufftComplex *d_signal; gpuErrchk( cudaMalloc((void **)&d_signal, mem_size_signal) ); - gpuErrchk( cudaMemcpy2D((void *)d_signal, (N_fft+2)*sizeof(scalar_t), - (void *)source_segments, segment_len*sizeof(scalar_t), - segment_len*sizeof(scalar_t), M_src, cudaMemcpyHostToDevice) ); - gpuErrchk( cudaMemset2D((void *)((scalar_t *)d_signal + segment_len), (N_fft+2)*sizeof(scalar_t), - 0, (N_fft+2-segment_len)*sizeof(scalar_t), M_src ) ); + gpuErrchk( cudaMemcpy2D((void *)d_signal, (N_fft+2)*sizeof(float), + (void *)source_segments, segment_len*sizeof(float), + segment_len*sizeof(float), M_src, cudaMemcpyHostToDevice) ); + gpuErrchk( cudaMemset2D((void *)((float *)d_signal + segment_len), (N_fft+2)*sizeof(float), + 0, (N_fft+2-segment_len)*sizeof(float), M_src ) ); // Copy the RIRs with zero padding cudaPitchedPtr h_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) RIR, - RIR_len*sizeof(scalar_t), RIR_len, M_rcv ); - int mem_size_RIR = sizeof(scalar_t) * M_src * 
M_rcv * (N_fft+2); + RIR_len*sizeof(float), RIR_len, M_rcv ); + int mem_size_RIR = sizeof(float) * M_src * M_rcv * (N_fft+2); cufftComplex *d_RIR; gpuErrchk( cudaMalloc((void **)&d_RIR, mem_size_RIR) ); cudaPitchedPtr d_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR, - (N_fft+2)*sizeof(scalar_t), (N_fft+2), M_rcv ); + (N_fft+2)*sizeof(float), (N_fft+2), M_rcv ); cudaMemcpy3DParms parmsCopySignal = {0}; parmsCopySignal.srcPtr = h_RIR_pitchedPtr; parmsCopySignal.dstPtr = d_RIR_pitchedPtr; - parmsCopySignal.extent = make_cudaExtent(RIR_len*sizeof(scalar_t), M_rcv, M_src); + parmsCopySignal.extent = make_cudaExtent(RIR_len*sizeof(float), M_rcv, M_src); parmsCopySignal.kind = cudaMemcpyHostToDevice; gpuErrchk( cudaMemcpy3D(&parmsCopySignal) ); - gpuErrchk( cudaMemset2D((void *)((scalar_t *)d_RIR + RIR_len), (N_fft+2)*sizeof(scalar_t), - 0, (N_fft+2-RIR_len)*sizeof(scalar_t), M_rcv*M_src ) ); + gpuErrchk( cudaMemset2D((void *)((float *)d_RIR + RIR_len), (N_fft+2)*sizeof(float), + 0, (N_fft+2-RIR_len)*sizeof(float), M_rcv*M_src ) ); // CUFFT plans cufftHandle plan_signal, plan_RIR, plan_RIR_inv; @@ -877,15 +963,15 @@ scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, i // Copy device memory to host int conv_len = segment_len + RIR_len - 1; - scalar_t *convolved_segments = (scalar_t *)malloc(sizeof(scalar_t)*M_src*M_rcv*conv_len); + float *convolved_segments = (float *)malloc(sizeof(float)*M_src*M_rcv*conv_len); cudaPitchedPtr d_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR, - (N_fft+2)*sizeof(scalar_t), conv_len, M_rcv ); + (N_fft+2)*sizeof(float), conv_len, M_rcv ); cudaPitchedPtr h_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) convolved_segments, - conv_len*sizeof(scalar_t), conv_len, M_rcv ); + conv_len*sizeof(float), conv_len, M_rcv ); cudaMemcpy3DParms parmsCopy = {0}; parmsCopy.srcPtr = d_convolved_segments_pitchedPtr; parmsCopy.dstPtr = h_convolved_segments_pitchedPtr; - parmsCopy.extent = make_cudaExtent(conv_len*sizeof(scalar_t), M_rcv, M_src); + parmsCopy.extent = make_cudaExtent(conv_len*sizeof(float), M_rcv, M_src); parmsCopy.kind = cudaMemcpyDeviceToHost; gpuErrchk( cudaMemcpy3D(&parmsCopy) ); @@ -901,18 +987,19 @@ scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, i return convolved_segments; } -gpuRIR_cuda::gpuRIR_cuda(bool mPrecision) { +gpuRIR_cuda::gpuRIR_cuda(bool mPrecision, bool lut) { // Get CUDA architecture cudaDeviceProp prop; cudaGetDeviceProperties(&prop, 0); cuda_arch = prop.major*100 + prop.minor*10; - // Activate mixed precision if selected + // Activate mixed precision and lut if selected activate_mixed_precision(mPrecision); + activate_lut(lut); // Initiate CUDA runtime API - scalar_t* memPtr_warmup; - gpuErrchk( cudaMalloc(&memPtr_warmup, 1*sizeof(scalar_t)) ); + float* memPtr_warmup; + gpuErrchk( cudaMalloc(&memPtr_warmup, 1*sizeof(float)) ); gpuErrchk( cudaFree(memPtr_warmup) ); // Initiate cuFFT library @@ -927,6 +1014,11 @@ gpuRIR_cuda::gpuRIR_cuda(bool mPrecision) { bool gpuRIR_cuda::activate_mixed_precision(bool activate) { if (cuda_arch >= 530) { + if (activate && lookup_table) { + printf("The mixed precision implementation is not compatible with the lookup table.\n"); + printf("Disabling the lookup table.\n"); + lookup_table = false; + } mixed_precision = activate; } else { if (activate) printf("This feature requires Pascal GPU architecture or higher.\n"); @@ -934,3 +1026,13 @@ bool gpuRIR_cuda::activate_mixed_precision(bool activate) { } return 
mixed_precision; } + +bool gpuRIR_cuda::activate_lut(bool activate) { + if (activate && mixed_precision) { + printf("The lookup table is not compatible with the mixed precision implementation.\n"); + printf("Disabling the mixed precision implementation.\n"); + mixed_precision = false; + } + lookup_table = activate; + return lookup_table; +} diff --git a/src/gpuRIR_cuda.h b/src/gpuRIR_cuda.h index 82cb232..21b7f6c 100644 --- a/src/gpuRIR_cuda.h +++ b/src/gpuRIR_cuda.h @@ -1,8 +1,4 @@ - -typedef float scalar_t; -//typedef float2 Complex; - // Accepted polar patterns for the receivers: typedef int micPattern; #define DIR_OMNI 0 @@ -19,11 +15,12 @@ struct cuRandGeneratorWrapper_t; class gpuRIR_cuda { public: - gpuRIR_cuda(bool); + gpuRIR_cuda(bool, bool); - scalar_t* cuda_simulateRIR(scalar_t[3], scalar_t[6], scalar_t*, int, scalar_t*, scalar_t*, micPattern, int, int[3], scalar_t, scalar_t, scalar_t, scalar_t); - scalar_t* cuda_convolutions(scalar_t*, int, int,scalar_t*, int, int); + float* cuda_simulateRIR(float[3], float[6], float*, int, float*, float*, micPattern, int, int[3], float, float, float, float); + float* cuda_convolutions(float*, int, int, float*, int, int); bool activate_mixed_precision(bool); + bool activate_lut(bool); private: // cuRAND generator @@ -31,8 +28,11 @@ class gpuRIR_cuda { // Mixed precision flag bool mixed_precision; + + // Lookup table flag + bool lookup_table; // Auxiliar host functions - void cuda_rirGenerator(scalar_t*, scalar_t*, scalar_t*, int, int, int, scalar_t); - int PadData(scalar_t*, scalar_t**, int, scalar_t*, scalar_t**, int, int, int); + void cuda_rirGenerator(float*, float*, float*, int, int, int, float); + int PadData(float*, float**, int, float*, float**, int, int, int); }; \ No newline at end of file diff --git a/src/python_bind.cpp b/src/python_bind.cpp index 159b274..0135885 100644 --- a/src/python_bind.cpp +++ b/src/python_bind.cpp @@ -12,29 +12,31 @@ namespace py = pybind11; class gpuRIR_bind { public: - gpuRIR_bind(bool mPrecision=false) : mixed_precision(mPrecision), gpuRIR_cuda_simulator(mPrecision) {}; + gpuRIR_bind(bool mPrecision=false, bool lut=true) : mixed_precision(mPrecision), lookup_table(lut), gpuRIR_cuda_simulator(mPrecision, lut) {}; - py::array simulateRIR_bind(std::vector, std::vector, py::array_t, py::array_t, py::array_t, micPattern, std::vector ,scalar_t, scalar_t, scalar_t, scalar_t); - py::array gpu_conv(py::array_t, py::array_t); + py::array simulateRIR_bind(std::vector, std::vector, py::array_t, py::array_t, py::array_t, micPattern, std::vector ,float, float, float, float); + py::array gpu_conv(py::array_t, py::array_t); bool activate_mixed_precision_bind(bool); + bool activate_lut_bind(bool); bool mixed_precision; + bool lookup_table; private: gpuRIR_cuda gpuRIR_cuda_simulator; }; -py::array gpuRIR_bind::simulateRIR_bind(std::vector room_sz, // Size of the room [m] - std::vector beta, // Reflection coefficients - py::array_t pos_src, // positions of the sources [m] - py::array_t pos_rcv, // positions of the receivers [m] - py::array_t orV_rcv, // orientation of the receivers +py::array gpuRIR_bind::simulateRIR_bind(std::vector room_sz, // Size of the room [m] + std::vector beta, // Reflection coefficients + py::array_t pos_src, // positions of the sources [m] + py::array_t pos_rcv, // positions of the receivers [m] + py::array_t orV_rcv, // orientation of the receivers micPattern mic_pattern, // Polar pattern of the receivers (see gpuRIR_cuda.h) std::vector nb_img, // Number of sources in each dimension - scalar_t 
Tdiff, // Time when the ISM is replaced by a diffusse reverberation model [s] - scalar_t Tmax, // RIRs length [s] - scalar_t Fs, // Sampling frequency [Hz] - scalar_t c=343.0 // Speed of sound [m/s] + float Tdiff, // Time when the ISM is replaced by a diffusse reverberation model [s] + float Tmax, // RIRs length [s] + float Fs, // Sampling frequency [Hz] + float c=343.0 // Speed of sound [m/s] ) { py::buffer_info info_pos_src = pos_src.request(); @@ -55,26 +57,26 @@ py::array gpuRIR_bind::simulateRIR_bind(std::vector room_sz, // Size o int M_src = info_pos_src.shape[0]; int M_rcv = info_pos_rcv.shape[0]; - scalar_t* rir = gpuRIR_cuda_simulator.cuda_simulateRIR(&room_sz[0], &beta[0], - (scalar_t*) info_pos_src.ptr, M_src, - (scalar_t*) info_pos_rcv.ptr, (scalar_t*) info_orV_rcv.ptr, mic_pattern, M_rcv, + float* rir = gpuRIR_cuda_simulator.cuda_simulateRIR(&room_sz[0], &beta[0], + (float*) info_pos_src.ptr, M_src, + (float*) info_pos_rcv.ptr, (float*) info_orV_rcv.ptr, mic_pattern, M_rcv, &nb_img[0], Tdiff, Tmax, Fs, c); py::capsule free_when_done(rir, [](void *f) { - scalar_t *foo = reinterpret_cast(f); + float *foo = reinterpret_cast(f); delete[] foo; }); int nSamples = ceil(Tmax*Fs); nSamples += nSamples%2; // nSamples must be even std::vector shape = {M_src, M_rcv, nSamples}; - std::vector strides = {M_rcv*nSamples*sizeof(scalar_t), nSamples*sizeof(scalar_t), sizeof(scalar_t)}; - return py::array_t(shape, strides, rir, free_when_done); + std::vector strides = {M_rcv*nSamples*sizeof(float), nSamples*sizeof(float), sizeof(float)}; + return py::array_t(shape, strides, rir, free_when_done); } -py::array gpuRIR_bind::gpu_conv(py::array_t source_segments, // Source signal segment for each trajectory point - py::array_t RIR // 3D array with the RIR from each point of the trajectory to each receiver +py::array gpuRIR_bind::gpu_conv(py::array_t source_segments, // Source signal segment for each trajectory point + py::array_t RIR // 3D array with the RIR from each point of the trajectory to each receiver ) { py::buffer_info info_source_segments = source_segments.request(); @@ -89,18 +91,18 @@ py::array gpuRIR_bind::gpu_conv(py::array_t source int M_rcv = info_RIR.shape[1]; int RIR_len = info_RIR.shape[2]; - scalar_t* convolution = gpuRIR_cuda_simulator.cuda_convolutions((scalar_t*)info_source_segments.ptr, M_src, segment_len, - (scalar_t*)info_RIR.ptr, M_rcv, RIR_len); + float* convolution = gpuRIR_cuda_simulator.cuda_convolutions((float*)info_source_segments.ptr, M_src, segment_len, + (float*)info_RIR.ptr, M_rcv, RIR_len); py::capsule free_when_done(convolution, [](void *f) { - scalar_t *foo = reinterpret_cast(f); + float *foo = reinterpret_cast(f); delete[] foo; }); int nSamples = segment_len+RIR_len-1; std::vector shape = {M_src, M_rcv, nSamples}; - std::vector strides = {M_rcv*nSamples*sizeof(scalar_t), nSamples*sizeof(scalar_t), sizeof(scalar_t)}; - return py::array_t(shape, strides, convolution, free_when_done); + std::vector strides = {M_rcv*nSamples*sizeof(float), nSamples*sizeof(float), sizeof(float)}; + return py::array_t(shape, strides, convolution, free_when_done); } @@ -108,6 +110,10 @@ bool gpuRIR_bind::activate_mixed_precision_bind(bool activate) { gpuRIR_cuda_simulator.activate_mixed_precision(activate); } +bool gpuRIR_bind::activate_lut_bind(bool activate) { + gpuRIR_cuda_simulator.activate_lut(activate); +} + PYBIND11_MODULE(gpuRIR_bind,m) { @@ -120,6 +126,6 @@ PYBIND11_MODULE(gpuRIR_bind,m) py::arg("Fs"), py::arg("c")=343.0f ) .def("gpu_conv", &gpuRIR_bind::gpu_conv, "Batched 
convolution using FFTs in GPU", py::arg("source_segments"), py::arg("RIR")) .def("activate_mixed_precision_bind", &gpuRIR_bind::activate_mixed_precision_bind, "Activate the mixed precision mode, only for Pascal GPU architecture or superior", - py::arg("activate")); - + py::arg("activate")) + .def("activate_lut_bind", &gpuRIR_bind::activate_lut_bind, "Activate the lookup table", py::arg("activate")); }
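For reference, here is a minimal usage sketch of the new `activateLUT` switch, following the pattern of `examples/example.py` as modified above. The helper calls (`beta_SabineEstimation`, `att2t_SabineEstimator`, `t2n`, `simulateRIR`) and their argument order are taken from the existing README, which this patch only touches in part, and the room, source and receiver values are illustrative only:

```python
import numpy as np
import gpuRIR

gpuRIR.activateMixedPrecision(False)  # LUT and mixed precision are mutually exclusive
gpuRIR.activateLUT(True)              # sinc lookup table (enabled by default)

room_sz = [3, 3, 2.5]                  # Size of the room [m]
pos_src = np.array([[1.0, 1.5, 1.7]])  # Source position [m]
pos_rcv = np.array([[2.0, 1.5, 1.2]])  # Receiver position [m]
T60 = 0.7                              # Reverberation time [s]
fs = 16000.0                           # Sampling frequency [Hz]

beta = gpuRIR.beta_SabineEstimation(room_sz, T60)  # Reflection coefficients
Tmax = gpuRIR.att2t_SabineEstimator(60.0, T60)     # RIR length for 60 dB of attenuation [s]
nb_img = gpuRIR.t2n(Tmax, room_sz)                 # Number of image sources per dimension
RIRs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs)
print(RIRs.shape)  # (number of sources, number of receivers, number of samples)
```

Enabling the lookup table while mixed precision is active disables mixed precision, and vice versa, mirroring the checks in `activate_mixed_precision` and `activate_lut` above.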
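On the CUDA side, the patch replaces the per-image-source sinc evaluation with a fetch from a texture holding an oversampled table (`lut_oversamp = 16` entries per sample period, odd length, indexed with `__fmaf_rz(t_tau, lut_oversamp, lut_center)`). The body of `create_sinc_texture_lut` is not shown in this excerpt, so the NumPy sketch below only illustrates the assumed idea: the table presumably stores the Hanning-windowed sinc (note that `image_sample_lut` no longer calls `hanning_window`), while the exact sinc scaling and the texture filtering mode are not visible here:

```python
import numpy as np

def build_sinc_lut(Tw, oversamp=16):
    """Tabulate a Hanning-windowed sinc over the Tw-sample window (assumed table contents)."""
    lut_len = Tw * oversamp
    lut_len += 0 if lut_len % 2 else 1         # must be odd, so the table has a centre entry
    lut_center = lut_len // 2
    t = (np.arange(lut_len) - lut_center) / oversamp     # time offset in samples
    window = 0.5 * (1.0 + np.cos(2.0 * np.pi * t / Tw))  # hanning_window() from the kernels
    return window * np.sinc(t), lut_center               # np.sinc(x) = sin(pi*x)/(pi*x)

def image_sample_lut(amp, tau, t, Tw, lut, lut_center, oversamp=16):
    """CPU analogue of the device function: one table lookup instead of a sinc evaluation."""
    t_tau = t - tau
    if abs(t_tau) >= Tw / 2:
        return 0.0
    idx = int(round(t_tau * oversamp + lut_center))       # same multiply-add indexing
    return amp * lut[idx]

Fs = 16000.0
Tw = int(round(8e-3 * Fs))        # window length [samples], as in cuda_rirGenerator
lut, center = build_sinc_lut(Tw)
print(image_sample_lut(1.0, 10.25, 10.0, Tw, lut, center))
```

With 16 entries per sample period the nearest-entry error stays small, and the table for an 8 ms window at 16 kHz is only a few thousand floats, which should fit comfortably in the GPU's texture cache.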