From 492b2f5a40ebbc6895d707ae3e51da81e954ec41 Mon Sep 17 00:00:00 2001
From: DavidDiazGuerra
Date: Mon, 7 Oct 2019 09:15:53 +0200
Subject: [PATCH] Replace scalar_t by hardcoded float since most parts of the
 code wouldn't work with doubles

---
 src/gpuRIR_cuda.cu  | 296 ++++++++++++++++++++++----------------
 src/gpuRIR_cuda.h   |  12 +-
 src/python_bind.cpp |  48 +++----
 3 files changed, 176 insertions(+), 180 deletions(-)

diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu
index 0cafdf5..a15328b 100644
--- a/src/gpuRIR_cuda.cu
+++ b/src/gpuRIR_cuda.cu
@@ -107,32 +107,32 @@ inline unsigned int pow2roundup (unsigned int x) {
 /* Auxiliar device functions */
 /*****************************/
 
-__device__ __forceinline__ scalar_t hanning_window(scalar_t t, scalar_t Tw) {
+__device__ __forceinline__ float hanning_window(float t, float Tw) {
     return 0.5f * (1.0f + __cosf(2.0f*PI*t/Tw));
 }
 
-__device__ __forceinline__ scalar_t sinc(scalar_t x) {
+__device__ __forceinline__ float sinc(float x) {
     return (x==0)? 1 : sinf(x)/x;
 }
 
-__device__ __forceinline__ scalar_t image_sample(scalar_t amp, scalar_t tau, scalar_t t, scalar_t Tw) {
-    scalar_t t_tau = t - tau;
+__device__ __forceinline__ float image_sample(float amp, float tau, float t, float Tw) {
+    float t_tau = t - tau;
     return (abs(t_tau)<Tw/2)? hanning_window(t_tau, Tw) * amp * sinc( (t_tau) * (2*PI*Fc) ) : 0.0f;
 }
 
-__device__ __forceinline__ scalar_t image_sample_lut(scalar_t amp, scalar_t tau, scalar_t t, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) {
-    scalar_t t_tau = t - tau;
-    return (abs(t_tau)<Tw_2)? amp * tex1D<scalar_t>(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f;
+__device__ __forceinline__ float image_sample_lut(float amp, float tau, float t, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) {
+    float t_tau = t - tau;
+    return (abs(t_tau)<Tw_2)? amp * tex1D<float>(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f;
 }
 
 /***********/
 /* KERNELS */
 /***********/
 
-__global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src][M_rcv][nb_img_x][nb_img_y][nb_img_z]*/,
-                                  scalar_t* g_tau /*[M_src][M_rcv][nb_img_x][nb_img_y][nb_img_z]*/,
-                                  scalar_t* g_tau_dp /*[M_src][M_rcv]*/,
-                                  scalar_t* g_pos_src/*[M_src][3]*/, scalar_t* g_pos_rcv/*[M_rcv][3]*/, scalar_t* g_orV_rcv/*[M_rcv][3]*/,
-                                  micPattern mic_pattern, scalar_t room_sz_x, scalar_t room_sz_y, scalar_t room_sz_z,
-                                  scalar_t beta_x1, scalar_t beta_x2, scalar_t beta_y1, scalar_t beta_y2, scalar_t beta_z1, scalar_t beta_z2,
+__global__ void calcAmpTau_kernel(float* g_amp /*[M_src][M_rcv][nb_img_x][nb_img_y][nb_img_z]*/,
+                                  float* g_tau /*[M_src][M_rcv][nb_img_x][nb_img_y][nb_img_z]*/,
+                                  float* g_tau_dp /*[M_src][M_rcv]*/,
+                                  float* g_pos_src/*[M_src][3]*/, float* g_pos_rcv/*[M_rcv][3]*/, float* g_orV_rcv/*[M_rcv][3]*/,
+                                  micPattern mic_pattern, float room_sz_x, float room_sz_y, float room_sz_z,
+                                  float beta_x1, float beta_x2, float beta_y1, float beta_y2, float beta_z1, float beta_z2,
                                   int nb_img_x, int nb_img_y, int nb_img_z,
-                                  int M_src, int M_rcv, scalar_t c, scalar_t Fs) {
+                                  int M_src, int M_rcv, float c, float Fs) {
 
-    extern __shared__ scalar_t sdata[];
+    extern __shared__ float sdata[];
 
     int n[3];
     n[0] = blockIdx.x * blockDim.x + threadIdx.x;
@@ -262,12 +262,12 @@ __global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src][M_rcv][nb_i
     N[1] = nb_img_y;
     N[2] = nb_img_z;
 
-    scalar_t room_sz[3];
+    float room_sz[3];
     room_sz[0] = room_sz_x;
    room_sz[1] = room_sz_y;
     room_sz[2] = room_sz_z;
 
-    scalar_t beta[6];
+    float beta[6];
     beta[0] = - beta_x1;
     beta[1] = - beta_x2;
     beta[2] = - beta_y1;
@@ -279,7 +279,7 @@ __global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src][M_rcv][nb_i
     int n_idx = n[0]*N[1]*N[2] + n[1]*N[2] + n[2];
 
     // Copy g_pos_src to shared memory
-    scalar_t* sh_pos_src = (scalar_t*) sdata;
+    float* sh_pos_src = (float*) sdata;
     if (threadIdx.y==0 && threadIdx.z==0)  {
         for (int m=threadIdx.x; m<M_src; m+=blockDim.x) {
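The device helpers above are where a scalar_t=double build breaks first: __cosf is a single-precision hardware intrinsic, __fmaf_rz takes float arguments, and CUDA 1D textures have no double channel format, so tex1D<double> cannot replace tex1D<float>. A minimal self-contained sketch of the float-only pattern (illustrative only, not part of the patch; the file and kernel names are made up):

    // float_only_demo.cu -- sketch of the float-only intrinsic usage above
    #include <cstdio>

    #define PI 3.141592654f

    __device__ __forceinline__ float hanning_window_demo(float t, float Tw) {
        return 0.5f * (1.0f + __cosf(2.0f*PI*t/Tw));  // __cosf: float-only intrinsic
    }

    __global__ void window_kernel(float* out, float Tw) {
        int i = threadIdx.x;
        out[i] = hanning_window_demo(i - 4.0f, Tw);   // window centered on sample 4
    }

    int main() {
        float h_out[8], *d_out;
        cudaMalloc(&d_out, 8*sizeof(float));
        window_kernel<<<1, 8>>>(d_out, 8.0f);         // 8-sample Hann window
        cudaMemcpy(h_out, d_out, 8*sizeof(float), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 8; i++) printf("%f\n", h_out[i]);
        cudaFree(d_out);
        return 0;
    }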
@@ -473,8 +473,8 @@ __global__ void generateRIR_mp_kernel(half2* initialRIR, scalar_t* amp, scalar_t
 #if __CUDA_ARCH__ >= 530
     int t = blockIdx.x * blockDim.x + threadIdx.x;
     int m = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (m<M && t<T) {
@@ -530,14 +530,14 @@ __global__ void h2RIR_to_floatRIR_kernel(half2* h2RIR, scalar_t* floatRIR, int M
 /* Lookup table KERNELS */
 /************************/
 
-__global__ void generateRIR_kernel_lut(scalar_t* initialRIR, scalar_t* amp, scalar_t* tau, int T, int M, int N, int iniRIR_N, int ini_red, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) {
+__global__ void generateRIR_kernel_lut(float* initialRIR, float* amp, float* tau, int T, int M, int N, int iniRIR_N, int ini_red, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) {
     int t = blockIdx.x * blockDim.x + threadIdx.x;
     int m = blockIdx.y * blockDim.y + threadIdx.y;
 
     int n_ini = blockIdx.z * ini_red;
     int n_max = fminf(n_ini + ini_red, N);
 
     if (m<M && t<T) {
-        scalar_t loc_sum = 0;
+        float loc_sum = 0;
         for (int n=n_ini; n<n_max; n++) loc_sum += image_sample_lut(amp[m*N+n], tau[m*N+n], t, Tw_2, sinc_lut, lut_center);
 
-void gpuRIR_cuda::cuda_rirGenerator(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, int N, int T, scalar_t Fs) {
+void gpuRIR_cuda::cuda_rirGenerator(float* rir, float* amp, float* tau, int M, int N, int T, float Fs) {
     int initialReduction = initialReductionMin;
     while (M * T * ceil((float)N/initialReduction) > 1e9) initialReduction *= 2;
@@ -597,8 +597,8 @@ void gpuRIR_cuda::cuda_rirGenerator(scalar_t* rir, scalar_t* amp, scalar_t* tau,
     dim3 threadsPerBlockIni(nThreadsGen_t, nThreadsGen_m, nThreadsGen_n);
     dim3 numBlocksIni(ceil((float)T/threadsPerBlockIni.x), ceil((float)M/threadsPerBlockIni.y), iniRIR_N);
 
-    scalar_t* initialRIR;
-    gpuErrchk( cudaMalloc(&initialRIR, M*T*iniRIR_N*sizeof(scalar_t)) );
+    float* initialRIR;
+    gpuErrchk( cudaMalloc(&initialRIR, M*T*iniRIR_N*sizeof(float)) );
 
     int Tw = (int) round(8e-3f * Fs); // Window duration [samples]
 
@@ -622,14 +622,14 @@ void gpuRIR_cuda::cuda_rirGenerator(scalar_t* rir, scalar_t* amp, scalar_t* tau,
 
     dim3 threadsPerBlockRed(nThreadsRed, 1, 1);
 
-    scalar_t* intermediateRIR;
+    float* intermediateRIR;
     int intRIR_N;
     while (iniRIR_N > 2*nThreadsRed) {
         intRIR_N = ceil((float)iniRIR_N / (2*nThreadsRed));
-        gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T * M * sizeof(scalar_t)) );
+        gpuErrchk( cudaMalloc(&intermediateRIR, intRIR_N * T * M * sizeof(float)) );
 
         dim3 numBlocksRed(intRIR_N, T, M);
-        reduceRIR_kernel<<<numBlocksRed, threadsPerBlockRed, nThreadsRed*sizeof(scalar_t)>>>(
+        reduceRIR_kernel<<<numBlocksRed, threadsPerBlockRed, nThreadsRed*sizeof(float)>>>(
             initialRIR, intermediateRIR, M, T, iniRIR_N, intRIR_N);
         gpuErrchk( cudaDeviceSynchronize() );
         gpuErrchk( cudaPeekAtLastError() );
@@ -640,14 +640,14 @@ void gpuRIR_cuda::cuda_rirGenerator(scalar_t* rir, scalar_t* amp, scalar_t* tau,
     }
 
     dim3 numBlocksEnd(1, T, M);
-    reduceRIR_kernel<<<numBlocksEnd, threadsPerBlockRed, nThreadsRed*sizeof(scalar_t)>>>(
+    reduceRIR_kernel<<<numBlocksEnd, threadsPerBlockRed, nThreadsRed*sizeof(float)>>>(
         initialRIR, rir, M, T, iniRIR_N, 1);
     gpuErrchk( cudaDeviceSynchronize() );
     gpuErrchk( cudaPeekAtLastError() );
 
     gpuErrchk( cudaFree(initialRIR) );
 }
 
-void cuda_rirGenerator_mp(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, int N, int T, scalar_t Fs) {
+void cuda_rirGenerator_mp(float* rir, float* amp, float* tau, int M, int N, int T, float Fs) {
     if (cuda_arch >= 530) {
         int initialReduction = initialReductionMin;
         while (M * T/2 * ceil((float)N/initialReduction) > 1e9) initialReduction *= 2;
@@ -659,11 +659,11 @@ void cuda_rirGenerator_mp(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, in
         half2* initialRIR;
         gpuErrchk( cudaMalloc(&initialRIR, M*(T/2)*iniRIR_N*sizeof(half2)) );
 
-        scalar_t Tw_2 = 8e-3f * Fs / 2;
+        float Tw_2 = 8e-3f * Fs / 2;
 #if CUDART_VERSION < 9020
         // For CUDA versions older than 9.2 it is not possible to call from host code __float2half2_rn,
         // but doing it in the kernel is slower
-        scalar_t Tw_inv = 1.0f / (8e-3f * Fs);
+        float Tw_inv = 1.0f / (8e-3f * Fs);
 #else
         half2 Tw_inv = __float2half2_rn(1.0f / (8e-3f * Fs));
 #endif
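The mixed-precision path above packs two RIR samples into one half2 register, halving memory traffic; half2 arithmetic requires compute capability 5.3 or newer (hence the cuda_arch and __CUDA_ARCH__ >= 530 guards), and its conversions go through float intrinsics such as __float2half2_rn, which is host-callable only from CUDA 9.2 on, as the CUDART_VERSION branch above notes. A small sketch of that idiom (illustrative names, not from the patch; assumes CUDA >= 9.2 and sm_53+):

    // half2_demo.cu -- two float lanes per half2 register
    #include <cuda_fp16.h>
    #include <cstdio>

    __global__ void fma_half2_kernel(float2* out, half2 scale) {
    #if __CUDA_ARCH__ >= 530
        half2 x = __float2half2_rn(1.5f);  // broadcast 1.5f into both half lanes
        half2 y = __hfma2(scale, x, x);    // y = scale*x + x, two samples at once
        out[0] = __half22float2(y);        // unpack to float for host readback
    #endif
    }

    int main() {
        float2 *d_out, h_out;
        cudaMalloc(&d_out, sizeof(float2));
        // __float2half2_rn on the host is exactly the CUDA >= 9.2 feature
        // that the CUDART_VERSION branch above works around.
        fma_half2_kernel<<<1, 1>>>(d_out, __float2half2_rn(2.0f));
        cudaMemcpy(&h_out, d_out, sizeof(float2), cudaMemcpyDeviceToHost);
        printf("%f %f\n", h_out.x, h_out.y);  // expected: 4.5 4.5
        cudaFree(d_out);
        return 0;
    }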
@@ -707,8 +707,8 @@ void cuda_rirGenerator_mp(scalar_t* rir, scalar_t* amp, scalar_t* tau, int M, in
     }
 }
 
-int gpuRIR_cuda::PadData(scalar_t *signal, scalar_t **padded_signal, int segment_len,
-                         scalar_t *RIR, scalar_t **padded_RIR, int M_src, int M_rcv, int RIR_len) {
+int gpuRIR_cuda::PadData(float *signal, float **padded_signal, int segment_len,
+                         float *RIR, float **padded_RIR, int M_src, int M_rcv, int RIR_len) {
 
     int N_fft = pow2roundup(segment_len + RIR_len - 1);
 
@@ -737,35 +737,35 @@ int gpuRIR_cuda::PadData(scalar_t *signal, scalar_t **padded_signal, int segment
 /* Principal functions */
 /***********************/
 
-scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], scalar_t* h_pos_src, int M_src,
-                                        scalar_t* h_pos_rcv, scalar_t* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3],
-                                        scalar_t Tdiff, scalar_t Tmax, scalar_t Fs, scalar_t c) {
-    // function scalar_t* cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], scalar_t* h_pos_src, int M_src,
-    //          scalar_t* h_pos_rcv, scalar_t* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3],
-    //          scalar_t Tdiff, scalar_t Tmax, scalar_t Fs, scalar_t c);
+float* gpuRIR_cuda::cuda_simulateRIR(float room_sz[3], float beta[6], float* h_pos_src, int M_src,
+                                     float* h_pos_rcv, float* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3],
+                                     float Tdiff, float Tmax, float Fs, float c) {
+    // function float* cuda_simulateRIR(float room_sz[3], float beta[6], float* h_pos_src, int M_src,
+    //          float* h_pos_rcv, float* h_orV_rcv, micPattern mic_pattern, int M_rcv, int nb_img[3],
+    //          float Tdiff, float Tmax, float Fs, float c);
     // Input parameters:
-    //  scalar_t room_sz[3]     : Size of the room [m]
-    //  scalar_t beta[6]        : Reflection coefficients [beta_x1 beta_x2 beta_y1 beta_y2 beta_z1 beta_z2]
-    //  scalar_t* h_pos_src     : M_src x 3 matrix with the positions of the sources [m]
+    //  float room_sz[3]        : Size of the room [m]
+    //  float beta[6]           : Reflection coefficients [beta_x1 beta_x2 beta_y1 beta_y2 beta_z1 beta_z2]
+    //  float* h_pos_src        : M_src x 3 matrix with the positions of the sources [m]
     //  int M_src               : Number of sources
-    //  scalar_t* h_pos_rcv     : M_rcv x 3 matrix with the positions of the receivers [m]
-    //  scalar_t* h_orV_rcv     : M_rcv x 3 matrix with vectors pointing in the same direction as the receivers
+    //  float* h_pos_rcv        : M_rcv x 3 matrix with the positions of the receivers [m]
+    //  float* h_orV_rcv        : M_rcv x 3 matrix with vectors pointing in the same direction as the receivers
     //  micPattern mic_pattern  : Polar pattern of the receivers (see gpuRIR_cuda.h)
     //  int M_rcv               : Number of receivers
     //  int nb_img[3]           : Number of sources in each dimension
-    //  scalar_t Tdiff          : Time when the ISM is replaced by a diffuse reverberation model [s]
-    //  scalar_t Tmax           : RIRs length [s]
-    //  scalar_t Fs             : Sampling frequency [Hz]
-    //  scalar_t c              : Speed of sound [m/s]
+    //  float Tdiff             : Time when the ISM is replaced by a diffuse reverberation model [s]
+    //  float Tmax              : RIRs length [s]
+    //  float Fs                : Sampling frequency [Hz]
+    //  float c                 : Speed of sound [m/s]
 
     // Copy host memory to GPU
-    scalar_t *pos_src, *pos_rcv, *orV_rcv;
-    gpuErrchk( cudaMalloc(&pos_src, M_src*3*sizeof(scalar_t)) );
-    gpuErrchk( cudaMalloc(&pos_rcv, M_rcv*3*sizeof(scalar_t)) );
-    gpuErrchk( cudaMalloc(&orV_rcv, M_rcv*3*sizeof(scalar_t)) );
-    gpuErrchk( cudaMemcpy(pos_src, h_pos_src, M_src*3*sizeof(scalar_t), cudaMemcpyHostToDevice ) );
-    gpuErrchk( cudaMemcpy(pos_rcv, h_pos_rcv, M_rcv*3*sizeof(scalar_t), cudaMemcpyHostToDevice ) );
-    gpuErrchk( cudaMemcpy(orV_rcv, h_orV_rcv, M_rcv*3*sizeof(scalar_t), cudaMemcpyHostToDevice ) );
+    float *pos_src, *pos_rcv, *orV_rcv;
+    gpuErrchk( cudaMalloc(&pos_src, M_src*3*sizeof(float)) );
+    gpuErrchk( cudaMalloc(&pos_rcv, M_rcv*3*sizeof(float)) );
+    gpuErrchk( cudaMalloc(&orV_rcv, M_rcv*3*sizeof(float)) );
+    gpuErrchk( cudaMemcpy(pos_src, h_pos_src, M_src*3*sizeof(float), cudaMemcpyHostToDevice ) );
+    gpuErrchk( cudaMemcpy(pos_rcv, h_pos_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice ) );
+    gpuErrchk( cudaMemcpy(orV_rcv, h_orV_rcv, M_rcv*3*sizeof(float), cudaMemcpyHostToDevice ) );
 
     // Use the ISM to calculate the amplitude and delay of each image
@@ -773,14 +773,14 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
                       ceil((float)nb_img[1] / nThreadsISM_y),
                       ceil((float)nb_img[2] / nThreadsISM_z));
-    int shMemISM = (M_src + 2*M_rcv) * 3 * sizeof(scalar_t);
+    int shMemISM = (M_src + 2*M_rcv) * 3 * sizeof(float);
 
-    scalar_t* amp; // Amplitude with which the signals from each image source of each source arrive to each receiver
-    gpuErrchk( cudaMalloc(&amp, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(scalar_t)) );
-    scalar_t* tau; // Delay with which the signals from each image source of each source arrive to each receiver
-    gpuErrchk( cudaMalloc(&tau, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(scalar_t)) );
-    scalar_t* tau_dp; // Direct path delay
-    gpuErrchk( cudaMalloc(&tau_dp, M_src*M_rcv*sizeof(scalar_t)) );
+    float* amp; // Amplitude with which the signals from each image source of each source arrive to each receiver
+    gpuErrchk( cudaMalloc(&amp, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(float)) );
+    float* tau; // Delay with which the signals from each image source of each source arrive to each receiver
+    gpuErrchk( cudaMalloc(&tau, M_src*M_rcv*nb_img[0]*nb_img[1]*nb_img[2]*sizeof(float)) );
+    float* tau_dp; // Direct path delay
+    gpuErrchk( cudaMalloc(&tau_dp, M_src*M_rcv*sizeof(float)) );
 
     calcAmpTau_kernel<<<numBlocksISM, threadsPerBlockISM, shMemISM>>> (
         amp, tau, tau_dp,
@@ -802,8 +802,8 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
     // Compute the RIRs as a sum of sincs
     int M = M_src * M_rcv;
     int N = nb_img[0] * nb_img[1] * nb_img[2];
-    scalar_t* rirISM;
-    gpuErrchk( cudaMalloc(&rirISM, M*nSamplesISM*sizeof(scalar_t)) );
+    float* rirISM;
+    gpuErrchk( cudaMalloc(&rirISM, M*nSamplesISM*sizeof(float)) );
     if (mixed_precision) {
         if (cuda_arch >= 530) {
             cuda_rirGenerator_mp(rirISM, amp, tau, M, N, nSamplesISM, Fs);
@@ -819,10 +819,10 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
     dim3 numBlocksEnvPred(ceil((float)M_src / nThreadsEnvPred_x),
                           ceil((float)M_rcv / nThreadsEnvPred_y), 1);
 
-    scalar_t* A; // pow_env = A * exp(alpha * (t-tau_dp))
-    gpuErrchk( cudaMalloc(&A, M_src*M_rcv*sizeof(scalar_t)) );
-    scalar_t* alpha;
-    gpuErrchk( cudaMalloc(&alpha, M_src*M_rcv*sizeof(scalar_t)) );
+    float* A; // pow_env = A * exp(alpha * (t-tau_dp))
+    gpuErrchk( cudaMalloc(&A, M_src*M_rcv*sizeof(float)) );
+    float* alpha;
+    gpuErrchk( cudaMalloc(&alpha, M_src*M_rcv*sizeof(float)) );
 
     envPred_kernel<<<numBlocksEnvPred, threadsPerBlockEnvPred>>>(
         A, alpha, rirISM, tau_dp,
         M_src, M_rcv, nSamplesISM, Fs,
@@ -831,8 +831,8 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
     gpuErrchk( cudaPeekAtLastError() );
 
     // Generate diffuse reverberation
-    scalar_t* rirDiff;
-    gpuErrchk( cudaMalloc(&rirDiff, M_src*M_rcv*nSamplesDiff*sizeof(scalar_t)) );
+    float* rirDiff;
+    gpuErrchk( cudaMalloc(&rirDiff, M_src*M_rcv*nSamplesDiff*sizeof(float)) );
 
     if (nSamplesDiff != 0) {
         // Fill rirDiff with random numbers with uniform distribution
@@ -851,21 +851,21 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
     }
 
     // Copy GPU memory to host
-    int rirSizeISM = M_src * M_rcv * nSamplesISM * sizeof(scalar_t);
-    int rirSizeDiff = M_src * M_rcv * nSamplesDiff * sizeof(scalar_t);
-    scalar_t* h_rir = (scalar_t*) malloc(rirSizeISM+rirSizeDiff);
+    int rirSizeISM = M_src * M_rcv * nSamplesISM * sizeof(float);
+    int rirSizeDiff = M_src * M_rcv * nSamplesDiff * sizeof(float);
+    float* h_rir = (float*) malloc(rirSizeISM+rirSizeDiff);
 
     cudaPitchedPtr h_rir_pitchedPtr = make_cudaPitchedPtr( (void*) h_rir,
-        (nSamplesISM+nSamplesDiff)*sizeof(scalar_t), nSamplesISM+nSamplesDiff, M_rcv );
+        (nSamplesISM+nSamplesDiff)*sizeof(float), nSamplesISM+nSamplesDiff, M_rcv );
     cudaPitchedPtr rirISM_pitchedPtr = make_cudaPitchedPtr( (void*) rirISM,
-        nSamplesISM*sizeof(scalar_t), nSamplesISM, M_rcv );
+        nSamplesISM*sizeof(float), nSamplesISM, M_rcv );
     cudaPitchedPtr rirDiff_pitchedPtr = make_cudaPitchedPtr( (void*) rirDiff,
-        nSamplesDiff*sizeof(scalar_t), nSamplesDiff, M_rcv );
+        nSamplesDiff*sizeof(float), nSamplesDiff, M_rcv );
 
     cudaMemcpy3DParms parmsISM = {0};
     parmsISM.srcPtr = rirISM_pitchedPtr;
     parmsISM.dstPtr = h_rir_pitchedPtr;
-    parmsISM.extent = make_cudaExtent(nSamplesISM*sizeof(scalar_t), M_rcv, M_src);
+    parmsISM.extent = make_cudaExtent(nSamplesISM*sizeof(float), M_rcv, M_src);
     parmsISM.kind = cudaMemcpyDeviceToHost;
     gpuErrchk( cudaMemcpy3D(&parmsISM) );
 
@@ -873,8 +873,8 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
         cudaMemcpy3DParms parmsDiff = {0};
         parmsDiff.srcPtr = rirDiff_pitchedPtr;
         parmsDiff.dstPtr = h_rir_pitchedPtr;
-        parmsDiff.dstPos = make_cudaPos(nSamplesISM*sizeof(scalar_t), 0, 0);
-        parmsDiff.extent = make_cudaExtent(nSamplesDiff*sizeof(scalar_t), M_rcv, M_src);
+        parmsDiff.dstPos = make_cudaPos(nSamplesISM*sizeof(float), 0, 0);
+        parmsDiff.extent = make_cudaExtent(nSamplesDiff*sizeof(float), M_rcv, M_src);
         parmsDiff.kind = cudaMemcpyDeviceToHost;
         gpuErrchk( cudaMemcpy3D(&parmsDiff) );
     }
@@ -894,15 +894,15 @@ scalar_t* gpuRIR_cuda::cuda_simulateRIR(scalar_t room_sz[3], scalar_t beta[6], s
     return h_rir;
 }
 
-scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, int segment_len,
-                                         scalar_t* RIR, int M_rcv, int RIR_len) {
-    // function scalar_t* cuda_filterRIR(scalar_t* source_segments, int M_src, int segments_len,
-    //              scalar_t* RIR, int M_rcv, int RIR_len);
+float* gpuRIR_cuda::cuda_convolutions(float* source_segments, int M_src, int segment_len,
+                                      float* RIR, int M_rcv, int RIR_len) {
+    // function float* cuda_filterRIR(float* source_segments, int M_src, int segments_len,
+    //              float* RIR, int M_rcv, int RIR_len);
     // Input parameters:
-    //  scalar_t* source_segments : Source signal segment for each trajectory point
+    //  float* source_segments : Source signal segment for each trajectory point
     //  int M_src : Number of trajectory points
     //  int segment_len : Length of the segments [samples]
-    //  scalar_t* RIR : 3D array with the RIR from each point of the trajectory to each receiver
+    //  float* RIR : 3D array with the RIR from each point of the trajectory to each receiver
     //  int M_rcv : Number of receivers
     //  int RIR_len : Length of the RIRs [samples]
 
@@ -910,31 +910,31 @@ scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, i
     int N_fft = pow2roundup(segment_len + RIR_len - 1);
 
     // Copy the signal segments with zero padding
-    int mem_size_signal = sizeof(scalar_t) * M_src * (N_fft+2);
+    int mem_size_signal = sizeof(float) * M_src * (N_fft+2);
     cufftComplex *d_signal;
     gpuErrchk( cudaMalloc((void **)&d_signal, mem_size_signal) );
-    gpuErrchk( cudaMemcpy2D((void *)d_signal, (N_fft+2)*sizeof(scalar_t),
-        (void *)source_segments, segment_len*sizeof(scalar_t),
-        segment_len*sizeof(scalar_t), M_src, cudaMemcpyHostToDevice) );
-    gpuErrchk( cudaMemset2D((void *)((scalar_t *)d_signal + segment_len), (N_fft+2)*sizeof(scalar_t),
-        0, (N_fft+2-segment_len)*sizeof(scalar_t), M_src ) );
+    gpuErrchk( cudaMemcpy2D((void *)d_signal, (N_fft+2)*sizeof(float),
+        (void *)source_segments, segment_len*sizeof(float),
+        segment_len*sizeof(float), M_src, cudaMemcpyHostToDevice) );
+    gpuErrchk( cudaMemset2D((void *)((float *)d_signal + segment_len), (N_fft+2)*sizeof(float),
+        0, (N_fft+2-segment_len)*sizeof(float), M_src ) );
 
     // Copy the RIRs with zero padding
     cudaPitchedPtr h_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) RIR,
-        RIR_len*sizeof(scalar_t), RIR_len, M_rcv );
-    int mem_size_RIR = sizeof(scalar_t) * M_src * M_rcv * (N_fft+2);
+        RIR_len*sizeof(float), RIR_len, M_rcv );
+    int mem_size_RIR = sizeof(float) * M_src * M_rcv * (N_fft+2);
     cufftComplex *d_RIR;
     gpuErrchk( cudaMalloc((void **)&d_RIR, mem_size_RIR) );
     cudaPitchedPtr d_RIR_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR,
-        (N_fft+2)*sizeof(scalar_t), (N_fft+2), M_rcv );
+        (N_fft+2)*sizeof(float), (N_fft+2), M_rcv );
 
     cudaMemcpy3DParms parmsCopySignal = {0};
     parmsCopySignal.srcPtr = h_RIR_pitchedPtr;
     parmsCopySignal.dstPtr = d_RIR_pitchedPtr;
-    parmsCopySignal.extent = make_cudaExtent(RIR_len*sizeof(scalar_t), M_rcv, M_src);
+    parmsCopySignal.extent = make_cudaExtent(RIR_len*sizeof(float), M_rcv, M_src);
     parmsCopySignal.kind = cudaMemcpyHostToDevice;
     gpuErrchk( cudaMemcpy3D(&parmsCopySignal) );
 
-    gpuErrchk( cudaMemset2D((void *)((scalar_t *)d_RIR + RIR_len), (N_fft+2)*sizeof(scalar_t),
-        0, (N_fft+2-RIR_len)*sizeof(scalar_t), M_rcv*M_src ) );
+    gpuErrchk( cudaMemset2D((void *)((float *)d_RIR + RIR_len), (N_fft+2)*sizeof(float),
+        0, (N_fft+2-RIR_len)*sizeof(float), M_rcv*M_src ) );
 
     // CUFFT plans
     cufftHandle plan_signal, plan_RIR, plan_RIR_inv;
@@ -962,15 +962,15 @@ scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, i
 
     // Copy device memory to host
     int conv_len = segment_len + RIR_len - 1;
-    scalar_t *convolved_segments = (scalar_t *)malloc(sizeof(scalar_t)*M_src*M_rcv*conv_len);
+    float *convolved_segments = (float *)malloc(sizeof(float)*M_src*M_rcv*conv_len);
     cudaPitchedPtr d_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) d_RIR,
-        (N_fft+2)*sizeof(scalar_t), conv_len, M_rcv );
+        (N_fft+2)*sizeof(float), conv_len, M_rcv );
     cudaPitchedPtr h_convolved_segments_pitchedPtr = make_cudaPitchedPtr( (void*) convolved_segments,
-        conv_len*sizeof(scalar_t), conv_len, M_rcv );
+        conv_len*sizeof(float), conv_len, M_rcv );
 
     cudaMemcpy3DParms parmsCopy = {0};
     parmsCopy.srcPtr = d_convolved_segments_pitchedPtr;
     parmsCopy.dstPtr = h_convolved_segments_pitchedPtr;
-    parmsCopy.extent = make_cudaExtent(conv_len*sizeof(scalar_t), M_rcv, M_src);
+    parmsCopy.extent = make_cudaExtent(conv_len*sizeof(float), M_rcv, M_src);
     parmsCopy.kind = cudaMemcpyDeviceToHost;
     gpuErrchk( cudaMemcpy3D(&parmsCopy) );
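The convolution code above is also tied to single precision through cuFFT: the padded buffers are typed cufftComplex, cuFFT's single-precision complex type, and the plans (unchanged by this patch, so not shown in these hunks) are real-to-complex/complex-to-real in float; a double build would instead need cufftDoubleComplex and the D2Z/Z2D plan and exec variants. A minimal sketch of the single-precision R2C pattern (illustrative, not from the patch; build with -lcufft):

    // cufft_float_demo.cu -- single-precision real-to-complex transform
    #include <cufft.h>

    int main() {
        const int N_fft = 16;                      // transform length (power of 2)
        cufftReal* d_signal;
        cufftComplex* d_spectrum;                  // N_fft/2+1 bins for R2C
        cudaMalloc(&d_signal, N_fft*sizeof(cufftReal));
        cudaMalloc(&d_spectrum, (N_fft/2+1)*sizeof(cufftComplex));
        cudaMemset(d_signal, 0, N_fft*sizeof(cufftReal));

        cufftHandle plan;
        cufftPlan1d(&plan, N_fft, CUFFT_R2C, 1);   // float; double would be CUFFT_D2Z
        cufftExecR2C(plan, d_signal, d_spectrum);  // double would be cufftExecD2Z

        cufftDestroy(plan);
        cudaFree(d_signal);
        cudaFree(d_spectrum);
        return 0;
    }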
@@ -997,8 +997,8 @@ gpuRIR_cuda::gpuRIR_cuda(bool mPrecision, bool lut) {
     activate_lut(lut);
 
     // Initiate CUDA runtime API
-    scalar_t* memPtr_warmup;
-    gpuErrchk( cudaMalloc(&memPtr_warmup, 1*sizeof(scalar_t)) );
+    float* memPtr_warmup;
+    gpuErrchk( cudaMalloc(&memPtr_warmup, 1*sizeof(float)) );
     gpuErrchk( cudaFree(memPtr_warmup) );
 
     // Initiate cuFFT library
diff --git a/src/gpuRIR_cuda.h b/src/gpuRIR_cuda.h
index baa73b7..21b7f6c 100644
--- a/src/gpuRIR_cuda.h
+++ b/src/gpuRIR_cuda.h
@@ -1,8 +1,4 @@
-
-typedef float scalar_t;
-//typedef float2 Complex;
-
 // Accepted polar patterns for the receivers:
 typedef int micPattern;
 #define DIR_OMNI 0
@@ -21,8 +17,8 @@ class gpuRIR_cuda {
     public:
         gpuRIR_cuda(bool, bool);
 
-        scalar_t* cuda_simulateRIR(scalar_t[3], scalar_t[6], scalar_t*, int, scalar_t*, scalar_t*, micPattern, int, int[3], scalar_t, scalar_t, scalar_t, scalar_t);
-        scalar_t* cuda_convolutions(scalar_t*, int, int,scalar_t*, int, int);
+        float* cuda_simulateRIR(float[3], float[6], float*, int, float*, float*, micPattern, int, int[3], float, float, float, float);
+        float* cuda_convolutions(float*, int, int, float*, int, int);
 
         bool activate_mixed_precision(bool);
         bool activate_lut(bool);
@@ -37,6 +33,6 @@ class gpuRIR_cuda {
         bool lookup_table;
 
         // Auxiliar host functions
-        void cuda_rirGenerator(scalar_t*, scalar_t*, scalar_t*, int, int, int, scalar_t);
-        int PadData(scalar_t*, scalar_t**, int, scalar_t*, scalar_t**, int, int, int);
+        void cuda_rirGenerator(float*, float*, float*, int, int, int, float);
+        int PadData(float*, float**, int, float*, float**, int, int, int);
 };
\ No newline at end of file
diff --git a/src/python_bind.cpp b/src/python_bind.cpp
index 71b0098..0135885 100644
--- a/src/python_bind.cpp
+++ b/src/python_bind.cpp
@@ -14,8 +14,8 @@ class gpuRIR_bind {
     public:
         gpuRIR_bind(bool mPrecision=false, bool lut=true) : mixed_precision(mPrecision), lookup_table(lut), gpuRIR_cuda_simulator(mPrecision, lut) {};
 
-        py::array simulateRIR_bind(std::vector<scalar_t>, std::vector<scalar_t>, py::array_t<scalar_t>, py::array_t<scalar_t>, py::array_t<scalar_t>, micPattern, std::vector<int> ,scalar_t, scalar_t, scalar_t, scalar_t);
-        py::array gpu_conv(py::array_t<scalar_t>, py::array_t<scalar_t>);
+        py::array simulateRIR_bind(std::vector<float>, std::vector<float>, py::array_t<float>, py::array_t<float>, py::array_t<float>, micPattern, std::vector<int> ,float, float, float, float);
+        py::array gpu_conv(py::array_t<float>, py::array_t<float>);
 
         bool activate_mixed_precision_bind(bool);
         bool activate_lut_bind(bool);
@@ -26,17 +26,17 @@ class gpuRIR_bind {
         gpuRIR_cuda gpuRIR_cuda_simulator;
 };
 
-py::array gpuRIR_bind::simulateRIR_bind(std::vector<scalar_t> room_sz, // Size of the room [m]
-                                        std::vector<scalar_t> beta, // Reflection coefficients
-                                        py::array_t<scalar_t> pos_src, // positions of the sources [m]
-                                        py::array_t<scalar_t> pos_rcv, // positions of the receivers [m]
-                                        py::array_t<scalar_t> orV_rcv, // orientation of the receivers
+py::array gpuRIR_bind::simulateRIR_bind(std::vector<float> room_sz, // Size of the room [m]
+                                        std::vector<float> beta, // Reflection coefficients
+                                        py::array_t<float> pos_src, // positions of the sources [m]
+                                        py::array_t<float> pos_rcv, // positions of the receivers [m]
+                                        py::array_t<float> orV_rcv, // orientation of the receivers
                                         micPattern mic_pattern, // Polar pattern of the receivers (see gpuRIR_cuda.h)
                                         std::vector<int> nb_img, // Number of sources in each dimension
-                                        scalar_t Tdiff, // Time when the ISM is replaced by a diffuse reverberation model [s]
-                                        scalar_t Tmax, // RIRs length [s]
-                                        scalar_t Fs, // Sampling frequency [Hz]
-                                        scalar_t c=343.0 // Speed of sound [m/s]
+                                        float Tdiff, // Time when the ISM is replaced by a diffuse reverberation model [s]
+                                        float Tmax, // RIRs length [s]
+                                        float Fs, // Sampling frequency [Hz]
+                                        float c=343.0 // Speed of sound [m/s]
                                         ) {
 
     py::buffer_info info_pos_src = pos_src.request();
@@ -57,26 +57,26 @@ py::array gpuRIR_bind::simulateRIR_bind(std::vector<scalar_t> room_sz, // Size o
     int M_src = info_pos_src.shape[0];
     int M_rcv = info_pos_rcv.shape[0];
 
-    scalar_t* rir = gpuRIR_cuda_simulator.cuda_simulateRIR(&room_sz[0], &beta[0],
-                            (scalar_t*) info_pos_src.ptr, M_src,
-                            (scalar_t*) info_pos_rcv.ptr, (scalar_t*) info_orV_rcv.ptr, mic_pattern, M_rcv,
+    float* rir = gpuRIR_cuda_simulator.cuda_simulateRIR(&room_sz[0], &beta[0],
+                            (float*) info_pos_src.ptr, M_src,
+                            (float*) info_pos_rcv.ptr, (float*) info_orV_rcv.ptr, mic_pattern, M_rcv,
                             &nb_img[0], Tdiff, Tmax, Fs, c);
 
     py::capsule free_when_done(rir, [](void *f) {
-        scalar_t *foo = reinterpret_cast<scalar_t *>(f);
+        float *foo = reinterpret_cast<float *>(f);
         delete[] foo;
     });
 
     int nSamples = ceil(Tmax*Fs);
     nSamples += nSamples%2; // nSamples must be even
     std::vector<int> shape = {M_src, M_rcv, nSamples};
-    std::vector<size_t> strides = {M_rcv*nSamples*sizeof(scalar_t), nSamples*sizeof(scalar_t), sizeof(scalar_t)};
-    return py::array_t<scalar_t>(shape, strides, rir, free_when_done);
+    std::vector<size_t> strides = {M_rcv*nSamples*sizeof(float), nSamples*sizeof(float), sizeof(float)};
+    return py::array_t<float>(shape, strides, rir, free_when_done);
 }
 
-py::array gpuRIR_bind::gpu_conv(py::array_t<scalar_t> source_segments, // Source signal segment for each trajectory point
-                                py::array_t<scalar_t> RIR // 3D array with the RIR from each point of the trajectory to each receiver
+py::array gpuRIR_bind::gpu_conv(py::array_t<float> source_segments, // Source signal segment for each trajectory point
+                                py::array_t<float> RIR // 3D array with the RIR from each point of the trajectory to each receiver
                                 ) {
 
     py::buffer_info info_source_segments = source_segments.request();
@@ -91,18 +91,18 @@ py::array gpuRIR_bind::gpu_conv(py::array_t<scalar_t> source
     int M_rcv = info_RIR.shape[1];
     int RIR_len = info_RIR.shape[2];
 
-    scalar_t* convolution = gpuRIR_cuda_simulator.cuda_convolutions((scalar_t*)info_source_segments.ptr, M_src, segment_len,
-                                    (scalar_t*)info_RIR.ptr, M_rcv, RIR_len);
+    float* convolution = gpuRIR_cuda_simulator.cuda_convolutions((float*)info_source_segments.ptr, M_src, segment_len,
+                                    (float*)info_RIR.ptr, M_rcv, RIR_len);
 
     py::capsule free_when_done(convolution, [](void *f) {
-        scalar_t *foo = reinterpret_cast<scalar_t *>(f);
+        float *foo = reinterpret_cast<float *>(f);
         delete[] foo;
     });
 
     int nSamples = segment_len+RIR_len-1;
     std::vector<int> shape = {M_src, M_rcv, nSamples};
-    std::vector<size_t> strides = {M_rcv*nSamples*sizeof(scalar_t), nSamples*sizeof(scalar_t), sizeof(scalar_t)};
-    return py::array_t<scalar_t>(shape, strides, convolution, free_when_done);
+    std::vector<size_t> strides = {M_rcv*nSamples*sizeof(float), nSamples*sizeof(float), sizeof(float)};
+    return py::array_t<float>(shape, strides, convolution, free_when_done);
 }
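For reference, a hypothetical host-side caller of the now float-only C++ API (not part of the patch; the values are arbitrary, and only the signature and parameter documentation shown above are assumed):

    // simulate_demo.cpp -- calling the float-only API after this change
    #include "gpuRIR_cuda.h"
    #include <cstdlib>

    int main() {
        float room_sz[3] = {3.0f, 4.0f, 2.5f};                 // room size [m]
        float beta[6] = {0.9f, 0.9f, 0.9f, 0.9f, 0.9f, 0.9f};  // reflection coefficients
        float pos_src[3] = {1.0f, 1.0f, 1.5f};                 // one source  (1 x 3)
        float pos_rcv[3] = {2.0f, 3.0f, 1.5f};                 // one receiver (1 x 3)
        float orV_rcv[3] = {1.0f, 0.0f, 0.0f};                 // receiver orientation
        int nb_img[3] = {8, 8, 8};                             // image sources per dimension

        gpuRIR_cuda simulator(false, true);  // no mixed precision, use the sinc LUT
        float* h_rir = simulator.cuda_simulateRIR(room_sz, beta, pos_src, 1,
                                                  pos_rcv, orV_rcv, DIR_OMNI, 1, nb_img,
                                                  0.1f /*Tdiff [s]*/, 0.2f /*Tmax [s]*/,
                                                  16000.0f /*Fs [Hz]*/, 343.0f /*c [m/s]*/);
        free(h_rir);  // cuda_simulateRIR allocates the host RIR buffer with malloc
        return 0;
    }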