diff --git a/README.md b/README.md
index a668644..a8e3170 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@
 - [Documentation](#documentation)
   * [`simulateRIR`](#simulaterir)
   * [`simulateTrajectory`](#simulatetrajectory)
+  * [`activateMixedPrecision`](#activatemixedprecision)
   * [`beta_SabineEstimation`](#beta_sabineestimation)
   * [`att2t_SabineEstimator`](#att2t_sabineestimator)
   * [`t2n`](#t2n)
@@ -101,6 +102,15 @@ Filter an audio signal by the RIRs of a motion trajectory recorded with a microp
 
 *2D ndarray* Matrix with the signals captured by each microphone in each column.
 
+### `activateMixedPrecision`
+
+Activate the mixed precision mode (only available for Pascal GPU architectures or later).
+
+#### Parameters
+
+* **activate** : *bool, optional.*
+  True to activate it and False to deactivate it. True by default.
+
 ### `beta_SabineEstimation`
 
 Estimation of the reflection coefficients needed to have the desired reverberation time.
diff --git a/examples/example.py b/examples/example.py
index a66232b..8b7a297 100755
--- a/examples/example.py
+++ b/examples/example.py
@@ -9,7 +9,7 @@ from math import ceil
 
 import gpuRIR
-gpuRIR.activate_mixed_precision(False)
+gpuRIR.activateMixedPrecision(False)
 
 room_sz = [3,3,2.5] # Size of the room [m]
 nb_src = 2 # Number of sources
@@ -32,3 +32,4 @@ t = np.arange(int(ceil(Tmax * fs))) / fs
 
 plt.plot(t, RIRs.reshape(nb_src*nb_rcv, -1).transpose())
+plt.show()
\ No newline at end of file
diff --git a/examples/simulate_trajectory.py b/examples/simulate_trajectory.py
index fee8e72..7977d4e 100644
--- a/examples/simulate_trajectory.py
+++ b/examples/simulate_trajectory.py
@@ -9,6 +9,7 @@ from scipy.io import wavfile
 
 import gpuRIR
+gpuRIR.activateMixedPrecision(False)
 
 fs, source_signal = wavfile.read('source_signal.wav')
 
@@ -33,3 +34,4 @@ filtered_signal = gpuRIR.simulateTrajectory(source_signal, RIRs)
 wavfile.write('filtered_signal.wav', fs, filtered_signal)
 
 plt.plot(filtered_signal)
+plt.show()
diff --git a/examples/time_vs_T60.py b/examples/time_vs_T60.py
index f5cb779..5c1eaa7 100644
--- a/examples/time_vs_T60.py
+++ b/examples/time_vs_T60.py
@@ -9,7 +9,7 @@ import time
 
 import gpuRIR
-gpuRIR.activate_mixed_precision(False)
+gpuRIR.activateMixedPrecision(False)
 
 T60_vec = np.arange(0.1, 2.2, 0.2) # Reverberation times to measure
 nb_test_per_point = 10 # Number of simulations per T60 to average the runtime
@@ -46,4 +46,5 @@ plt.semilogy(T60_vec, times)
 
 plt.ylabel("time [s]")
-plt.xlabel("T60 [s]")
\ No newline at end of file
+plt.xlabel("T60 [s]")
+plt.show()
\ No newline at end of file
diff --git a/examples/time_vs_nbRIRs.py b/examples/time_vs_nbRIRs.py
index 36ef471..7b950ab 100644
--- a/examples/time_vs_nbRIRs.py
+++ b/examples/time_vs_nbRIRs.py
@@ -9,7 +9,7 @@ import time
 
 import gpuRIR
-gpuRIR.activate_mixed_precision(False)
+gpuRIR.activateMixedPrecision(False)
 
 nb_src_vec = np.concatenate([2**np.arange(12), [4094]]) # Number of RIRs to measure
 nb_test_per_point = 10 # Number of simulations per T60 to average the runtime
@@ -47,3 +47,4 @@ plt.loglog(nb_src_vec, times)
 
 plt.ylabel("time [s]")
 plt.xlabel("number of RIRs")
+plt.show()
diff --git a/gpuRIR/__init__.py b/gpuRIR/__init__.py
index faf538c..cc5a2da 100644
--- a/gpuRIR/__init__.py
+++ b/gpuRIR/__init__.py
@@ -201,7 +201,7 @@ def simulateTrajectory(source_signal, RIRs, timestamps=None, fs=None):
 
 	return filtered_signal
 
-def activate_mixed_precision(activate=True):
+def activateMixedPrecision(activate=True):
 	''' Activate the mixed precision mode, only for Pascal GPU architecture or superior.
 	
 	Parameters
diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu
index 7b4fe29..64d30c9 100644
--- a/src/gpuRIR_cuda.cu
+++ b/src/gpuRIR_cuda.cu
@@ -21,9 +21,6 @@ static const int nThreadsISM_x = 4;
 static const int nThreadsISM_y = 4;
 static const int nThreadsISM_z = 4;
 
-// Time vector generation
-static const int nThreadsTime = 128;
-
 // RIR computation
 static const int initialReductionMin = 512;
 static const int nThreadsGen_t = 32;
@@ -46,6 +43,11 @@ static const int nThreadsConv_x = 256;
 static const int nThreadsConv_y = 1;
 static const int nThreadsConv_z = 1;
 
+#if __CUDA_ARCH__ >= 530
+#define h2zeros __float2half2_rn(0.0)
+#define h2ones __float2half2_rn(1.0)
+#define h2pi __float2half2_rn(PI)
+#endif
 
 // To hide the cuRAND generator in the header and don't need to include the cuda headers there
 struct cuRandGeneratorWrapper_t
@@ -58,13 +60,6 @@ cuRandGeneratorWrapper_t gpuRIR_cuda::cuRandGenWrap;
 
 int cuda_arch;
 
-#if __CUDA_ARCH__ >= 530
-#define h2zeros __float2half2_rn(0.0)
-#define h2ones __float2half2_rn(1.0)
-#define h2pi __float2half2_rn(PI)
-#endif
-
-
 /***************************/
 /* Auxiliar host functions */
 /***************************/
@@ -89,7 +84,7 @@ inline void gpuAssert(curandStatus_t code, const char *file, int line, bool abor
 
 #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cufftResult_t code, const char *file, int line, bool abort=true) {
-	if (code != CURAND_STATUS_SUCCESS)
+	if (code != CUFFT_SUCCESS)
 	{
 		fprintf(stderr,"GPUassert: %s %s %d\n", code, file, line);
 		if (abort) exit(code);
@@ -146,6 +141,7 @@ __device__ __forceinline__ scalar_t mic_directivity(scalar_t doaVec[3], scalar_t
 		case DIR_HYPCARD: return 0.25f + 0.75f*cosTheta;
 		case DIR_SUBCARD: return 0.75f + 0.25f*cosTheta;
 		case DIR_BIDIR: return cosTheta;
+		default: printf("Invalid microphone pattern"); return 0.0f;
 	}
 }
 
@@ -175,26 +171,6 @@ __device__ __forceinline__ half2 h2abs(half2 x) {
 	return *reinterpret_cast<half2*>( &i );
 }
 
-__device__ __forceinline__ half2 my_h2sin(half2 x) {
-	return __floats2half2_rn(sinf(__low2float(x)), sinf(__high2float(x)));
-}
-
-__device__ __forceinline__ half2 my_h2cos(half2 x) {
-	return __floats2half2_rn(cosf(__low2float(x)), cosf(__high2float(x)));
-}
-
-__device__ __forceinline__ half2 my_h2cospi(half2 x) {
-	// It is always on [-0.5, 0.5], so we do not need argument reduction
-
-	// cos(pi*x) polinomial approximation for x in [-0.5,0.5]
-	half2 x2 = __hmul2(x, x);
-	half2 c = __float2half2_rn(-1.229339658587166);
-	c = __hfma2(x2, c, __float2half2_rn(+4.043619929856572f));
-	c = __hfma2(x2, c, __float2half2_rn(-4.934120365987677f));
-	c = __hfma2(x2, c, __float2half2_rn(+0.999995282317910));
-
-	return c;
-}
 __device__ __forceinline__ half2 my_h2sinpi(half2 x) {
 	// Argument reduction to [-0.5, 0.5]
 	half2 i = h2rint(x);
@@ -216,32 +192,35 @@ __device__ __forceinline__ half2 my_h2sinpi(half2 x) {
 	return s;
 }
 
+__device__ __forceinline__ half2 my_h2cospi(half2 x) {
+	// It is always in [-0.5, 0.5], so we do not need argument reduction
+
+	// cos(pi*x) polynomial approximation for x in [-0.5,0.5]
+	half2 x2 = __hmul2(x, x);
+	half2 c = __float2half2_rn(-1.229339658587166f);
+	c = __hfma2(x2, c, __float2half2_rn(+4.043619929856572f));
+	c = __hfma2(x2, c, __float2half2_rn(-4.934120365987677f));
+	c = __hfma2(x2, c, __float2half2_rn(+0.999995282317910f));
+
+	return c;
+}
+
 __device__ __forceinline__ half2 hanning_window_mp(half2 t, half2 Tw_inv) {
 	half2 c = my_h2cospi(__hmul2(Tw_inv, t));
 	return __hmul2(c, c);
-	// return __hmul2(__hadd2(h2ones, h2cos(__hmul2(PI_Tw_2, t))), __float2half2_rn(0.5f));
 }
 
 __device__ __forceinline__ half2 my_h2sinc(half2 x) {
-	// x = __hmul2(h2pi, x);
 	x = __hfma2(__heq2(x, h2zeros), __float2half2_rn(1e-7f), x);
 	return __h2div(my_h2sinpi(x), __hmul2(h2pi, x));
 }
 
 __device__ __forceinline__ half2 image_sample_mp(half2 amp, scalar_t tau, scalar_t t1, scalar_t t2, half2 Tw_2, half2 Tw_inv) {
-	half2 t_tau = __floats2half2_rn(t1-tau, t2-tau); // __hsub2(t, tau);
+	half2 t_tau = __floats2half2_rn(t1-tau, t2-tau);
 	if (__hble2(h2abs(t_tau), Tw_2)) {
 		return __hmul2(hanning_window_mp(t_tau, Tw_inv), __hmul2(amp, my_h2sinc( t_tau )));
 	} else return h2zeros;
-
-	// half2 t_tau = __floats2half2_rn(t1-tau, t2-tau); // __hsub2(t, tau);
-	// if (__hble2(h2abs(t_tau), __float2half2_rn(8e-3f/2))) {
-	// 	half2 FsPI = __hmul2( __float2half2_rn(Fs), __float2half2_rn(PI));
-	// 	return my_h2sinc( __hmul2(t_tau, FsPI) );
-	// } else return h2zeros;
-
-	// return my_h2sin(amp);
 }
 
 #endif
 
@@ -357,11 +336,6 @@ __global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src]M_rcv][nb_img_x][nb_i
 	}
 }
 
-__global__ void generateTime_kernel(scalar_t* t, scalar_t Fs, int nSamples) {
-	int sample = blockIdx.x * blockDim.x + threadIdx.x;
-	if (sample
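For reference, here is a minimal usage sketch of the renamed function. It assumes the call signatures used in `examples/example.py`; the room, source, and receiver values are illustrative and not taken from the repository:

```python
import numpy as np
import gpuRIR

# activateMixedPrecision replaces the old activate_mixed_precision name.
# It must be called before simulateRIR and only takes effect on Pascal GPUs or newer.
gpuRIR.activateMixedPrecision(True)

room_sz = [3, 3, 2.5]                  # Room size [m] (illustrative)
pos_src = np.array([[1.0, 2.9, 0.5]])  # One source position [m] (illustrative)
pos_rcv = np.array([[1.5, 1.5, 1.0]])  # One receiver position [m] (illustrative)
T60 = 0.7                              # Desired reverberation time [s]
fs = 16000.0                           # Sampling frequency [Hz]

beta = gpuRIR.beta_SabineEstimation(room_sz, T60)  # Reflection coefficients for the desired T60
Tmax = gpuRIR.att2t_SabineEstimator(60.0, T60)     # RIR length: time to reach 60 dB of attenuation [s]
nb_img = gpuRIR.t2n(Tmax, room_sz)                 # Number of image sources per dimension
RIRs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs)
```

Passing `False`, as the updated example scripts do, keeps the simulation in single precision on GPUs that lack fast half2 arithmetic.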
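A note on the relocated half-precision helpers: the rewritten `hanning_window_mp` squares `my_h2cospi(Tw_inv * t)` instead of building the window as `0.5 * (1 + cos(...))`, which is what the deleted commented-out line did. Assuming `Tw_inv` stands for `1/Tw`, both forms compute the same Hann window, since for `|t| <= Tw/2`

cos²(π·t/Tw) = (1 + cos(2π·t/Tw)) / 2,

so only the `cospi` polynomial approximation (valid on `[-0.5, 0.5]` without argument reduction) is needed in half2 arithmetic.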