From 21e8252ea53f4acd92c7dd24b3f1c6ab2c18299b Mon Sep 17 00:00:00 2001
From: DavidDiazGuerra
Date: Fri, 4 Oct 2019 14:24:07 +0200
Subject: [PATCH] Now the lookup table is optional

---
 README.md                  | 12 +++++-
 examples/example.py        |  1 +
 examples/time_vs_T60.py    |  1 +
 examples/time_vs_nbRIRs.py |  1 +
 gpuRIR/__init__.py         | 13 +++++-
 src/gpuRIR_cuda.cu         | 83 +++++++++++++++++++++++++++++++------
 src/gpuRIR_cuda.h          |  6 ++-
 src/python_bind.cpp        | 12 ++++--
 8 files changed, 108 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index a8e3170..1844e6a 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
 * [`simulateRIR`](#simulaterir)
 * [`simulateTrajectory`](#simulatetrajectory)
 * [`activateMixedPrecision`](#activateMixedPrecision)
+* [`activateLUT`](#activateLUT)
 * [`beta_SabineEstimation`](#beta_sabineestimation)
 * [`att2t_SabineEstimator`](#att2t_sabineestimator)
 * [`t2n`](#t2n)
@@ -110,7 +111,16 @@ Activate the mixed precision mode, only for Pascal GPU architecture or superior.
 
 * **activate** : *bool, optional.*
 	True for activate and Flase for deactivate. True by default.
-	
+
+### `activateLUT`
+
+Activate the lookup table for the sinc computations.
+
+#### Parameters
+
+* **activate** : *bool, optional.*
+	True to activate and False to deactivate. True by default.
+
 ### `beta_SabineEstimation`
 
 Estimation of the reflection coefficients needed to have the desired reverberation time.
diff --git a/examples/example.py b/examples/example.py
index 8b7a297..5b2aaa9 100755
--- a/examples/example.py
+++ b/examples/example.py
@@ -10,6 +10,7 @@
 
 import gpuRIR
 gpuRIR.activateMixedPrecision(False)
+gpuRIR.activateLUT(True)
 
 room_sz = [3,3,2.5] # Size of the room [m]
 nb_src = 2 # Number of sources
diff --git a/examples/time_vs_T60.py b/examples/time_vs_T60.py
index 5c1eaa7..2f06592 100644
--- a/examples/time_vs_T60.py
+++ b/examples/time_vs_T60.py
@@ -10,6 +10,7 @@
 
 import gpuRIR
 gpuRIR.activateMixedPrecision(False)
+gpuRIR.activateLUT(False)
 
 T60_vec = np.arange(0.1, 2.2, 0.2) # Reverberation times to measure
 nb_test_per_point = 10 # Number of simulations per T60 to average the runtime
diff --git a/examples/time_vs_nbRIRs.py b/examples/time_vs_nbRIRs.py
index 7b950ab..c43f96f 100644
--- a/examples/time_vs_nbRIRs.py
+++ b/examples/time_vs_nbRIRs.py
@@ -10,6 +10,7 @@
 
 import gpuRIR
 gpuRIR.activateMixedPrecision(False)
+gpuRIR.activateLUT(False)
 
 nb_src_vec = np.concatenate([2**np.arange(12), [4094]]) # Number of RIRs to measure
 nb_test_per_point = 10 # Number of simulations per T60 to average the runtime
diff --git a/gpuRIR/__init__.py b/gpuRIR/__init__.py
index cc5a2da..d2616bb 100644
--- a/gpuRIR/__init__.py
+++ b/gpuRIR/__init__.py
@@ -8,7 +8,7 @@
 
 from gpuRIR_bind import gpuRIR_bind
 
-__all__ = ["mic_patterns", "beta_SabineEstimation", "att2t_SabineEstimator", "t2n", "simulateRIR", "simulateTrajectory", "activate_mixed_precision"]
+__all__ = ["mic_patterns", "beta_SabineEstimation", "att2t_SabineEstimator", "t2n", "simulateRIR", "simulateTrajectory", "activate_mixed_precision", "activate_lut"]
 
 mic_patterns = {
 	"omni": 0,
@@ -212,5 +212,16 @@ def activateMixedPrecision(activate=True):
 	'''
 	gpuRIR_bind_simulator.activate_mixed_precision_bind(activate)
 
+def activateLUT(activate=True):
+	''' Activate the lookup table for the sinc computations.
+
+	Parameters
+	----------
+	activate : bool, optional
+		True to activate and False to deactivate. True by default.
+
+	'''
+	gpuRIR_bind_simulator.activate_lut_bind(activate)
+
 # Create the simulator object when the module is loaded
 gpuRIR_bind_simulator = gpuRIR_bind()
diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu
index f911df0..5feb223 100644
--- a/src/gpuRIR_cuda.cu
+++ b/src/gpuRIR_cuda.cu
@@ -115,9 +115,9 @@ __device__ __forceinline__ scalar_t sinc(scalar_t x) {
 	return (x==0)? 1 : sinf(x)/x;
 }
 
-__device__ __forceinline__ scalar_t image_sample(scalar_t amp, scalar_t tau, scalar_t t, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) {
-	scalar_t t_tau = t - tau;
-	return (abs(t_tau)(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f;
+__device__ __forceinline__ scalar_t image_sample(scalar_t amp, scalar_t tau, scalar_t t, scalar_t Tw) {
+	scalar_t t_tau = t - tau;
+	return (abs(t_tau)(sinc_lut, __fmaf_rz(t_tau,lut_oversamp,lut_center)) : 0.0f;
+}
 
 /***********/
 /* KERNELS */
 /***********/
@@ -339,7 +348,7 @@ __global__ void calcAmpTau_kernel(scalar_t* g_amp /*[M_src]M_rcv][nb_img_x][nb_i
 	}
 }
 
-__global__ void generateRIR_kernel(scalar_t* initialRIR, scalar_t* amp, scalar_t* tau, int T, int M, int N, int iniRIR_N, int ini_red, int Tw_2, cudaTextureObject_t sinc_lut, float lut_center) {
+__global__ void generateRIR_kernel(scalar_t* initialRIR, scalar_t* amp, scalar_t* tau, int T, int M, int N, int iniRIR_N, int ini_red, scalar_t Tw) {
 	int t = blockIdx.x * blockDim.x + threadIdx.x;
 	int m = blockIdx.y * blockDim.y + threadIdx.y;
 	int n_ini = blockIdx.z * ini_red;
@@ -348,7 +357,7 @@ __global__ void generateRIR_kernel(scalar_t* initialRIR, scalar_t* amp, scalar_t* tau, int T, int M, int N, int iniRIR_N, int ini_red, scalar_t Tw) {
 	if (m>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw/2, sinc_lut, lut_len/2+0.5 );
-	gpuErrchk( cudaDeviceSynchronize() );
-	gpuErrchk( cudaPeekAtLastError() );
-	cudaDestroyTextureObject(sinc_lut);
-	cudaFreeArray(cuArrayLut);
+	if (lookup_table) {
+		int lut_len = Tw * lut_oversamp;
+		lut_len += ((lut_len%2)? 0 : 1); // Must be odd
+		cudaArray* cuArrayLut;
+		cudaTextureObject_t sinc_lut = create_sinc_texture_lut(&cuArrayLut, Tw, lut_len);
+
+		generateRIR_kernel_lut<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw/2, sinc_lut, lut_len/2+0.5 );
+		gpuErrchk( cudaDeviceSynchronize() );
+		gpuErrchk( cudaPeekAtLastError() );
+
+		cudaDestroyTextureObject(sinc_lut);
+		cudaFreeArray(cuArrayLut);
+	} else {
+		generateRIR_kernel<<>>( initialRIR, amp, tau, T, M, N, iniRIR_N, initialReduction, Tw );
+		gpuErrchk( cudaDeviceSynchronize() );
+		gpuErrchk( cudaPeekAtLastError() );
+	}
+
 	dim3 threadsPerBlockRed(nThreadsRed, 1, 1);
 	scalar_t* intermediateRIR;
@@ -949,14 +986,15 @@ scalar_t* gpuRIR_cuda::cuda_convolutions(scalar_t* source_segments, int M_src, i
 	return convolved_segments;
 }
 
-gpuRIR_cuda::gpuRIR_cuda(bool mPrecision) {
+gpuRIR_cuda::gpuRIR_cuda(bool mPrecision, bool lut) {
 	// Get CUDA architecture
 	cudaDeviceProp prop;
 	cudaGetDeviceProperties(&prop, 0);
 	cuda_arch = prop.major*100 + prop.minor*10;
 
-	// Activate mixed precision if selected
+	// Activate mixed precision and lut if selected
 	activate_mixed_precision(mPrecision);
+	activate_lut(lut);
 
 	// Initiate CUDA runtime API
 	scalar_t* memPtr_warmup;
@@ -975,6 +1013,11 @@
 
 bool gpuRIR_cuda::activate_mixed_precision(bool activate) {
 	if (cuda_arch >= 530) {
+		if (activate && lookup_table) {
+			printf("The mixed precision implementation is not compatible with the lookup table.");
+			printf("Disabling the lookup table.");
+			lookup_table = false;
+		}
 		mixed_precision = activate;
 	} else {
 		if (activate) printf("This feature requires Pascal GPU architecture or higher.\n");
@@ -982,3 +1025,13 @@ bool gpuRIR_cuda::activate_mixed_precision(bool activate) {
 	}
 	return mixed_precision;
 }
+
+bool gpuRIR_cuda::activate_lut(bool activate) {
+	if (activate && mixed_precision) {
+		printf("The lookup table is not compatible with the mixed precision implementation.");
+		printf("Disabling the mixed precision implementation.");
+		mixed_precision = false;
+	}
+	lookup_table = activate;
+	return lookup_table;
+}
diff --git a/src/gpuRIR_cuda.h b/src/gpuRIR_cuda.h
index 82cb232..baa73b7 100644
--- a/src/gpuRIR_cuda.h
+++ b/src/gpuRIR_cuda.h
@@ -19,11 +19,12 @@ struct cuRandGeneratorWrapper_t;
 
 class gpuRIR_cuda {
 	public:
-		gpuRIR_cuda(bool);
+		gpuRIR_cuda(bool, bool);
 
 		scalar_t* cuda_simulateRIR(scalar_t[3], scalar_t[6], scalar_t*, int, scalar_t*, scalar_t*, micPattern, int, int[3], scalar_t, scalar_t, scalar_t, scalar_t);
 		scalar_t* cuda_convolutions(scalar_t*, int, int,scalar_t*, int, int);
 		bool activate_mixed_precision(bool);
+		bool activate_lut(bool);
 
 	private:
 		// cuRAND generator
@@ -31,6 +32,9 @@ class gpuRIR_cuda {
 
 		// Mixed precision flag
 		bool mixed_precision;
+
+		// Lookup table flag
+		bool lookup_table;
 
 		// Auxiliar host functions
 		void cuda_rirGenerator(scalar_t*, scalar_t*, scalar_t*, int, int, int, scalar_t);
diff --git a/src/python_bind.cpp b/src/python_bind.cpp
index 159b274..71b0098 100644
--- a/src/python_bind.cpp
+++ b/src/python_bind.cpp
@@ -12,13 +12,15 @@ namespace py = pybind11;
 
 class gpuRIR_bind {
 	public:
-		gpuRIR_bind(bool mPrecision=false) : mixed_precision(mPrecision), gpuRIR_cuda_simulator(mPrecision) {};
+		gpuRIR_bind(bool mPrecision=false, bool lut=true) : mixed_precision(mPrecision), lookup_table(lut), gpuRIR_cuda_simulator(mPrecision, lut) {};
 
 		py::array simulateRIR_bind(std::vector, std::vector, py::array_t, py::array_t, py::array_t, micPattern, std::vector ,scalar_t, scalar_t, scalar_t, scalar_t);
 		py::array gpu_conv(py::array_t, py::array_t);
 		bool activate_mixed_precision_bind(bool);
+		bool activate_lut_bind(bool);
 
 		bool mixed_precision;
+		bool lookup_table;
 
 	private:
 		gpuRIR_cuda gpuRIR_cuda_simulator;
@@ -108,6 +110,10 @@ bool gpuRIR_bind::activate_mixed_precision_bind(bool activate) {
 	gpuRIR_cuda_simulator.activate_mixed_precision(activate);
 }
 
+bool gpuRIR_bind::activate_lut_bind(bool activate) {
+	return gpuRIR_cuda_simulator.activate_lut(activate);
+}
+
 
 PYBIND11_MODULE(gpuRIR_bind,m)
 {
@@ -120,6 +126,6 @@ PYBIND11_MODULE(gpuRIR_bind,m)
 		py::arg("Fs"), py::arg("c")=343.0f )
 	.def("gpu_conv", &gpuRIR_bind::gpu_conv, "Batched convolution using FFTs in GPU", py::arg("source_segments"), py::arg("RIR"))
 	.def("activate_mixed_precision_bind", &gpuRIR_bind::activate_mixed_precision_bind, "Activate the mixed precision mode, only for Pascal GPU architecture or superior",
-		py::arg("activate"));
-
+		py::arg("activate"))
+	.def("activate_lut_bind", &gpuRIR_bind::activate_lut_bind, "Activate the lookup table", py::arg("activate"));
 }
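
A minimal sketch of how the new flag composes with the existing Python API, following the pattern of the examples touched above. It is illustrative only: the room, source, and receiver values are made up, and the `simulateRIR` call relies on the signature documented in the README rather than on anything added by this patch.

```python
import numpy as np
import gpuRIR

gpuRIR.activateMixedPrecision(False)  # mixed precision and the LUT are mutually exclusive
gpuRIR.activateLUT(True)              # use the sinc lookup table (the default)

room_sz = [3.0, 3.0, 2.5]              # room size [m] (illustrative values)
pos_src = np.array([[1.0, 1.5, 1.2]])  # one source
pos_rcv = np.array([[2.0, 1.5, 1.2]])  # one receiver
T60 = 0.5                              # desired reverberation time [s]
fs = 16000.0                           # sampling frequency [Hz]

beta = gpuRIR.beta_SabineEstimation(room_sz, T60)  # reflection coefficients for the desired T60
Tmax = gpuRIR.att2t_SabineEstimator(40.0, T60)     # RIR duration for 40 dB of attenuation
nb_img = gpuRIR.t2n(Tmax, room_sz)                 # number of image sources per dimension

rirs = gpuRIR.simulateRIR(room_sz, beta, pos_src, pos_rcv, nb_img, Tmax, fs)
print(rirs.shape)  # (number of sources, number of receivers, number of samples)
```

Calling `gpuRIR.activateLUT(False)` instead makes `simulateRIR` compute the sinc directly in the kernel; that is the setting used by the two timing scripts in this patch, while `example.py` enables the table.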