From e1bca4da1fc01a4677781157a140358274e0311b Mon Sep 17 00:00:00 2001 From: DavidDiazGuerra Date: Tue, 16 Jul 2019 10:05:23 +0000 Subject: [PATCH] cospi custom function --- src/gpuRIR_cuda.cu | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/src/gpuRIR_cuda.cu b/src/gpuRIR_cuda.cu index 5ce78a3..11b8acb 100644 --- a/src/gpuRIR_cuda.cu +++ b/src/gpuRIR_cuda.cu @@ -180,10 +180,18 @@ __device__ __forceinline__ half2 my_h2cos(half2 x) { return __floats2half2_rn(cosf(__low2float(x)), cosf(__high2float(x))); } -__device__ __forceinline__ half2 hanning_window_mp(half2 t, half2 PI_Tw_2) { - return __hmul2(__hadd2(h2ones, h2cos(__hmul2(PI_Tw_2, t))), __float2half2_rn(0.5f)); +__device__ __forceinline__ half2 my_h2cospi(half2 x) { + // It is always on [-0.5, 0.5], so we do not need argument reduction + + // cos(pi*x) polinomial approximation for x in [-0.5,0.5] + half2 x2 = __hmul2(x, x); + half2 c = __float2half2_rn(-1.229339658587166); + c = __hfma2(x2, c, __float2half2_rn(+4.043619929856572f)); + c = __hfma2(x2, c, __float2half2_rn(-4.934120365987677f)); + c = __hfma2(x2, c, __float2half2_rn(+0.999995282317910)); + + return c; } - __device__ __forceinline__ half2 my_h2sinpi(half2 x) { // Argument reduction to [-0.5, 0.5] half2 i = h2rint(x); @@ -205,16 +213,23 @@ __device__ __forceinline__ half2 my_h2sinpi(half2 x) { return s; } +__device__ __forceinline__ half2 hanning_window_mp(half2 t, half2 Tw_inv) { + half2 c = my_h2cospi(__hmul2(Tw_inv, t)); + return __hmul2(c, c); + // return __hmul2(__hadd2(h2ones, h2cos(__hmul2(PI_Tw_2, t))), __float2half2_rn(0.5f)); +} + __device__ __forceinline__ half2 my_h2sinc(half2 x) { // x = __hmul2(h2pi, x); x = __hfma2(__heq2(x, h2zeros), __float2half2_rn(1e-7f), x); return __h2div(my_h2sinpi(x), __hmul2(h2pi, x)); + } -__device__ __forceinline__ half2 image_sample_mp(half2 amp, scalar_t tau, scalar_t t1, scalar_t t2, half2 Tw_2, half2 PI_Tw_2) { +__device__ __forceinline__ half2 image_sample_mp(half2 amp, scalar_t tau, scalar_t t1, scalar_t t2, half2 Tw_2, half2 Tw_inv) { half2 t_tau = __floats2half2_rn(t1-tau, t2-tau); // __hsub2(t, tau); if (__hble2(h2abs(t_tau), Tw_2)) { - return __hmul2(hanning_window_mp(t_tau, PI_Tw_2), __hmul2(amp, my_h2sinc( t_tau ))); + return __hmul2(hanning_window_mp(t_tau, Tw_inv), __hmul2(amp, my_h2sinc( t_tau ))); } else return h2zeros; // half2 t_tau = __floats2half2_rn(t1-tau, t2-tau); // __hsub2(t, tau); @@ -455,7 +470,7 @@ __global__ void complexPointwiseMulAndScale(cufftComplex *signal_segments, cufft /* Mixed precision KERNELS */ /***************************/ -__global__ void generateRIR_mp_kernel(half2* initialRIR, scalar_t* amp, scalar_t* tau, int T, int M, int N, int iniRIR_N, int ini_red, half2 Tw_2, half2 PI_Tw_2) { +__global__ void generateRIR_mp_kernel(half2* initialRIR, scalar_t* amp, scalar_t* tau, int T, int M, int N, int iniRIR_N, int ini_red, half2 Tw_2, half2 Tw_inv) { #if __CUDA_ARCH__ >= 530 int t = blockIdx.x * blockDim.x + threadIdx.x; int m = blockIdx.y * blockDim.y + threadIdx.y; @@ -468,7 +483,7 @@ __global__ void generateRIR_mp_kernel(half2* initialRIR, scalar_t* amp, scalar_t scalar_t loc_tim_2 = 2*t+1; for (int n=n_ini; n>>( initialRIR, amp, tau, T/2, M, N, iniRIR_N, initialReduction, Tw_2, PI_Tw_2 ); + half2 Tw_inv = __float2half2_rn(1.0f / (8e-3f * Fs)); + generateRIR_mp_kernel<<>>( initialRIR, amp, tau, T/2, M, N, iniRIR_N, initialReduction, Tw_2, Tw_inv ); gpuErrchk( cudaDeviceSynchronize() ); gpuErrchk( cudaPeekAtLastError() );