diff --git a/src/generate_gauss_c.py b/src/generate_gauss_c.py index d919836..9f72546 100644 --- a/src/generate_gauss_c.py +++ b/src/generate_gauss_c.py @@ -12,42 +12,7 @@ from math import pi, sqrt, exp from scipy import ndimage as ndi -# From stack overflow -def gauss(n=11,sigma=1): - r = range(-int(n/2),int(n/2)+1) - g = [1 / (sigma * sqrt(2*pi)) * exp(-float(x)**2/(2*sigma**2)) for x in r] - return np.array(g, dtype=np.float32) - -# A copy of the scipy implementation -def _gaussian_kernel1d(sigma, order, radius): - """ - Computes a 1-D Gaussian convolution kernel. - """ - if order < 0: - raise ValueError('order must be non-negative') - exponent_range = np.arange(order + 1) - sigma2 = sigma * sigma - x = np.arange(-radius, radius+1) - phi_x = np.exp(-0.5 / sigma2 * x ** 2) - phi_x = phi_x / phi_x.sum() - - if order == 0: - return phi_x - else: - # f(x) = q(x) * phi(x) = q(x) * exp(p(x)) - # f'(x) = (q'(x) + q(x) * p'(x)) * phi(x) - # p'(x) = -1 / sigma ** 2 - # Implement q'(x) + q(x) * p'(x) as a matrix operator and apply to the - # coefficients of q(x) - q = np.zeros(order + 1) - q[0] = 1 - D = np.diag(exponent_range[1:], 1) # D @ q(x) = q'(x) - P = np.diag(np.ones(order)/-sigma2, -1) # P @ q(x) = q(x) * p'(x) - Q_deriv = D + P - for _ in range(order): - q = Q_deriv.dot(q) - q = (x[:, None] ** exponent_range).dot(q) - return q * phi_x +impl_type = np.float32 def tobyt(arr): mi, ma = arr.min(), arr.max() @@ -56,47 +21,48 @@ def tobyt(arr): if __name__ == '__main__': sample, scale = commandline_args({"sample":"","scale":1}) outpath = 'dummy' - display = 10 - sigma = 13 - reps =20 + sigma = 5 + reps = 5 vf = h5py.File(f'{hdf5_root_fast}/processed/implant-edt/{scale}x/{sample}.h5', 'r') voxels = vf['voxels'][512:576,:,:] vf.close() - Image.fromarray(tobyt(voxels[display,:,:])).save(f"{outpath}/original.png") + nz,ny,nx = voxels.shape + + Image.fromarray(tobyt(voxels[nz//2,:,:])).save(f"{outpath}/original.png") - vmax = voxels.max(); + vmax = voxels.max() implant_mask = voxels >= vmax - implant_mask = implant_mask.astype(np.float32) - Image.fromarray(tobyt(implant_mask[display,:,:])).save(f"{outpath}/masked.png") + Image.fromarray(tobyt(implant_mask[nz//2,:,:].astype(impl_type))).save(f"{outpath}/masked.png") radius = int(4.0 * float(sigma) + .5) # stolen from the default scipy parameters - kernel = _gaussian_kernel1d(sigma, 0, radius)[::-1] - #kernel = gauss(radius*2+1, sigma) + kernel = ndi.filters._gaussian_kernel1d(sigma, 0, radius).astype(impl_type) - result = np.zeros_like(implant_mask) + result = np.empty(implant_mask.shape,dtype=impl_type) start = timeit.default_timer() histograms.gauss_filter_par_cpu(implant_mask, implant_mask.shape, kernel, reps, result) print (f'Parallel C edition: {timeit.default_timer() - start}') np.save(f'{outpath}/mine', result) - nz,ny,nx = result.shape Image.fromarray(tobyt(result[nz//2,:,:])).save(f'{outpath}/gauss-xy.png') Image.fromarray(tobyt(result[:,ny//2,:])).save(f'{outpath}/gauss-xz.png') Image.fromarray(tobyt(result[:,:,nx//2])).save(f'{outpath}/gauss-yz.png') Image.fromarray(tobyt((np.max(np.abs(result),axis=0)!=0).astype(float))).save(f'{outpath}/gauss-xy-nonzero.png') + Image.fromarray(tobyt((np.max(np.abs(result),axis=1)!=0).astype(float))).save(f'{outpath}/gauss-xz-nonzero.png') + Image.fromarray(tobyt((np.max(np.abs(result),axis=2)!=0).astype(float))).save(f'{outpath}/gauss-yz-nonzero.png') - control = implant_mask.copy() - start = timeit.default_timer() - #for _ in range(reps): - # control = ndi.gaussian_filter(control, sigma, mode='constant') - print (f'ndimage edition: {timeit.default_timer() - start}') - #np.save(f'{outpath}/control', control) - Image.fromarray(tobyt(control[display,:,:])).save(f'{outpath}/control1.png') - - diff = np.abs(result - control) - print (f'Total difference: {diff.sum()}') - print (f'Max difference: {diff.max()}') - print (f'Mean difference: {diff.mean()}') \ No newline at end of file + if True: # generate ndimage comparison + control = implant_mask.astype(impl_type) + start = timeit.default_timer() + for _ in range(reps): + control[:] = ndi.gaussian_filter(control, sigma, mode='constant') + #control[implant_mask] = 1 + print (f'ndimage edition: {timeit.default_timer() - start}') + np.save(f'{outpath}/control', control) + Image.fromarray(tobyt(control[nz//2,:,:])).save(f'{outpath}/control1.png') + diff = np.abs(result - control) + print (f'Total difference: {diff.sum()}') + print (f'Max difference: {diff.max()}') + print (f'Mean difference: {diff.mean()}') \ No newline at end of file diff --git a/src/pybind_kernels/histograms.cc b/src/pybind_kernels/histograms.cc index cef124f..3209a3e 100644 --- a/src/pybind_kernels/histograms.cc +++ b/src/pybind_kernels/histograms.cc @@ -14,6 +14,7 @@ typedef uint16_t voxel_type; //typedef float field_type; typedef uint16_t field_type; typedef uint8_t mask_type; +typedef float gauss_type; #define INLINE __attribute__((always_inline)) inline @@ -23,25 +24,25 @@ typedef uint8_t mask_type; void gauss_filter_par_cpu(const py::array_t np_voxels, const tuple shape, - const py::array_t np_kernel, + const py::array_t np_kernel, const uint64_t reps, - py::array_t &np_result) { + py::array_t &np_result) { auto voxels_info = np_voxels.request(), kernel_info = np_kernel.request(), result_info = np_result.request(); - const mask_type *voxels = static_cast(voxels_info.ptr); - const float *kernel = static_cast(kernel_info.ptr); + const mask_type *voxels = static_cast(voxels_info.ptr); + const gauss_type *kernel = static_cast(kernel_info.ptr); - float *result = static_cast(result_info.ptr); + gauss_type *result = static_cast(result_info.ptr); auto [Nz, Ny, Nx] = shape; // global shape TODO for blocked edition int64_t kernel_size = kernel_info.size, padding = kernel_size / 2, // kx should always be odd - // Partial shape + // Partial shape Pz = voxels_info.shape[0], Py = voxels_info.shape[1], Px = voxels_info.shape[2], @@ -49,19 +50,24 @@ void gauss_filter_par_cpu(const py::array_t np_voxels, Rz = result_info.shape[0], Ry = result_info.shape[1], Rx = result_info.shape[2]; - - float - *tmp0 = (float *) calloc(Rz*Ry*Rx, sizeof(float)), - *tmp1 = (float *) calloc(Rz*Ry*Rx, sizeof(float)); - + + assert(kernel_size % 2 == 1); + + gauss_type + *tmp0 = (gauss_type *) calloc(Rz*Ry*Rx, sizeof(gauss_type)), + *tmp1 = (gauss_type *) calloc(Rz*Ry*Rx, sizeof(gauss_type)); + #pragma omp parallel for for (int64_t i = 0; i < voxels_info.size; i++) { - tmp0[i] = voxels[i]; + tmp0[i] = voxels[i] ? 1 : 0; } + uint64_t iters = 3 * reps; // 1 pass for each dimension + const int64_t strides[3] = {Py*Px,Px,1}; + const int64_t N[3] = {Pz,Py,Px}; for (uint64_t rep = 0; rep < iters; rep++) { - float *tin, *tout; + gauss_type *tin, *tout; if (rep % 2 == 1) { tin = tmp1; tout = tmp0; @@ -69,30 +75,32 @@ void gauss_filter_par_cpu(const py::array_t np_voxels, tin = tmp0; tout = tmp1; } + int64_t dim = rep % 3; - #pragma omp parallel for collapse(3) + #pragma omp parallel for// collapse(3) for (int64_t z = 0; z < Pz; z++) { for (int64_t y = 0; y < Py; y++) { for (int64_t x = 0; x < Px; x++) { - float sum = 0; - int64_t dim = rep % 3; + int64_t output_index = z*strides[0] + y*strides[1] + x*strides[2]; + int64_t X[3] = {z,y,x}; - const int64_t output_index = z*Ry*Rx + y*Rx + x; - const int64_t strides[3] = {Py*Px,Px,1}; - const int64_t X[3] = {z,y,x}; - const int64_t N[3] = {Pz,Py,Px}; - - const int64_t stride = strides[dim], i_start = -min(padding,X[dim]-1), i_end = min(padding,N[dim]-X[dim]-1); + int64_t + stride = strides[dim], + i_start = -min(padding,X[dim]), + i_end = min(padding,N[dim]-X[dim]-1); auto mask_value = voxels[output_index]; - if (mask_value) { + if (0 && dim % 3 == 2 && mask_value) { tout[output_index] = 1; } else { + gauss_type sum = 0; + #pragma omp simd reduction(+:sum) for (int64_t i = i_start; i <= i_end; i++) { - uint64_t voxel_index = output_index + stride*i; + int64_t voxel_index = output_index + stride*i; sum += tin[voxel_index] * kernel[i+padding]; } + tout[output_index] = sum; } } @@ -100,7 +108,7 @@ void gauss_filter_par_cpu(const py::array_t np_voxels, } } - memcpy(result, iters % 2 == 1 ? tmp1 : tmp0, Rz*Ry*Rx * sizeof(float)); + memcpy(result, iters % 2 == 1 ? tmp1 : tmp0, Rz*Ry*Rx * sizeof(gauss_type)); free(tmp0); free(tmp1); }