diff --git a/src/generate_gauss_c.py b/src/generate_gauss_c.py
index d919836..9f72546 100644
--- a/src/generate_gauss_c.py
+++ b/src/generate_gauss_c.py
@@ -12,42 +12,7 @@
 from math import pi, sqrt, exp
 from scipy import ndimage as ndi
 
-# From stack overflow
-def gauss(n=11,sigma=1):
-    r = range(-int(n/2),int(n/2)+1)
-    g = [1 / (sigma * sqrt(2*pi)) * exp(-float(x)**2/(2*sigma**2)) for x in r]
-    return np.array(g, dtype=np.float32)
-
-# A copy of the scipy implementation
-def _gaussian_kernel1d(sigma, order, radius):
-    """
-    Computes a 1-D Gaussian convolution kernel.
-    """
-    if order < 0:
-        raise ValueError('order must be non-negative')
-    exponent_range = np.arange(order + 1)
-    sigma2 = sigma * sigma
-    x = np.arange(-radius, radius+1)
-    phi_x = np.exp(-0.5 / sigma2 * x ** 2)
-    phi_x = phi_x / phi_x.sum()
-
-    if order == 0:
-        return phi_x
-    else:
-        # f(x) = q(x) * phi(x) = q(x) * exp(p(x))
-        # f'(x) = (q'(x) + q(x) * p'(x)) * phi(x)
-        # p'(x) = -1 / sigma ** 2
-        # Implement q'(x) + q(x) * p'(x) as a matrix operator and apply to the
-        # coefficients of q(x)
-        q = np.zeros(order + 1)
-        q[0] = 1
-        D = np.diag(exponent_range[1:], 1)  # D @ q(x) = q'(x)
-        P = np.diag(np.ones(order)/-sigma2, -1)  # P @ q(x) = q(x) * p'(x)
-        Q_deriv = D + P
-        for _ in range(order):
-            q = Q_deriv.dot(q)
-        q = (x[:, None] ** exponent_range).dot(q)
-        return q * phi_x
+impl_type = np.float32
 
 def tobyt(arr):
     mi, ma = arr.min(), arr.max()
@@ -56,47 +21,48 @@ def tobyt(arr):
 if __name__ == '__main__':
     sample, scale = commandline_args({"sample":"<required>","scale":1})
     outpath = 'dummy'
-    display = 10
-    sigma = 13
-    reps =20
+    sigma = 5
+    reps = 5
 
     vf = h5py.File(f'{hdf5_root_fast}/processed/implant-edt/{scale}x/{sample}.h5', 'r')
     voxels = vf['voxels'][512:576,:,:]
     vf.close()
 
-    Image.fromarray(tobyt(voxels[display,:,:])).save(f"{outpath}/original.png")
+    nz,ny,nx = voxels.shape
+
+    Image.fromarray(tobyt(voxels[nz//2,:,:])).save(f"{outpath}/original.png")
 
-    vmax = voxels.max(); 
+    vmax = voxels.max()
     implant_mask = voxels >= vmax
-    implant_mask = implant_mask.astype(np.float32)
 
-    Image.fromarray(tobyt(implant_mask[display,:,:])).save(f"{outpath}/masked.png")
+    Image.fromarray(tobyt(implant_mask[nz//2,:,:].astype(impl_type))).save(f"{outpath}/masked.png")
 
     radius = int(4.0 * float(sigma) + .5) # stolen from the default scipy parameters
-    kernel = _gaussian_kernel1d(sigma, 0, radius)[::-1]
-    #kernel = gauss(radius*2+1, sigma)
+    kernel = ndi.filters._gaussian_kernel1d(sigma, 0, radius).astype(impl_type)
 
-    result = np.zeros_like(implant_mask)
+    result = np.empty(implant_mask.shape,dtype=impl_type)
     start = timeit.default_timer()
     histograms.gauss_filter_par_cpu(implant_mask, implant_mask.shape, kernel, reps, result)
     print (f'Parallel C edition: {timeit.default_timer() - start}')
     np.save(f'{outpath}/mine', result)
 
-    nz,ny,nx = result.shape    
     Image.fromarray(tobyt(result[nz//2,:,:])).save(f'{outpath}/gauss-xy.png')
     Image.fromarray(tobyt(result[:,ny//2,:])).save(f'{outpath}/gauss-xz.png')
     Image.fromarray(tobyt(result[:,:,nx//2])).save(f'{outpath}/gauss-yz.png')
     Image.fromarray(tobyt((np.max(np.abs(result),axis=0)!=0).astype(float))).save(f'{outpath}/gauss-xy-nonzero.png')
+    Image.fromarray(tobyt((np.max(np.abs(result),axis=1)!=0).astype(float))).save(f'{outpath}/gauss-xz-nonzero.png')
+    Image.fromarray(tobyt((np.max(np.abs(result),axis=2)!=0).astype(float))).save(f'{outpath}/gauss-yz-nonzero.png')
 
-    control = implant_mask.copy()
-    start = timeit.default_timer()
-    #for _ in range(reps):
-    #    control = ndi.gaussian_filter(control, sigma, mode='constant')
-    print (f'ndimage edition: {timeit.default_timer() - start}')
-    #np.save(f'{outpath}/control', control)
-    Image.fromarray(tobyt(control[display,:,:])).save(f'{outpath}/control1.png')
-
-    diff = np.abs(result - control)
-    print (f'Total difference: {diff.sum()}')
-    print (f'Max difference: {diff.max()}')
-    print (f'Mean difference: {diff.mean()}')
\ No newline at end of file
+    if True: # generate ndimage comparison
+        control = implant_mask.astype(impl_type)
+        start = timeit.default_timer()
+        for _ in range(reps):
+            control[:] = ndi.gaussian_filter(control, sigma, mode='constant')
+            #control[implant_mask] = 1
+        print (f'ndimage edition: {timeit.default_timer() - start}')
+        np.save(f'{outpath}/control', control)
+        Image.fromarray(tobyt(control[nz//2,:,:])).save(f'{outpath}/control1.png')
+        diff = np.abs(result - control)
+        print (f'Total difference: {diff.sum()}')
+        print (f'Max difference: {diff.max()}')
+        print (f'Mean difference: {diff.mean()}')
\ No newline at end of file
diff --git a/src/pybind_kernels/histograms.cc b/src/pybind_kernels/histograms.cc
index cef124f..3209a3e 100644
--- a/src/pybind_kernels/histograms.cc
+++ b/src/pybind_kernels/histograms.cc
@@ -14,6 +14,7 @@ typedef uint16_t voxel_type;
 //typedef float    field_type;
 typedef uint16_t field_type;
 typedef uint8_t mask_type;
+typedef float gauss_type;
 
 #define INLINE __attribute__((always_inline)) inline
 
@@ -23,25 +24,25 @@ typedef uint8_t mask_type;
 
 void gauss_filter_par_cpu(const py::array_t<mask_type> np_voxels,
                           const tuple<uint64_t, uint64_t, uint64_t> shape,
-                          const py::array_t<float> np_kernel,
+                          const py::array_t<gauss_type> np_kernel,
                           const uint64_t reps,
-                          py::array_t<float> &np_result) {
+                          py::array_t<gauss_type> &np_result) {
     auto
         voxels_info = np_voxels.request(),
         kernel_info = np_kernel.request(),
         result_info = np_result.request();
 
-    const mask_type *voxels = static_cast<const mask_type*>(voxels_info.ptr);
-    const float     *kernel = static_cast<const float*>(kernel_info.ptr);
+    const mask_type      *voxels = static_cast<const mask_type*>(voxels_info.ptr);
+    const gauss_type *kernel = static_cast<const gauss_type*>(kernel_info.ptr);
 
-    float *result = static_cast<float*>(result_info.ptr);
+    gauss_type *result = static_cast<gauss_type*>(result_info.ptr);
 
     auto [Nz, Ny, Nx] = shape; // global shape TODO for blocked edition
 
     int64_t
         kernel_size = kernel_info.size,
         padding = kernel_size / 2, // kx should always be odd
-        // Partial shape 
+        // Partial shape
         Pz = voxels_info.shape[0],
         Py = voxels_info.shape[1],
         Px = voxels_info.shape[2],
@@ -49,19 +50,24 @@ void gauss_filter_par_cpu(const py::array_t<mask_type> np_voxels,
         Rz = result_info.shape[0],
         Ry = result_info.shape[1],
         Rx = result_info.shape[2];
-    
-    float 
-        *tmp0 = (float *) calloc(Rz*Ry*Rx, sizeof(float)),
-        *tmp1 = (float *) calloc(Rz*Ry*Rx, sizeof(float));
-    
+
+    assert(kernel_size % 2 == 1);
+
+    gauss_type
+        *tmp0 = (gauss_type *) calloc(Rz*Ry*Rx, sizeof(gauss_type)),
+        *tmp1 = (gauss_type *) calloc(Rz*Ry*Rx, sizeof(gauss_type));
+
     #pragma omp parallel for
     for (int64_t i = 0; i < voxels_info.size; i++) {
-        tmp0[i] = voxels[i];
+        tmp0[i] = voxels[i] ? 1 : 0;
     }
+
     uint64_t iters = 3 * reps; // 1 pass for each dimension
+    const int64_t strides[3] = {Py*Px,Px,1};
+    const int64_t N[3] = {Pz,Py,Px};
 
     for (uint64_t rep = 0; rep < iters; rep++) {
-        float *tin, *tout;
+        gauss_type *tin, *tout;
         if (rep % 2 == 1) {
             tin = tmp1;
             tout = tmp0;
@@ -69,30 +75,32 @@ void gauss_filter_par_cpu(const py::array_t<mask_type> np_voxels,
             tin = tmp0;
             tout = tmp1;
         }
+        int64_t dim = rep % 3;
 
-        #pragma omp parallel for collapse(3)
+        #pragma omp parallel for// collapse(3)
         for (int64_t z = 0; z < Pz; z++) {
             for (int64_t y = 0; y < Py; y++) {
                 for (int64_t x = 0; x < Px; x++) {
-                    float sum = 0;
-                    int64_t dim = rep % 3;
+                    int64_t output_index = z*strides[0] + y*strides[1] + x*strides[2];
+                    int64_t X[3] = {z,y,x};
 
-                    const int64_t output_index = z*Ry*Rx + y*Rx + x;
-                    const int64_t strides[3] = {Py*Px,Px,1};
-                    const int64_t X[3] = {z,y,x};
-                    const int64_t N[3] = {Pz,Py,Px};
-
-                    const int64_t stride = strides[dim], i_start = -min(padding,X[dim]-1), i_end = min(padding,N[dim]-X[dim]-1);
+                    int64_t
+                        stride = strides[dim],
+                        i_start = -min(padding,X[dim]),
+                        i_end = min(padding,N[dim]-X[dim]-1);
 
                     auto mask_value = voxels[output_index];
-                    if (mask_value) {
+                    if (0 && dim % 3 == 2 && mask_value) {
                         tout[output_index] = 1;
                     } else {
+                        gauss_type sum = 0;
+
                         #pragma omp simd reduction(+:sum)
                         for (int64_t i = i_start; i <= i_end; i++) {
-                            uint64_t voxel_index = output_index + stride*i;
+                            int64_t voxel_index = output_index + stride*i;
                             sum += tin[voxel_index] * kernel[i+padding];
                         }
+
                         tout[output_index] = sum;
                     }
                 }
@@ -100,7 +108,7 @@ void gauss_filter_par_cpu(const py::array_t<mask_type> np_voxels,
         }
     }
 
-    memcpy(result, iters % 2 == 1 ? tmp1 : tmp0, Rz*Ry*Rx * sizeof(float));
+    memcpy(result, iters % 2 == 1 ? tmp1 : tmp0, Rz*Ry*Rx * sizeof(gauss_type));
     free(tmp0);
     free(tmp1);
 }