This repository has been archived by the owner on Feb 3, 2020. It is now read-only.

Commit

Merge branch 'release/v0.1'
KristofferC committed Feb 3, 2015
2 parents 6e7d4df + 4ddc70e commit 2cc2442
Showing 5 changed files with 141 additions and 22 deletions.
29 changes: 24 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
@@ -4,14 +4,28 @@ Kd tree for Julia.

[![Build Status](https://travis-ci.org/KristofferC/KDtree.jl.svg)](https://travis-ci.org/KristofferC/KDtree.jl) [![Coverage Status](https://coveralls.io/repos/KristofferC/KDtree.jl/badge.svg)](https://coveralls.io/r/KristofferC/KDtree.jl)

Currently supports KNN-search and finding all points inside a hypersphere centered at a given point. Currently only Euclidean distance is supported.

Some care has been taken with regard to performance. For example, the tree is not implemented as nodes pointing to other nodes but as an ensemble of densely packed arrays. This should give better cache locality. The downside of this storage method is that the tree is immutable and new data cannot be inserted after it has been created.
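The packed-array layout described above can be sketched as follows (an illustrative Python sketch of the general technique, not the package's actual field names; the Julia code in this repo targets the 0.3-era language):

```python
# Heap-style implicit binary tree: with 1-based node ids, node i's
# children sit at 2i and 2i + 1, so no child pointers are needed and
# the per-node data (split values, split dimensions, ...) can live in
# dense, cache-friendly arrays.
def left_child(i):
    return 2 * i

def right_child(i):
    return 2 * i + 1

def parent(i):
    return i // 2
```

Navigation is pure index arithmetic, which is why the structure is immutable: inserting a node would shift every index below it.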

There are some benchmarks for the creation of the tree and the different searches in the benchmark folder.

Since this is a new project, there are still some obvious improvements; these are listed in the TODO list.

## Author
Kristoffer Carlsson (@KristofferC)

## Examples

In the examples, notice that the module is called `KDtree` while the actual tree type is called `KDTree`. This is because modules and types currently cannot share the same name in Julia.

The tree is created from a matrix of floats of dimension `(n_dim, n_points)`.

### Points inside hyper sphere

Finds all points inside a hypersphere centered at a given point. Returns the indices of these points.

```julia
using KDtree
tree = KDTree(randn(3, 1000))
@@ -30,7 +44,13 @@ gives the indices:
161
```
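For reference, the result of this query is equivalent to a brute-force scan, sketched here in Python (`ball_query` is a hypothetical name for illustration, not the package's API):

```python
import math

def ball_query(points, center, r):
    # Indices of all points strictly within radius r of center.
    # points: a list of coordinate tuples, one tuple per point.
    idxs = []
    for i, p in enumerate(points):
        dist = math.sqrt(sum((a - c) ** 2 for a, c in zip(p, center)))
        if dist < r:
            idxs.append(i)
    return idxs
```

The KD-tree answers the same question without touching every point, by pruning subtrees whose bounding hyperrectangles lie entirely outside the sphere.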

### K-Nearest-Neighbours

Finds the *k* nearest neighbours to a given point. Returns a tuple of two lists with the indices and the distances from the given point, respectively. Both are sorted from smallest to largest distance.

The current implementation is a bit slower than it has to be for very large *k*.
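The contract described above (indices and distances, both sorted by increasing distance) can be pinned down with a brute-force Python sketch (`knn_bruteforce` is an illustrative name, not part of the package):

```python
import math

def knn_bruteforce(points, q, k):
    # Returns (indices, distances) of the k points nearest to q,
    # sorted from smallest to largest distance.
    pairs = sorted((math.dist(p, q), i) for i, p in enumerate(points))
    best = pairs[:k]
    return [i for _, i in best], [d for d, _ in best]
```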

```julia
using KDtree
tree = KDTree(randn(3, 1000))
@@ -45,11 +65,10 @@ gives both the indices and distances:
* Implement a leaf size argument so that a subtree stops splitting once only a certain number of points remain in it.
* Add proper benchmarks, compare with other implementations.
* Add distance measures other than Euclidean distance.
* Use a bounded priority queue for storing the K best points in KNN instead of a linear array (should only matter for large K).
* Proper investigation of memory allocations and where time is spent.
* Throw errors on dimension mismatch in the functions etc.
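The bounded-priority-queue item from the list above can be sketched with Python's `heapq` (one possible implementation of the idea, not the package's code):

```python
import heapq

def k_smallest(dist_index_pairs, k):
    # Keep the k smallest distances seen so far in a max-heap of size k.
    # heapq is a min-heap, so distances are stored negated.  Each update
    # costs O(log k), versus O(k) for inserting into a sorted linear array.
    heap = []
    for dist, idx in dist_index_pairs:
        if len(heap) < k:
            heapq.heappush(heap, (-dist, idx))
        elif dist < -heap[0][0]:
            heapq.heapreplace(heap, (-dist, idx))
    return sorted((-d, i) for d, i in heap)
```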





10 changes: 9 additions & 1 deletion benchmark/bench_build_tree.jl
@@ -22,8 +22,16 @@ end
println(times)

#=
2015-02-02:
[1.6794e-5 6.0335e-5 0.000620148 0.007069198 0.103463751
1.4306e-5 7.8995e-5 0.000901299 0.010201042 0.15665534
1.555e-5 8.5528e-5 0.000956037 0.011532464 0.162782185
1.5239e-5 9.4857e-5 0.001112474 0.01683234 0.182989888
1.6794e-5 0.000107609 0.001225991 0.014654978 0.21553432
1.7727e-5 0.000124714 0.00143872 0.017477369 0.240441345]
2015-02-01:
[4.6477e-5 0.000318537 0.004562246 0.054535408 0.796257552
2.8026e-5 0.000350114 0.00443035 0.076212208 0.952349006
2.895e-5 0.000329126 0.004856197 0.059021987 0.916525079
31 changes: 31 additions & 0 deletions benchmark/bench_knn.jl
@@ -0,0 +1,31 @@
using KDtree

dims = 3
n_points = [10^i for i in 3:6]
ks = [1, 3, 10, 50, 100, 500]

times = Array(Float64, length(ks), length(n_points))

# Compile it
tree = KDTree(randn(2,2))
k_nearest_neighbour(tree, zeros(2), 1)

for (i, k) in enumerate(ks)
for (j, n_point) in enumerate(n_points)
data = randn(dims, n_point)
tree = KDTree(data)
times[i,j] = @elapsed k_nearest_neighbour(tree, zeros(dims), k)
end
end

println(times)

#=
2015-02-02:
[2.3015e-5 2.1771e-5 7.0288e-5 6.0958e-5
2.7368e-5 3.5143e-5 8.957e-5 7.7752e-5
6.0647e-5 5.0072e-5 0.000104499 0.000111029
0.000194691 0.000216772 0.000293591 0.000318472
0.000385027 0.000359835 0.000650316 0.000582517
0.003119404 0.005313561 0.006453713 0.006805152]
=#
30 changes: 30 additions & 0 deletions benchmark/bench_query_ball.jl
@@ -0,0 +1,30 @@
using KDtree

dims = 3
n_points = [10^i for i in 3:5]
rs = [0.1 0.2 0.3 0.4]

times = Array(Float64, length(rs), length(n_points))

# Compile it
tree = KDTree(randn(2,2))
query_ball_point(tree, zeros(2), 0.1)


for (i, r) in enumerate(rs)
for (j, n_point) in enumerate(n_points)
data = randn(dims, n_point)
tree = KDTree(data)
times[i,j] = @elapsed query_ball_point(tree, zeros(dims), r)
end
end

println(times)

#=
2015-02-03:
[2.1149e-5 3.2966e-5 8.3661e-5
2.146e-5 5.1628e-5 0.000229523
2.7369e-5 0.000116005 0.000453138
4.012e-5 0.000204643 0.000789959]
=#
63 changes: 47 additions & 16 deletions src/kd_tree.jl
@@ -1,4 +1,3 @@

# Todo: update to Minkowski, p = 1, inf
function euclidean_distance{T <: FloatingPoint}(point_1::Array{T, 1},
point_2::Array{T, 1})
@@ -111,6 +110,7 @@ function KDTree{T <: FloatingPoint}(data::Matrix{T})
hyper_recs, n_internal_nodes, indices)
end


# Recursive function to build the tree.
# Calculates what dimension has the maximum spread,
# and how many points to send to each side.
@@ -156,19 +156,20 @@ function build_KDTree{T <: FloatingPoint}(index::Int,


# Decide where to split
# k = floor(Integer, log2(n_points)) for v 0.4
k = ifloor(log2(n_points)) # deprecated in v 0.4; prevpow2 doesn't give the exponent
rest = n_points - 2^k

if rest > 2^(k-1)
    mid_idx = 2^k
else
    mid_idx = 2^(k-1) + rest
end

# select_spec! works like nth_element in C++:
# it partially sorts the indices in the maximum-spread dimension s.t.
# data[split_dim, a] <= data[split_dim, b] for all a <= mid_idx < b
select_spec!(perm, mid_idx, 1, length(perm), data, split_dim)

split = data[split_dim, perm[mid_idx]]
split_vals[index] = split
@@ -234,15 +235,13 @@ function _k_nearest_neighbour{T <: FloatingPoint}(tree::KDTree,
end

    if point[tree.split_dims[index]] < tree.split_vals[index]
        _k_nearest_neighbour(tree, point, k, best_idxs, best_dists, get_left_node(index))
        _k_nearest_neighbour(tree, point, k, best_idxs, best_dists, get_right_node(index))
    else
        _k_nearest_neighbour(tree, point, k, best_idxs, best_dists, get_right_node(index))
        _k_nearest_neighbour(tree, point, k, best_idxs, best_dists, get_left_node(index))
    end
end


# Returns the indices for all points in the tree inside a
# hypersphere of a given point with a given radius
function query_ball_point{T <: FloatingPoint}(tree::KDTree,
@@ -266,7 +265,7 @@ function traverse_check{T <: FloatingPoint}(tree::KDTree,
min_d, max_d = get_min_max_distance(tree.hyper_recs[index], point)
if min_d > r # hypersphere does not intersect the hyperrectangle, skip the whole subtree
return
elseif max_d < r # hyperrectangle is entirely inside the hypersphere
traverse_no_check(tree, index, idx_in_ball)
elseif is_leaf_node(tree, index)
if euclidean_distance(get_point(tree, index), point) < r
@@ -293,6 +292,38 @@ function traverse_no_check(tree::KDTree, index::Int, idx_in_ball::Vector{Int})
end
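The pruning in `traverse_check` rests on the minimum and maximum Euclidean distance between the query point and an axis-aligned hyperrectangle. A Python sketch of that computation (a hypothetical helper mirroring the logic, not the package's `get_min_max_distance`):

```python
def min_max_distance(lo, hi, p):
    # lo, hi: per-dimension bounds of the hyperrectangle; p: query point.
    # Returns (min_d, max_d): the smallest and largest distances from p
    # to any point of the rectangle.
    min_sq = 0.0
    max_sq = 0.0
    for l, h, x in zip(lo, hi, p):
        # nearest point: x clamped into [l, h]; zero if x is inside
        if x < l:
            min_sq += (l - x) ** 2
        elif x > h:
            min_sq += (x - h) ** 2
        # farthest point: whichever bound is farther from x
        max_sq += max(abs(x - l), abs(h - x)) ** 2
    return min_sq ** 0.5, max_sq ** 0.5
```

If `min_d > r` the rectangle cannot contain any hit and the subtree is skipped; if `max_d < r` every point in the subtree is a hit and can be collected without further distance checks.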


# Taken from https://github.com/JuliaLang/julia/blob/v0.3.5/base/sort.jl
# and modified because I couldn't figure out how to get rid of
# the memory consumption when I passed in a new anonymous function
# to the "by" argument in each node. I also removed the return value.
function select_spec!{T <: FloatingPoint}(v::AbstractVector, k::Int, lo::Int, hi::Int, data::Matrix{T}, dim::Int)
lo <= k <= hi || error("select index $k is out of range $lo:$hi")
@inbounds while lo < hi
if hi-lo == 1
if data[dim, v[hi]] < data[dim, v[lo]]
v[lo], v[hi] = v[hi], v[lo]
end
return
end
pivot = v[(lo+hi)>>>1]
i, j = lo, hi
while true
while data[dim, v[i]] < data[dim, pivot]; i += 1; end
while data[dim, pivot] < data[dim, v[j]] ; j -= 1; end
i <= j || break
v[i], v[j] = v[j], v[i]
i += 1; j -= 1
end
if k <= j
hi = j
elseif i <= k
lo = i
else
return
end
end
return
end
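`select_spec!` above is a Hoare-partition quickselect, specialized to index into `data` directly so no closure is allocated per call. The same algorithm in a generic Python sketch (illustrative, with a `key` function standing in for the column lookup):

```python
def quickselect(v, k, key):
    # Partially sorts v in place so that v[k] ends up holding the element
    # that a full sort by key would place at 0-based index k.
    lo, hi = 0, len(v) - 1
    while lo < hi:
        if hi - lo == 1:
            if key(v[hi]) < key(v[lo]):
                v[lo], v[hi] = v[hi], v[lo]
            return
        pivot = key(v[(lo + hi) // 2])
        i, j = lo, hi
        while True:
            while key(v[i]) < pivot:
                i += 1
            while pivot < key(v[j]):
                j -= 1
            if i > j:
                break
            v[i], v[j] = v[j], v[i]
            i += 1
            j -= 1
        if k <= j:
            hi = j
        elif i <= k:
            lo = i
        else:
            return
```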


#=
