implement multi-threading for covariance and gradient matrices (WIP) #129

Open · wants to merge 15 commits into base: master
13 changes: 8 additions & 5 deletions .travis.yml
@@ -5,27 +5,30 @@ os:
- osx
julia:
- 1.0
- 1.1
- 1.2
- 1.3
- 1.4
- 1.5
- nightly
notifications:
email: false
git:
depth: 99999999
codecov: true
coveralls: true
env:
- JULIA_NUM_THREADS=1
- JULIA_NUM_THREADS=3

# uncomment the following lines to allow failures on nightly julia
# (tests will run but not make your overall status red)
matrix:
allow_failures:
- julia: nightly


# uncomment following lines to deploy documentation
jobs:
include:
- stage: "Documentation"
julia: 1.3
julia: 1.5
os: linux
script:
- julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
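The new `env` matrix runs the whole CI suite twice, once with `JULIA_NUM_THREADS=1` and once with `JULIA_NUM_THREADS=3`, so the threaded code paths below are exercised alongside the serial ones. A minimal sketch, not part of this diff, of how a test can confirm which configuration it is running under:

```julia
using Test

# JULIA_NUM_THREADS is read once at Julia startup;
# Threads.nthreads() reports the resulting thread count.
@testset "thread configuration" begin
    @test Threads.nthreads() >= 1
    @info "tests running with $(Threads.nthreads()) thread(s)"
end
```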
5 changes: 2 additions & 3 deletions Project.toml
@@ -22,7 +22,6 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
Distances = "0.7, 0.8, 0.9, 0.10"
@@ -39,13 +38,13 @@ ScikitLearnBase = "0.4, 0.5"
SpecialFunctions = "0.7, 0.8, 0.9, 0.10, 1.0"
StaticArrays = "0.6, 0.9, 0.12, 1.0"
StatsFuns = "0.7, 0.8, 0.9"
Zygote = "0.3, 0.4, 0.5"
julia = "1"

[extras]
Calculus = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Calculus", "Distributions", "Test"]
test = ["Calculus", "Distributions", "Test", "Plots"]
7 changes: 3 additions & 4 deletions perf/benchmarks/benchmark_julia.jl
@@ -2,7 +2,6 @@ using GaussianProcesses
using GaussianProcesses: update_mll_and_dmll!, update_cK!, FullCovariancePrecompute
using BenchmarkTools
using DataFrames
using JLD
using Random
using CSV
using LinearAlgebra
@@ -25,9 +24,9 @@ kerns = Dict(

function benchmark_kernel(group, kern)
XY_df = CSV.read("simdata.csv")
Xt = XY_df[:,[:x1, :x2, :x3, :x4, :x5, :x6, :x7, :x8, :x9, :x10]]
Xt = XY_df[!,[:x1, :x2, :x3, :x4, :x5, :x6, :x7, :x8, :x9, :x10]]
X = Matrix(transpose(Matrix(Xt)))
Y = XY_df[:Y]
Y = XY_df[!,:Y]
@assert length(Y) == nt
buffer = FullCovariancePrecompute(nt)
gp = GP(X, Y, MeanConst(0.0), kern, 0.3)
@@ -41,7 +40,7 @@ Random.seed!(1)
X = randn(d, nt) # Training data
Y = randn(nt)
XY_df = DataFrame(X')
XY_df[:Y] = Y
XY_df[!,:Y] = Y
XY_df
CSV.write("simdata.csv", XY_df; writeheader=true)

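The `df[:col]` to `df[!, :col]` changes track the DataFrames.jl indexing API, where `!` returns the stored column without copying while `:` returns a copy. A small illustrative sketch with toy data, not taken from the benchmark:

```julia
using DataFrames

df = DataFrame(Y = [1.0, 2.0, 3.0])
y_ref  = df[!, :Y]      # reference to the stored column, no copy
y_copy = df[:, :Y]      # an independent copy of the column
df[!, :Z] = y_ref .^ 2  # `df[!, :col] = v` adds or replaces a column
```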
2 changes: 1 addition & 1 deletion perf/benchmarks/simdata.csv
@@ -881,7 +881,7 @@ x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,Y
-1.0416308788936433,1.0764502623098364,-0.12319708874179446,0.14880361426456007,-1.7038608214015492,0.35357219227840286,-1.3843702199740027,1.0142042301255974,0.9245312280717214,1.099152379597873,-0.587463481680643
-1.763399479522698,-0.7030723312348475,-0.5574483026629544,-0.09608879800584239,0.3176261792338285,-0.20272266997013266,0.18622096875564312,0.6294657759626493,-0.10244936970357577,0.23663769298864948,0.7406551861623611
0.43019486974216187,-0.18507331077938563,0.07765177448723373,-1.2886594714017927,0.3000179724372475,-0.6756828409744344,0.5205653112961023,0.5992113497605802,-0.2148486065467117,-0.8796170963811727,-1.6709706978118604
-0.06968393771658529,-0.29446488849498836,-1.178897573008452,-2.2104310768977533,-0.04444574149584955,-0.06204780061279588,0.8715184026877638,-8562630034363779e-20,1.1647180675461244,-1.4303938546548505,0.4052169391057248
-0.06968393771658529,-0.29446488849498836,-1.178897573008452,-2.2104310768977533,-0.04444574149584955,-0.06204780061279588,0.8715184026877638,-8.562630034363779e-5,1.1647180675461244,-1.4303938546548505,0.4052169391057248
0.339194308087732,-1.3015505596207104,0.11056959664042851,-0.1414289861997376,0.6415420326872232,-0.06353310081708839,2.905584320243272,-0.961340377013751,1.6715479030821394,1.4600672602543598,0.021298971040593834
0.9480877902358498,-0.9879469250412419,0.11241920457577553,1.5101268864816368,1.6020452589452194,1.9071994569399975,-0.5999507172192088,0.864986716084598,-0.6996497761137089,-0.059408142918407084,-1.1374402301506277
-0.6528715312205515,-0.22290347754944556,-2.0941228499988314,-1.316483492719756,-1.4635399844129666,-1.4512243407894792,0.19695408048977622,0.8025987608791353,2.1407804010306823,-0.17627995141407848,-0.578973002813413
8 changes: 6 additions & 2 deletions src/GPA.jl
@@ -129,8 +129,8 @@ end

function get_L_bar!(L_bar::AbstractMatrix, dl_df::AbstractVector, v::AbstractVector, cK::PDMat)
fill!(L_bar, 0.0)
BLAS.ger!(1.0, dl_df, v, L_bar)
tril!(L_bar)
BLAS.ger!(1.0, dl_df, v, L_bar) # L_bar = dl_df*v'
tril!(L_bar) # lower triangular
# ToDo:
# the following two steps allocate memory
# and are fickle, reaching into the internal
@@ -166,8 +166,12 @@ function dll_kern!(dll::AbstractVector, gp::GPBase, precomp::FullCovMCMCPrecompu
get_L_bar!(L_bar, precomp.dl_df, gp.v, gp.cK)
nobs = gp.nobs
@inbounds for i in 1:nobs
# dmll_kern! assumes that ααinvcKI is symmetric and operates
# on the upper triangular component of it,
# but L_bar is lower triangular, so we need to make this adjustment.
L_bar[i,i] *= 2
end
LinearAlgebra.copytri!(L_bar, 'L')
# in GPA, L_bar plays the role of ααinvcKI
return dmll_kern!(dll, gp.kernel, gp.x, gp.data, L_bar, covstrat)
end
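To make the comment above concrete: `L_bar` starts out lower triangular, and the consumer sums the upper triangle while halving the diagonal, so doubling the diagonal before mirroring with `copytri!` preserves the totals. A toy check of that convention, with a small matrix assumed purely for illustration:

```julia
using LinearAlgebra

L_bar = tril([1.0 2.0; 3.0 4.0])   # lower-triangular input, as produced by get_L_bar!
for i in 1:size(L_bar, 1)
    L_bar[i, i] *= 2               # compensate for the /2 applied to diagonal terms
end
LinearAlgebra.copytri!(L_bar, 'L') # mirror the lower triangle into the upper
@assert issymmetric(L_bar)
```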
53 changes: 34 additions & 19 deletions src/GPE.jl
@@ -211,34 +211,49 @@ function update_mll!(gp::GPE; noise::Bool=true, domean::Bool=true, kern::Bool=tr
gp
end

"""
dmll_kern!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData, ααinvcKI::AbstractMatrix)

Derivative of the marginal log likelihood log p(Y|θ) with respect to the kernel hyperparameters.
"""
function dmll_kern!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData,
function _dmll_kern_row!(dmll, buf, k, ααinvcKI, X, data, j, dim, nparams)
# diagonal
dKij_dθ!(buf, k, X, X, data, j, j, dim, nparams)
@inbounds for iparam in 1:nparams
dmll[iparam] += buf[iparam] * ααinvcKI[j, j] / 2.0
end
# off-diagonal
@inbounds for i in 1:j-1
dKij_dθ!(buf, k, X, X, data, i, j, dim, nparams)
@simd for iparam in 1:nparams
dmll[iparam] += buf[iparam] * ααinvcKI[i, j]
end
end
end
function dmll_kern_loop!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData,
ααinvcKI::Matrix{Float64}, covstrat::CovarianceStrategy)
dim, nobs = size(X)
nparams = num_params(k)
@assert nparams == length(dmll)
dK_buffer = Vector{Float64}(undef, nparams)
dmll[:] .= 0.0
@inbounds for j in 1:nobs
# diagonal
dKij_dθ!(dK_buffer, k, X, X, data, j, j, dim, nparams)
for iparam in 1:nparams
dmll[iparam] += dK_buffer[iparam] * ααinvcKI[j, j] / 2.0
end
# off-diagonal
for i in j+1:nobs
dKij_dθ!(dK_buffer, k, X, X, data, i, j, dim, nparams)
@simd for iparam in 1:nparams
dmll[iparam] += dK_buffer[iparam] * ααinvcKI[i, j]
end
end
# make a copy per thread for objects that are potentially not thread-safe:
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
buffercopies = [similar(dK_buffer) for _ in 1:Threads.nthreads()]
dmllcopies = [deepcopy(dmll) for _ in 1:Threads.nthreads()]

@inbounds Threads.@threads for j in 1:nobs
kthread = kcopies[Threads.threadid()]
bufthread = buffercopies[Threads.threadid()]
dmllthread = dmllcopies[Threads.threadid()]
_dmll_kern_row!(dmllthread, bufthread, kthread,
ααinvcKI, X, data, j, dim, nparams)
end

dmll[:] = sum(dmllcopies) # sum up the results from all threads
return dmll
end
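The per-thread `deepcopy`s above serve two purposes: each thread accumulates into its own gradient vector, avoiding data races on `dmll`, and kernels whose internal caches may not be thread-safe get a private copy. A self-contained sketch of the same accumulate-then-reduce pattern, using a toy function rather than anything from this package:

```julia
# Each thread writes only to its own slot; results are reduced serially at the end.
function threaded_sum(f, n)
    partials = zeros(Threads.nthreads())
    Threads.@threads for j in 1:n
        partials[Threads.threadid()] += f(j)
    end
    return sum(partials)
end

threaded_sum(j -> j^2, 10)  # 385
```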
"""
dmll_kern!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData, ααinvcKI::AbstractMatrix, covstrat::CovarianceStrategy)

Derivative of the marginal log likelihood log p(Y|θ) with respect to the kernel hyperparameters.
"""
dmll_kern!(dmll, k, X, data, ααinvcKI, covstrat) = dmll_kern_loop!(dmll, k, X, data, ααinvcKI, covstrat)
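For reference, the quantity these loops accumulate is the standard marginal-likelihood gradient, with `ααinvcKI` standing in for αα⊤ − K⁻¹; this is the textbook identity, stated here rather than taken from the diff:

```latex
% With \alpha = K^{-1} y, the gradient w.r.t. kernel hyperparameter \theta_p is
\frac{\partial}{\partial \theta_p} \log p(y \mid \theta)
  = \frac{1}{2} \operatorname{tr}\!\left( \left( \alpha \alpha^{\top} - K^{-1} \right)
    \frac{\partial K}{\partial \theta_p} \right)
```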

""" AbstractGradientPrecompute types hold results of
pre-computations of kernel gradients.
4 changes: 2 additions & 2 deletions src/GaussianProcesses.jl
@@ -1,6 +1,6 @@
module GaussianProcesses

using Optim, PDMats, ElasticPDMats, ElasticArrays, Distances, FastGaussQuadrature, RecipesBase, Distributions, Zygote, ProgressMeter
using Optim, PDMats, ElasticPDMats, ElasticArrays, Distances, FastGaussQuadrature, RecipesBase, Distributions
using StaticArrays
using StatsFuns, SpecialFunctions

@@ -26,7 +26,7 @@ const invΦ = norminvcdf

# all package code should be included here
include("means/means.jl")
include("kernels/kernels.jl")
include("covariance/kernels.jl")
include("likelihoods/likelihoods.jl")
include("common.jl")
include("utils.jl")
File renamed without changes.
File renamed without changes.
150 changes: 150 additions & 0 deletions src/covariance/covariance.jl
@@ -0,0 +1,150 @@
"""
cov(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix)

Create covariance matrix from kernel `k` and matrices of observations `X1` and `X2`, where
each column is an observation.
"""
function cov(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())
dim1, nobs1 = size(X1)
dim2, nobs2 = size(X2)
dim1==dim2 || throw(ArgumentError("X1 and X2 must have same dimension"))
cK = Array{promote_type(eltype(X1), eltype(X2))}(undef, nobs1, nobs2)
cov!(cK, k, X1, X2, data)
end

function _cov_row!(cK, k, X::AbstractMatrix, data, j, dim)
cK[j,j] = cov_ij(k, X, X, data, j, j, dim)
@inbounds for i in 1:j-1
cK[i,j] = cov_ij(k, X, X, data, i, j, dim)
cK[j,i] = cK[i,j]
end
end
function cov_loop!(cK::AbstractMatrix, k::Kernel, X::AbstractMatrix, data::KernelData)
dim, nobs = size(X)
(nobs,nobs) == size(cK) || throw(ArgumentError("cK has size $(size(cK)) and X has size $(size(X))"))
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()] # in case k is not threadsafe (e.g. ADkernel)
@inbounds Threads.@threads for j in 1:nobs
kthread = kcopies[Threads.threadid()]
_cov_row!(cK, kthread, X, data, j, dim)
end
return cK
end
function _cov_row!(cK, k, X1::AbstractMatrix, X2::AbstractMatrix, data, i, dim, nobs2)
@inbounds for j in 1:nobs2
cK[i,j] = cov_ij(k, X1, X2, data, i, j, dim)
end
end
function cov_loop!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData)
if X1 === X2
return cov_loop!(cK, k, X1, data)
end
dim1, nobs1 = size(X1)
dim2, nobs2 = size(X2)
dim1==dim2 || throw(ArgumentError("X1 and X2 must have same dimension"))
dim = size(X1, 1)
(nobs1,nobs2) == size(cK) || throw(ArgumentError("cK has size $(size(cK)) X1 $(size(X1)) and X2 $(size(X2))"))
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
@inbounds Threads.@threads for i in 1:nobs1
kthread = kcopies[Threads.threadid()]
_cov_row!(cK, kthread, X1, X2, data, i, dim, nobs2)
end
return cK
end
"""
cov(k::Kernel, X::AbstractMatrix[, data::KernelData = EmptyData()])

Create covariance matrix from kernel `k`, matrix of observations `X`, where each column is
an observation, and kernel data `data` constructed from input observations.
"""
cov(k::Kernel, X::AbstractMatrix, data::KernelData=EmptyData()) = cov(k, X, X, data)

function cov_loop_generic!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData)
dim, nobs1 = size(X1)
dim, nobs2 = size(X2)
# Generic implementation using broadcasting. A more efficient multithreaded
# implementation is provided above.
cK .= cov_ij.(Ref(k), Ref(X1), Ref(X2), Ref(data), 1:nobs1, (1:nobs2)', dim)
end
"""
cov!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())

Like [`cov(k, X1, X2)`](@ref), but stores the result in `cK` rather than a new matrix.
"""
cov!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix=X1, data::KernelData=EmptyData()) = cov_loop!(cK, k, X1, X2, data)
cov!(cK::AbstractMatrix, k::Kernel, X::AbstractMatrix, data::KernelData=EmptyData()) = cov!(cK, k, X, X, data)
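A usage sketch for the threaded covariance path, assuming the `SEIso` kernel constructor exported by this package:

```julia
using GaussianProcesses: SEIso, cov

k = SEIso(0.0, 0.0)   # isotropic squared-exponential, log-lengthscale and log-variance 0
X = randn(3, 100)     # 3-dimensional inputs, one observation per column
K = cov(k, X)         # 100×100 covariance matrix, filled by the threaded cov_loop!
```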

############################
##### Kernel Gradients #####
############################

function _grad_slice_row!(dK, k, X::AbstractMatrix, data, j, p, dim)
dK[j,j] = dKij_dθp(k,X,X,data,j,j,p,dim)
@inbounds @simd for i in 1:(j-1)
dK[i,j] = dKij_dθp(k,X,X,data,i,j,p,dim)
dK[j,i] = dK[i,j]
end
end
function grad_slice_loop!(dK::AbstractMatrix, k::Kernel, X::AbstractMatrix, p::Int, data::KernelData)
dim, nobs = size(X)
(nobs,nobs) == size(dK) || throw(ArgumentError("dK has size $(size(dK)) and X has size $(size(X))"))
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
@inbounds Threads.@threads for j in 1:nobs
kthread = kcopies[Threads.threadid()]
_grad_slice_row!(dK, kthread, X, data, j, p, dim)
end
return dK
end
function _grad_slice_row!(dK, k, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData, i, p, dim, nobs2)
@inbounds @simd for j in 1:nobs2
dK[i,j] = dKij_dθp(k,X1,X2,data,i,j,p,dim)
end
end
function grad_slice_loop!(dK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData)
if X1 === X2
return grad_slice_loop!(dK, k, X1, p, data)
end
dim1, nobs1 = size(X1)
dim2, nobs2 = size(X2)
dim1==dim2 || throw(ArgumentError("X1 and X2 must have same dimension"))
(nobs1,nobs2) == size(dK) || throw(ArgumentError("dK has size $(size(dK)) X1 $(size(X1)) and X2 $(size(X2))"))
dim = dim1
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
@inbounds Threads.@threads for i in 1:nobs1
kthread = kcopies[Threads.threadid()]
_grad_slice_row!(dK, kthread, X1, X2, data, i, p, dim, nobs2)
end
return dK
end
function grad_slice_loop_generic!(dK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData)
dim, nobs1 = size(X1)
dim, nobs2 = size(X2)
# Generic implementation using broadcasting. A more efficient multithreaded
# implementation is provided above.
dK .= dKij_dθp.(Ref(k), Ref(X1), Ref(X2), Ref(data), 1:nobs1, (1:nobs2)', p, dim)
end
grad_slice!(dK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData=EmptyData()) = grad_slice_loop!(dK, k, X1, X2, p, data)

# Calculates the stack [dk / dθᵢ] of kernel matrix gradients
function grad_stack!(stack::AbstractArray, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())
@inbounds for p in 1:num_params(k)
grad_slice!(view(stack, :, :, p), k, X1, X2, p, data)
end
stack
end

# grad_stack!(stack::AbstractArray, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix) =
# grad_stack!(stack, k, X1, X2, KernelData(k, X1, X2))

# grad_stack(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix) = grad_stack(k, X1, X2, KernelData(k, X1, X2))

function grad_stack(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())
nobs1 = size(X1, 2)
nobs2 = size(X2, 2)
stack = Array{promote_type(eltype(X1), eltype(X2))}(undef, nobs1, nobs2, num_params(k))
grad_stack!(stack, k, X1, X2, data)
end


@inline function dKij_dθp(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData, i::Int, j::Int, p::Int, dim::Int)
return dKij_dθp(k, X1, X2, i, j, p, dim)
end
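And a matching sketch for the gradient slices, looping over hyperparameters the way `grad_stack!` does; these are internal names, imported explicitly for illustration:

```julia
using GaussianProcesses: SEIso, grad_slice!, num_params

k = SEIso(0.0, 0.0)
X = randn(2, 50)
dK = Matrix{Float64}(undef, 50, 50)
for p in 1:num_params(k)
    grad_slice!(dK, k, X, X, p)   # fills dK with the slice ∂K/∂θ_p
end
```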
File renamed without changes.
@@ -58,8 +58,8 @@ function free(k::FixedKernel, par::Symbol)
return FixedKernel(k.kernel, free)
end

function grad_slice!(dK::AbstractMatrix, k::FixedKernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData, p::Int)
return grad_slice!(dK, k.kernel, X1, X2, data, k.free[p])
function grad_slice!(dK::AbstractMatrix, k::FixedKernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData)
return grad_slice!(dK, k.kernel, X1, X2, k.free[p], data)
end
@inline function dKij_dθp(fk::FixedKernel,X1::AbstractMatrix, X2::AbstractMatrix,data::KernelData,i::Int,j::Int,p::Int,dim::Int)
return dKij_dθp(fk.kernel, X1, X2, data, i, j, fk.free[p], dim)