implement multi-threading for covariance and gradient matrices (WIP) #129

Open · wants to merge 15 commits into base: master
13 changes: 8 additions & 5 deletions .travis.yml
@@ -5,27 +5,30 @@ os:
- osx
julia:
- 1.0
- 1.1
- 1.2
- 1.3
- 1.4
- 1.5
- nightly
notifications:
email: false
git:
depth: 99999999
codecov: true
coveralls: true
env:
- JULIA_NUM_THREADS=1
- JULIA_NUM_THREADS=3

# uncomment the following lines to allow failures on nightly julia
# (tests will run but not make your overall status red)
matrix:
allow_failures:
- julia: nightly


# uncomment following lines to deploy documentation
jobs:
include:
- stage: "Documentation"
julia: 1.3
julia: 1.5
os: linux
script:
- julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
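The new `env` matrix runs the whole CI suite twice, once with `JULIA_NUM_THREADS=1` and once with `JULIA_NUM_THREADS=3`, so the threaded code paths below are exercised alongside the serial ones. A minimal sketch, not part of this diff, of how a test can confirm which configuration it is running under:

```julia
using Test

# JULIA_NUM_THREADS is read once at Julia startup;
# Threads.nthreads() reports the resulting thread count.
@testset "thread configuration" begin
    @test Threads.nthreads() >= 1
    @info "tests running with $(Threads.nthreads()) thread(s)"
end
```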
5 changes: 2 additions & 3 deletions Project.toml
@@ -22,7 +22,6 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
Distances = "0.7, 0.8, 0.9, 0.10"
@@ -39,13 +38,13 @@ ScikitLearnBase = "0.4, 0.5"
SpecialFunctions = "0.7, 0.8, 0.9, 0.10, 1.0"
StaticArrays = "0.6, 0.9, 0.12, 1.0"
StatsFuns = "0.7, 0.8, 0.9"
Zygote = "0.3, 0.4, 0.5"
julia = "1"

[extras]
Calculus = "49dc2e85-a5d0-5ad3-a950-438e2897f1b9"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Calculus", "Distributions", "Test"]
test = ["Calculus", "Distributions", "Test", "Plots"]
7 changes: 3 additions & 4 deletions perf/benchmarks/benchmark_julia.jl
@@ -2,7 +2,6 @@ using GaussianProcesses
using GaussianProcesses: update_mll_and_dmll!, update_cK!, FullCovariancePrecompute
using BenchmarkTools
using DataFrames
using JLD
using Random
using CSV
using LinearAlgebra
@@ -25,9 +24,9 @@ kerns = Dict(

function benchmark_kernel(group, kern)
XY_df = CSV.read("simdata.csv")
Xt = XY_df[:,[:x1, :x2, :x3, :x4, :x5, :x6, :x7, :x8, :x9, :x10]]
Xt = XY_df[!,[:x1, :x2, :x3, :x4, :x5, :x6, :x7, :x8, :x9, :x10]]
X = Matrix(transpose(Matrix(Xt)))
Y = XY_df[:Y]
Y = XY_df[!,:Y]
@assert length(Y) == nt
buffer = FullCovariancePrecompute(nt)
gp = GP(X, Y, MeanConst(0.0), kern, 0.3)
@@ -41,7 +40,7 @@ Random.seed!(1)
X = randn(d, nt) # Training data
Y = randn(nt)
XY_df = DataFrame(X')
XY_df[:Y] = Y
XY_df[!,:Y] = Y
XY_df
CSV.write("simdata.csv", XY_df; writeheader=true)

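The `df[:col]` to `df[!, :col]` changes track the DataFrames.jl indexing API, where `!` returns the stored column without copying while `:` returns a copy. A small illustrative sketch with toy data, not taken from the benchmark:

```julia
using DataFrames

df = DataFrame(Y = [1.0, 2.0, 3.0])
y_ref  = df[!, :Y]      # reference to the stored column, no copy
y_copy = df[:, :Y]      # an independent copy of the column
df[!, :Z] = y_ref .^ 2  # `df[!, :col] = v` adds or replaces a column
```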
2 changes: 1 addition & 1 deletion perf/benchmarks/simdata.csv
@@ -881,7 +881,7 @@ x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,Y
-1.0416308788936433,1.0764502623098364,-0.12319708874179446,0.14880361426456007,-1.7038608214015492,0.35357219227840286,-1.3843702199740027,1.0142042301255974,0.9245312280717214,1.099152379597873,-0.587463481680643
-1.763399479522698,-0.7030723312348475,-0.5574483026629544,-0.09608879800584239,0.3176261792338285,-0.20272266997013266,0.18622096875564312,0.6294657759626493,-0.10244936970357577,0.23663769298864948,0.7406551861623611
0.43019486974216187,-0.18507331077938563,0.07765177448723373,-1.2886594714017927,0.3000179724372475,-0.6756828409744344,0.5205653112961023,0.5992113497605802,-0.2148486065467117,-0.8796170963811727,-1.6709706978118604
-0.06968393771658529,-0.29446488849498836,-1.178897573008452,-2.2104310768977533,-0.04444574149584955,-0.06204780061279588,0.8715184026877638,-8562630034363779e-20,1.1647180675461244,-1.4303938546548505,0.4052169391057248
-0.06968393771658529,-0.29446488849498836,-1.178897573008452,-2.2104310768977533,-0.04444574149584955,-0.06204780061279588,0.8715184026877638,-8.562630034363779e-5,1.1647180675461244,-1.4303938546548505,0.4052169391057248
0.339194308087732,-1.3015505596207104,0.11056959664042851,-0.1414289861997376,0.6415420326872232,-0.06353310081708839,2.905584320243272,-0.961340377013751,1.6715479030821394,1.4600672602543598,0.021298971040593834
0.9480877902358498,-0.9879469250412419,0.11241920457577553,1.5101268864816368,1.6020452589452194,1.9071994569399975,-0.5999507172192088,0.864986716084598,-0.6996497761137089,-0.059408142918407084,-1.1374402301506277
-0.6528715312205515,-0.22290347754944556,-2.0941228499988314,-1.316483492719756,-1.4635399844129666,-1.4512243407894792,0.19695408048977622,0.8025987608791353,2.1407804010306823,-0.17627995141407848,-0.578973002813413
8 changes: 6 additions & 2 deletions src/GPA.jl
@@ -129,8 +129,8 @@ end

function get_L_bar!(L_bar::AbstractMatrix, dl_df::AbstractVector, v::AbstractVector, cK::PDMat)
fill!(L_bar, 0.0)
BLAS.ger!(1.0, dl_df, v, L_bar)
tril!(L_bar)
BLAS.ger!(1.0, dl_df, v, L_bar) # L_bar = dl_df*v'
tril!(L_bar) # lower triangular
# ToDo:
# the following two steps allocate memory
# and are fickle, reaching into the internal
@@ -166,8 +166,12 @@ function dll_kern!(dll::AbstractVector, gp::GPBase, precomp::FullCovMCMCPrecompu
get_L_bar!(L_bar, precomp.dl_df, gp.v, gp.cK)
nobs = gp.nobs
@inbounds for i in 1:nobs
# dmll_kern! assumes that ααinvcKI is symmetric and operates
# on the upper triangular component of it,
# but L_bar is lower triangular, so we need to make this adjustment.
L_bar[i,i] *= 2
end
LinearAlgebra.copytri!(L_bar, 'L')
# in GPA, L_bar plays the role of ααinvcKI
return dmll_kern!(dll, gp.kernel, gp.x, gp.data, L_bar, covstrat)
end
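To make the comment above concrete: `L_bar` starts out lower triangular, and the consumer sums the upper triangle while halving the diagonal, so doubling the diagonal before mirroring with `copytri!` preserves the totals. A toy check of that convention, with a small matrix assumed purely for illustration:

```julia
using LinearAlgebra

L_bar = tril([1.0 2.0; 3.0 4.0])   # lower-triangular input, as produced by get_L_bar!
for i in 1:size(L_bar, 1)
    L_bar[i, i] *= 2               # compensate for the /2 applied to diagonal terms
end
LinearAlgebra.copytri!(L_bar, 'L') # mirror the lower triangle into the upper
@assert issymmetric(L_bar)
```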
53 changes: 34 additions & 19 deletions src/GPE.jl
@@ -211,34 +211,49 @@ function update_mll!(gp::GPE; noise::Bool=true, domean::Bool=true, kern::Bool=tr
gp
end

"""
dmll_kern!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData, ααinvcKI::AbstractMatrix)

Derivative of the marginal log likelihood log p(Y|θ) with respect to the kernel hyperparameters.
"""
function dmll_kern!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData,
function _dmll_kern_row!(dmll, buf, k, ααinvcKI, X, data, j, dim, nparams)
# diagonal
dKij_dθ!(buf, k, X, X, data, j, j, dim, nparams)
@inbounds for iparam in 1:nparams
dmll[iparam] += buf[iparam] * ααinvcKI[j, j] / 2.0
end
# off-diagonal
@inbounds for i in 1:j-1
dKij_dθ!(buf, k, X, X, data, i, j, dim, nparams)
@simd for iparam in 1:nparams
dmll[iparam] += buf[iparam] * ααinvcKI[i, j]
end
end
end
function dmll_kern_loop!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData,
ααinvcKI::Matrix{Float64}, covstrat::CovarianceStrategy)
dim, nobs = size(X)
nparams = num_params(k)
@assert nparams == length(dmll)
dK_buffer = Vector{Float64}(undef, nparams)
dmll[:] .= 0.0
@inbounds for j in 1:nobs
# diagonal
dKij_dθ!(dK_buffer, k, X, X, data, j, j, dim, nparams)
for iparam in 1:nparams
dmll[iparam] += dK_buffer[iparam] * ααinvcKI[j, j] / 2.0
end
# off-diagonal
for i in j+1:nobs
dKij_dθ!(dK_buffer, k, X, X, data, i, j, dim, nparams)
@simd for iparam in 1:nparams
dmll[iparam] += dK_buffer[iparam] * ααinvcKI[i, j]
end
end
# make a copy per thread for objects that are potentially not thread-safe:
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
buffercopies = [similar(dK_buffer) for _ in 1:Threads.nthreads()]
dmllcopies = [deepcopy(dmll) for _ in 1:Threads.nthreads()]

@inbounds Threads.@threads for j in 1:nobs
kthread = kcopies[Threads.threadid()]
bufthread = buffercopies[Threads.threadid()]
dmllthread = dmllcopies[Threads.threadid()]
_dmll_kern_row!(dmllthread, bufthread, kthread,
ααinvcKI, X, data, j, dim, nparams)
end

dmll[:] = sum(dmllcopies) # sum up the results from all threads
return dmll
end
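The per-thread `deepcopy`s above serve two purposes: each thread accumulates into its own gradient vector, avoiding data races on `dmll`, and kernels whose internal caches may not be thread-safe get a private copy. A self-contained sketch of the same accumulate-then-reduce pattern, using a toy function rather than anything from this package:

```julia
# Each thread writes only to its own slot; results are reduced serially at the end.
function threaded_sum(f, n)
    partials = zeros(Threads.nthreads())
    Threads.@threads for j in 1:n
        partials[Threads.threadid()] += f(j)
    end
    return sum(partials)
end

threaded_sum(j -> j^2, 10)  # 385
```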
"""
dmll_kern!(dmll::AbstractVector, k::Kernel, X::AbstractMatrix, data::KernelData, ααinvcKI::AbstractMatrix, covstrat::CovarianceStrategy)

Derivative of the marginal log likelihood log p(Y|θ) with respect to the kernel hyperparameters.
"""
dmll_kern!(dmll, k, X, data, ααinvcKI, covstrat) = dmll_kern_loop!(dmll, k, X, data, ααinvcKI, covstrat)
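For reference, the quantity these loops accumulate is the standard marginal-likelihood gradient, with `ααinvcKI` standing in for αα⊤ − K⁻¹; this is the textbook identity, stated here rather than taken from the diff:

```latex
% With \alpha = K^{-1} y, the gradient w.r.t. kernel hyperparameter \theta_p is
\frac{\partial}{\partial \theta_p} \log p(y \mid \theta)
  = \frac{1}{2} \operatorname{tr}\!\left( \left( \alpha \alpha^{\top} - K^{-1} \right)
    \frac{\partial K}{\partial \theta_p} \right)
```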

""" AbstractGradientPrecompute types hold results of
pre-computations of kernel gradients.
4 changes: 2 additions & 2 deletions src/GaussianProcesses.jl
@@ -1,6 +1,6 @@
module GaussianProcesses

using Optim, PDMats, ElasticPDMats, ElasticArrays, Distances, FastGaussQuadrature, RecipesBase, Distributions, Zygote, ProgressMeter
using Optim, PDMats, ElasticPDMats, ElasticArrays, Distances, FastGaussQuadrature, RecipesBase, Distributions
using StaticArrays
using StatsFuns, SpecialFunctions

@@ -26,7 +26,7 @@ const invΦ = norminvcdf

# all package code should be included here
include("means/means.jl")
include("kernels/kernels.jl")
include("covariance/kernels.jl")
include("likelihoods/likelihoods.jl")
include("common.jl")
include("utils.jl")
File renamed without changes.
File renamed without changes.
150 changes: 150 additions & 0 deletions src/covariance/covariance.jl
@@ -0,0 +1,150 @@
"""
cov(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix)

Create covariance matrix from kernel `k` and matrices of observations `X1` and `X2`, where
each column is an observation.
"""
function cov(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())
dim1, nobs1 = size(X1)
dim2, nobs2 = size(X2)
dim1==dim2 || throw(ArgumentError("X1 and X2 must have same dimension"))
cK = Array{promote_type(eltype(X1), eltype(X2))}(undef, nobs1, nobs2)
cov!(cK, k, X1, X2, data)
end

function _cov_row!(cK, k, X::AbstractMatrix, data, j, dim)
cK[j,j] = cov_ij(k, X, X, data, j, j, dim)
@inbounds for i in 1:j-1
cK[i,j] = cov_ij(k, X, X, data, i, j, dim)
cK[j,i] = cK[i,j]
end
end
function cov_loop!(cK::AbstractMatrix, k::Kernel, X::AbstractMatrix, data::KernelData)
dim, nobs = size(X)
(nobs,nobs) == size(cK) || throw(ArgumentError("cK has size $(size(cK)) and X has size $(size(X))"))
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()] # in case k is not threadsafe (e.g. ADkernel)
@inbounds Threads.@threads for j in 1:nobs
kthread = kcopies[Threads.threadid()]
_cov_row!(cK, kthread, X, data, j, dim)
end
return cK
end
function _cov_row!(cK, k, X1::AbstractMatrix, X2::AbstractMatrix, data, i, dim, nobs2)
@inbounds for j in 1:nobs2
cK[i,j] = cov_ij(k, X1, X2, data, i, j, dim)
end
end
function cov_loop!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData)
if X1 === X2
return cov_loop!(cK, k, X1, data)
end
dim1, nobs1 = size(X1)
dim2, nobs2 = size(X2)
dim1==dim2 || throw(ArgumentError("X1 and X2 must have same dimension"))
dim = size(X1, 1)
(nobs1,nobs2) == size(cK) || throw(ArgumentError("cK has size $(size(cK)) X1 $(size(X1)) and X2 $(size(X2))"))
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
@inbounds Threads.@threads for i in 1:nobs1
kthread = kcopies[Threads.threadid()]
_cov_row!(cK, kthread, X1, X2, data, i, dim, nobs2)
end
return cK
end
"""
cov(k::Kernel, X::AbstractMatrix[, data::KernelData = EmptyData()])

Create covariance matrix from kernel `k`, matrix of observations `X`, where each column is
an observation, and kernel data `data` constructed from input observations.
"""
cov(k::Kernel, X::AbstractMatrix, data::KernelData=EmptyData()) = cov(k, X, X, data)

function cov_loop_generic!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData)
dim, nobs1 = size(X1)
dim, nobs2 = size(X2)
# Generic implementation using broadcasting. A more efficient multithreaded
# implementation is provided above.
cK .= cov_ij.(Ref(k), Ref(X1), Ref(X2), Ref(data), 1:nobs1, (1:nobs2)', dim)
end
"""
cov!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())

Like [`cov(k, X1, X2)`](@ref), but stores the result in `cK` rather than a new matrix.
"""
cov!(cK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix=X1, data::KernelData=EmptyData()) = cov_loop!(cK, k, X1, X2, data)
cov!(cK::AbstractMatrix, k::Kernel, X::AbstractMatrix, data::KernelData=EmptyData()) = cov!(cK, k, X, X, data)
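A usage sketch for the threaded covariance path, assuming the `SEIso` kernel constructor exported by this package:

```julia
using GaussianProcesses: SEIso, cov

k = SEIso(0.0, 0.0)   # isotropic squared-exponential, log-lengthscale and log-variance 0
X = randn(3, 100)     # 3-dimensional inputs, one observation per column
K = cov(k, X)         # 100×100 covariance matrix, filled by the threaded cov_loop!
```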

############################
##### Kernel Gradients #####
############################

function _grad_slice_row!(dK, k, X::AbstractMatrix, data, j, p, dim)
dK[j,j] = dKij_dθp(k,X,X,data,j,j,p,dim)
@inbounds @simd for i in 1:(j-1)
dK[i,j] = dKij_dθp(k,X,X,data,i,j,p,dim)
dK[j,i] = dK[i,j]
end
end
function grad_slice_loop!(dK::AbstractMatrix, k::Kernel, X::AbstractMatrix, p::Int, data::KernelData)
dim, nobs = size(X)
(nobs,nobs) == size(dK) || throw(ArgumentError("dK has size $(size(dK)) and X has size $(size(X))"))
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
@inbounds Threads.@threads for j in 1:nobs
kthread = kcopies[Threads.threadid()]
_grad_slice_row!(dK, kthread, X, data, j, p, dim)
end
return dK
end
function _grad_slice_row!(dK, k, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData, i, p, dim, nobs2)
@inbounds @simd for j in 1:nobs2
dK[i,j] = dKij_dθp(k,X1,X2,data,i,j,p,dim)
end
end
function grad_slice_loop!(dK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData)
if X1 === X2
return grad_slice_loop!(dK, k, X1, p, data)
end
dim1, nobs1 = size(X1)
dim2, nobs2 = size(X2)
dim1==dim2 || throw(ArgumentError("X1 and X2 must have same dimension"))
(nobs1,nobs2) == size(dK) || throw(ArgumentError("dK has size $(size(dK)) X1 $(size(X1)) and X2 $(size(X2))"))
dim = dim1
kcopies = [deepcopy(k) for _ in 1:Threads.nthreads()]
@inbounds Threads.@threads for i in 1:nobs1
kthread = kcopies[Threads.threadid()]
_grad_slice_row!(dK, kthread, X1, X2, data, i, p, dim, nobs2)
end
return dK
end
function grad_slice_loop_generic!(dK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData)
dim, nobs1 = size(X1)
dim, nobs2 = size(X2)
# Generic implementation using broadcasting. A more efficient multithreaded
# implementation is provided above.
dK .= dKij_dθp.(Ref(k), Ref(X1), Ref(X2), Ref(data), 1:nobs1, (1:nobs2)', p, dim)
end
grad_slice!(dK::AbstractMatrix, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData=EmptyData()) = grad_slice_loop!(dK, k, X1, X2, p, data)

# Calculates the stack [dk / dθᵢ] of kernel matrix gradients
function grad_stack!(stack::AbstractArray, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())
@inbounds for p in 1:num_params(k)
grad_slice!(view(stack, :, :, p), k, X1, X2, p, data)
end
stack
end

# grad_stack!(stack::AbstractArray, k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix) =
# grad_stack!(stack, k, X1, X2, KernelData(k, X1, X2))

# grad_stack(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix) = grad_stack(k, X1, X2, KernelData(k, X1, X2))

function grad_stack(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData=EmptyData())
nobs1 = size(X1, 2)
nobs2 = size(X2, 2)
stack = Array{promote_type(eltype(X1), eltype(X2))}(undef, nobs1, nobs2, num_params(k))
grad_stack!(stack, k, X1, X2, data)
end


@inline function dKij_dθp(k::Kernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData, i::Int, j::Int, p::Int, dim::Int)
return dKij_dθp(k, X1, X2, i, j, p, dim)
end
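And a matching sketch for the gradient slices, looping over hyperparameters the way `grad_stack!` does; these are internal names, imported explicitly for illustration:

```julia
using GaussianProcesses: SEIso, grad_slice!, num_params

k = SEIso(0.0, 0.0)
X = randn(2, 50)
dK = Matrix{Float64}(undef, 50, 50)
for p in 1:num_params(k)
    grad_slice!(dK, k, X, X, p)   # fills dK with the slice ∂K/∂θ_p
end
```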
File renamed without changes.
@@ -58,8 +58,8 @@ function free(k::FixedKernel, par::Symbol)
return FixedKernel(k.kernel, free)
end

function grad_slice!(dK::AbstractMatrix, k::FixedKernel, X1::AbstractMatrix, X2::AbstractMatrix, data::KernelData, p::Int)
return grad_slice!(dK, k.kernel, X1, X2, data, k.free[p])
function grad_slice!(dK::AbstractMatrix, k::FixedKernel, X1::AbstractMatrix, X2::AbstractMatrix, p::Int, data::KernelData)
return grad_slice!(dK, k.kernel, X1, X2, k.free[p], data)
end
@inline function dKij_dθp(fk::FixedKernel,X1::AbstractMatrix, X2::AbstractMatrix,data::KernelData,i::Int,j::Int,p::Int,dim::Int)
return dKij_dθp(fk.kernel, X1, X2, data, i, j, fk.free[p], dim)