diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index d836c333..6e07c96d 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -20,7 +20,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.8'
           - '1'
           - 'pre'
         os:
@@ -68,7 +68,7 @@ jobs:
         include:
           - version: '1'
             downgrade: false
-          - version: '1.7'
+          - version: '1.8'
             downgrade: true
     steps:
       - uses: actions/checkout@v4
diff --git a/Project.toml b/Project.toml
index f6ef830d..61c5a4d8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen and contributors"]
-version = "0.9.5"
+version = "0.9.6"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -23,7 +23,6 @@ SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999"
-UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
 [weakdeps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
@@ -63,8 +62,7 @@ Statistics = "1.6"
 StatsBase = "0.33.7, 0.34"
 Transducers = "0.4.81"
 Turing = "0.31.4, 0.32, 0.33, 0.34, 0.35"
-UnPack = "1"
-julia = "1.6"
+julia = "1.8"
 
 [extras]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
diff --git a/src/Pathfinder.jl b/src/Pathfinder.jl
index 5ba23927..d79bfc1c 100644
--- a/src/Pathfinder.jl
+++ b/src/Pathfinder.jl
@@ -19,7 +19,6 @@ using SciMLBase: SciMLBase
 using Statistics: Statistics
 using StatsBase: StatsBase
 using Transducers: Transducers
-using UnPack: @unpack
 
 export PathfinderResult, MultiPathfinderResult
 export pathfinder, multipathfinder
@@ -43,7 +42,7 @@ default_ad() = ADTypes.AutoForwardDiff()
 include("transducers.jl")
 include("woodbury.jl")
 include("optimize.jl")
-include("inverse_hessian.jl")
+include("lbfgs.jl")
 include("mvnormal.jl")
 include("elbo.jl")
 include("resample.jl")
diff --git a/src/elbo.jl b/src/elbo.jl
index e7190e57..6d5c0cd0 100644
--- a/src/elbo.jl
+++ b/src/elbo.jl
@@ -1,24 +1,30 @@
-function maximize_elbo(rng, logp, dists, ndraws, executor)
+function maximize_elbo(rng, logp, dists, ndraws, executor; save_samples::Bool=true)
+    dim = isempty(dists) ? 0 : length(first(dists))
+    Tdraws = Matrix{eltype(eltype(dists))}
+    draws = save_samples ? nothing : Tdraws(undef, (dim, ndraws))
     EE = Core.Compiler.return_type(
-        elbo_and_samples, Tuple{typeof(rng),typeof(logp),eltype(dists),Int}
+        elbo_and_samples!, Tuple{Tdraws,typeof(rng),typeof(logp),eltype(dists)}
     )
     estimates = similar(dists, EE)
     isempty(estimates) && return 0, estimates
+
     Folds.map!(estimates, dists, executor) do dist
-        return elbo_and_samples(rng, logp, dist, ndraws)
+        _draws = draws === nothing ? Tdraws(undef, (dim, ndraws)) : draws
+        return elbo_and_samples!(_draws, rng, logp, dist; save_samples)
     end
     _, iteration_opt = _findmax(estimates |> Transducers.Map(est -> est.value))
     return iteration_opt, estimates
 end
 
-function elbo_and_samples(rng, logp, dist, ndraws)
-    ϕ, logqϕ = rand_and_logpdf(rng, dist, ndraws)
+function elbo_and_samples!(ϕ, rng, logp, dist; save_samples::Bool=true)
+    ϕ, logqϕ = rand_and_logpdf!(rng, dist, ϕ)
     logpϕ = similar(logqϕ)
     logpϕ .= logp.(eachcol(ϕ))
     logr = logpϕ - logqϕ
     elbo = Statistics.mean(logr)
     elbo_se = sqrt(Statistics.var(logr) / length(logr))
-    return ELBOEstimate(elbo, elbo_se, ϕ, logpϕ, logqϕ, logr)
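+    # with `save_samples=false`, keep only an empty matrix of matching type so the
+    # (possibly shared) scratch buffer `ϕ` is not captured in the returned estimate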
+    ϕ_save = save_samples ? copyto!(similar(ϕ), ϕ) : similar(ϕ, map(zero, size(ϕ)))
+    return ELBOEstimate(elbo, elbo_se, ϕ_save, logpϕ, logqϕ, logr)
 end
 
 struct ELBOEstimate{T,P,L<:AbstractVector{T}}
diff --git a/src/inverse_hessian.jl b/src/inverse_hessian.jl
deleted file mode 100644
index 273e5948..00000000
--- a/src/inverse_hessian.jl
+++ /dev/null
@@ -1,134 +0,0 @@
-
-# eq 4.9
-# Gilbert, J.C., Lemaréchal, C. Some numerical experiments with variable-storage quasi-Newton algorithms.
-# Mathematical Programming 45, 407–435 (1989). https://doi.org/10.1007/BF01589113
-function gilbert_init(α, s, y)
-    a = dot(y, Diagonal(α), y)
-    b = dot(y, s)
-    c = dot(s, inv(Diagonal(α)), s)
-    return @. b / (a / α + y^2 - (a / c) * (s / α)^2)
-end
-
-"""
-    lbfgs_inverse_hessians(
-        θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12
-    ) -> Tuple{Vector{WoodburyPDMat},Int}
-
-From an L-BFGS trajectory and gradients, compute the inverse Hessian approximations at each point.
-
-Given positions `θs` with gradients `∇logpθs`, construct LBFGS inverse Hessian
-approximations with the provided `history_length`.
-
-The 2nd returned value is the number of BFGS updates to the inverse Hessian matrices that
-were rejected due to keeping the inverse Hessian positive definite.
-"""
-function lbfgs_inverse_hessians(θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12)
-    L = length(θs) - 1
-    θ = θs[1]
-    ∇logpθ = ∇logpθs[1]
-    n = length(θ)
-
-    # allocate caches/containers
-    history_ind = 0 # index of last set history entry
-    history_length_effective = 0 # length of history so far
-    s = similar(θ) # cache for BFGS update, i.e. sₗ = θₗ₊₁ - θₗ = -λ Hₗ ∇logpθₗ
-    y = similar(∇logpθ) # cache for yₗ = ∇logpθₗ₊₁ - ∇logpθₗ = Hₗ₊₁ \ s₁ (secant equation)
-    S = similar(s, n, min(history_length, L)) # history of s
-    Y = similar(y, n, min(history_length, L)) # history of y
-    α = fill!(similar(θ), true) # diag(H₀)
-    H = lbfgs_inverse_hessian(Diagonal(α), S, Y, history_ind, history_length_effective) # H₀ = I
-    Hs = [H] # trace of H
-
-    num_bfgs_updates_rejected = 0
-    for l in 1:L
-        θlp1, ∇logpθlp1 = θs[l + 1], ∇logpθs[l + 1]
-        s .= θlp1 .- θ
-        y .= ∇logpθ .- ∇logpθlp1
-        if dot(y, s) > ϵ * sum(abs2, y) # curvature is positive, safe to update inverse Hessian
-            # add s and y to history
-            history_ind = mod1(history_ind + 1, history_length)
-            history_length_effective = max(history_ind, history_length_effective)
-            S[1:n, history_ind] .= s
-            Y[1:n, history_ind] .= y
-
-            # initial diagonal estimate of H
-            α = Hinit(α, s, y)
-        else
-            num_bfgs_updates_rejected += 1
-        end
-
-        θ, ∇logpθ = θlp1, ∇logpθlp1
-        H = lbfgs_inverse_hessian(Diagonal(α), S, Y, history_ind, history_length_effective)
-        push!(Hs, H)
-    end
-
-    return Hs, num_bfgs_updates_rejected
-end
-
-"""
-    lbfgs_inverse_hessian(H₀, S₀, Y₀, history_ind, history_length) -> WoodburyPDMat
-
-Compute approximate inverse Hessian initialized from `H₀` from history stored in `S₀` and `Y₀`.
-
-`history_ind` indicates the column in `S₀` and `Y₀` that was most recently added to the
-history, while `history_length` indicates the number of first columns in `S₀` and `Y₀`
-currently being used for storing history.
-`S = S₀[:, history_ind+1:history_length; 1:history_ind]` reorders the columns of `S₀` so that the
-oldest is first and newest is last.
-
-From Theorem 2.2 of [^Byrd1994], the expression for the inverse Hessian ``H`` is
-
-```math
-\\begin{align}
-B &= \\begin{pmatrix}H_0 Y & S\\end{pmatrix}\\\\
-R &= \\operatorname{triu}(S^\\mathrm{T} Y)\\\\
-E &= I \\circ R\\\\
-D &= \\begin{pmatrix}
-    0 & -R^{-1}\\\\
-    -R^{-\\mathrm{T}} & R^\\mathrm{-T} (E + Y^\\mathrm{T} H_0 Y ) R^\\mathrm{-1}\\\\
-\\end{pmatrix}\\
-H &= H_0 + B D B^\\mathrm{T}
-\\end{align}
-```
-
-[^Byrd1994]: Byrd, R.H., Nocedal, J. & Schnabel, R.B.
-    Representations of quasi-Newton matrices and their use in limited memory methods.
-    Mathematical Programming 63, 129–156 (1994).
-    doi: [10.1007/BF01582063](https://doi.org/10.1007/BF01582063)
-"""
-function lbfgs_inverse_hessian(H₀::Diagonal, S0, Y0, history_ind, history_length)
-    J = history_length
-    α = H₀.diag
-    B = similar(α, size(α, 1), 2J)
-    D = fill!(similar(α, 2J, 2J), false)
-    iszero(J) && return WoodburyPDMat(H₀, B, D)
-
-    hist_inds = [(history_ind + 1):history_length; 1:history_ind]
-    @views begin
-        S = S0[:, hist_inds]
-        Y = Y0[:, hist_inds]
-        B₁ = B[:, 1:J]
-        B₂ = B[:, (J + 1):(2J)]
-        D₁₁ = D[1:J, 1:J]
-        D₁₂ = D[1:J, (J + 1):(2J)]
-        D₂₁ = D[(J + 1):(2J), 1:J]
-        D₂₂ = D[(J + 1):(2J), (J + 1):(2J)]
-    end
-
-    mul!(B₁, Diagonal(α), Y)
-    copyto!(B₂, S)
-    mul!(D₂₂, S', Y)
-    triu!(D₂₂)
-    R = UpperTriangular(D₂₂)
-    nRinv = UpperTriangular(D₁₂)
-    copyto!(nRinv, -I)
-    ldiv!(R, nRinv)
-    nRinv′ = LowerTriangular(copyto!(D₂₁, nRinv'))
-    tril!(D₂₂) # eliminate all but diagonal
-    mul!(D₂₂, Y', B₁, true, true)
-    LinearAlgebra.copytri!(D₂₂, 'U', false, false)
-    rmul!(D₂₂, nRinv)
-    lmul!(nRinv′, D₂₂)
-
-    return WoodburyPDMat(H₀, B, D)
-end
diff --git a/src/lbfgs.jl b/src/lbfgs.jl
new file mode 100644
index 00000000..0c2dd697
--- /dev/null
+++ b/src/lbfgs.jl
@@ -0,0 +1,208 @@
+# eq 4.9
+# Gilbert, J.C., Lemaréchal, C. Some numerical experiments with variable-storage quasi-Newton algorithms.
+# Mathematical Programming 45, 407–435 (1989). https://doi.org/10.1007/BF01589113
+function gilbert_invH_init!(α, s, y)
+    a = dot(y, Diagonal(α), y)
+    b = dot(y, s)
+    c = dot(s, inv(Diagonal(α)), s)
+    @. α = b / (a / α + y^2 - (a / c) * (s / α)^2)
+    return α
+end
+
+# history storage for L-BFGS and accessor methods
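+# (the `const` fields in the mutable structs below require Julia ≥ 1.8, which is
+# why the compat lower bound is raised to `julia = "1.8"`)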
+mutable struct LBFGSHistory{T<:Real}
+    const position_diffs::Matrix{T}
+    const gradient_diffs::Matrix{T}
+    const history_perm::Vector{Int}
+    history_length::Int
+end
+
+function LBFGSHistory{T}(n::Int, history_length::Int) where {T<:Real}
+    position_diffs = Matrix{T}(undef, n, history_length + 1)
+    gradient_diffs = Matrix{T}(undef, n, history_length + 1)
+    history_perm = collect(1:(history_length + 1))
+    return LBFGSHistory{T}(position_diffs, gradient_diffs, history_perm, 0)
+end
+
+function _history_matrices(history::LBFGSHistory)
+    (; position_diffs, gradient_diffs, history_perm, history_length) = history
+    history_inds = @view history_perm[(end - history_length + 1):end]
+    return @views (position_diffs[:, history_inds], gradient_diffs[:, history_inds])
+end
+
+function _propose_history_update!(
+    history::LBFGSHistory, position, position_new, gradient, gradient_new
+)
+    (; history_perm) = history
+    queue_ind = first(history_perm)
+    history.position_diffs[:, queue_ind] .= position_new .- position
+    history.gradient_diffs[:, queue_ind] .= gradient .- gradient_new
+    return history
+end
+
+function _proposed_history_updates(history::LBFGSHistory)
+    (; position_diffs, gradient_diffs, history_perm) = history
+    queue_ind = first(history_perm)
+    return @views position_diffs[:, queue_ind], gradient_diffs[:, queue_ind]
+end
+
+function _accept_history_update!(history::LBFGSHistory)
+    (; history_perm) = history
+    circshift!(history_perm, -1)
+    history.history_length = min(history.history_length + 1, length(history_perm) - 1)
+    return history
+end
+
+function _has_positive_curvature(pos_diff, grad_diff, ϵ)
+    return dot(grad_diff, pos_diff) > ϵ * sum(abs2, grad_diff)
+end
+
+# cache for L-BFGS inverse Hessian approximations
+struct LBFGSInverseHessianCache{T<:Real}
+    diag_invH0::Vector{T}
+    B::Matrix{T}
+    D::Matrix{T}
+end
+
+function LBFGSInverseHessianCache{T}(n::Int, history_length::Int) where {T<:Real}
+    diag_invH0 = ones(T, n)
+    B = Matrix{T}(undef, n, 2 * history_length)
+    D = zeros(T, 2 * history_length, 2 * history_length)
+    return LBFGSInverseHessianCache(diag_invH0, B, D)
+end
+
+# state for each iteration of L-BFGS optimization
+mutable struct LBFGSState{T<:Real,IH<:WoodburyPDMat{T}}
+    const x::Vector{T}
+    fx::T
+    const ∇fx::Vector{T}
+    const history::LBFGSHistory{T}
+    const cache::LBFGSInverseHessianCache{T}
+    invH::IH
+    num_bfgs_updates_rejected::Int
+end
+function LBFGSState(x, fx, ∇fx, history_length::Int)
+    T = Base.promote_eltype(x, fx, ∇fx)
+    n = length(x)
+    history = LBFGSHistory{T}(n, history_length)
+    cache = LBFGSInverseHessianCache{T}(n, history_length)
+    invH = lbfgs_inverse_hessian!(cache, history) # H₀ = I
+    return LBFGSState{T,typeof(invH)}(copy(x), fx, copy(∇fx), history, cache, invH, 0)
+end
+
+function _update_state!(state::LBFGSState, x, fx, ∇fx, invH_init!, ϵ)
+    _propose_history_update!(state.history, state.x, x, state.∇fx, ∇fx)
+    pos_diff, grad_diff = _proposed_history_updates(state.history)
+
+    # only update the inverse Hessian if the update preserves positive definiteness
+    if _has_positive_curvature(pos_diff, grad_diff, ϵ)
+        _accept_history_update!(state.history)
+        invH_init!(state.cache.diag_invH0, pos_diff, grad_diff)
+        state.invH = lbfgs_inverse_hessian!(state.cache, state.history)
+    else
+        state.num_bfgs_updates_rejected += 1
+    end
+
+    copyto!(state.x, x)
+    state.fx = fx
+    copyto!(state.∇fx, ∇fx)
+    return state
+end
+
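+# Illustrative sketch of the cycle `_update_state!` runs for each iterate
+# (hypothetical inputs; the public entry point is `lbfgs_inverse_hessians` below):
+#
+#     state = LBFGSState(x0, fx0, ∇fx0, history_length)
+#     for (x, fx, ∇fx) in trajectory
+#         _update_state!(state, x, fx, ∇fx, gilbert_invH_init!, 1e-12)
+#         state.invH  # WoodburyPDMat approximation of H⁻¹ at this iterate
+#     end
+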
+"""
+    lbfgs_inverse_hessians(
+        θs, logpθs, ∇logpθs; (invH_init!)=gilbert_invH_init!, history_length=5, ϵ=1e-12
+    ) -> Tuple{Vector{WoodburyPDMat},Int}
+
+From an L-BFGS trajectory, log-densities, and gradients, compute the inverse Hessian
+approximations at each point.
+
+Given positions `θs` with log-densities `logpθs` and gradients `∇logpθs`, construct
+L-BFGS inverse Hessian approximations with the provided `history_length`.
+
+The second returned value is the number of BFGS updates that were rejected in order to
+keep the inverse Hessian approximations positive definite.
+"""
+function lbfgs_inverse_hessians(
+    θs, logpθs, ∇logpθs; (invH_init!)=gilbert_invH_init!, history_length=5, ϵ=1e-12
+)
+    L = length(θs) - 1
+    history_length = min(history_length, L)
+    state = LBFGSState(first(θs), first(logpθs), first(∇logpθs), history_length)
+    invHs = [deepcopy(state.invH)] # trace of invH
+
+    for (θ, logpθ, ∇logpθ) in Iterators.drop(zip(θs, logpθs, ∇logpθs), 1)
+        _update_state!(state, θ, logpθ, ∇logpθ, invH_init!, ϵ)
+        push!(invHs, deepcopy(state.invH))
+    end
+
+    return invHs, state.num_bfgs_updates_rejected
+end
+
+"""
+    lbfgs_inverse_hessian!(cache::LBFGSInverseHessianCache, history::LBFGSHistory) -> WoodburyPDMat
+
+Compute an approximate inverse Hessian from the update history stored in `history`,
+using the preallocated storage in `cache`.
+
+`cache` stores the diagonal of the initial inverse Hessian ``H₀^{-1}`` and the matrices
+``B`` and ``D``, which are overwritten here and used to construct the returned
+approximate inverse Hessian ``H^{-1}``.
+
+From Theorem 2.2 of [^Byrd1994], the expression for the inverse Hessian ``H^{-1}`` is
+
+```math
+\\begin{align}
+B &= \\begin{pmatrix}H_0^{-1} Y & S\\end{pmatrix}\\\\
+R &= \\operatorname{triu}(S^\\mathrm{T} Y)\\\\
+E &= I \\circ R\\\\
+D &= \\begin{pmatrix}
+    0 & -R^{-1}\\\\
+    -R^{-\\mathrm{T}} & R^{-\\mathrm{T}} (E + Y^\\mathrm{T} H_0^{-1} Y) R^{-1}\\\\
+\\end{pmatrix}\\\\
+H^{-1} &= H_0^{-1} + B D B^\\mathrm{T}
+\\end{align}
+```
+
+[^Byrd1994]: Byrd, R.H., Nocedal, J. & Schnabel, R.B.
+    Representations of quasi-Newton matrices and their use in limited memory methods.
+    Mathematical Programming 63, 129–156 (1994).
+    doi: [10.1007/BF01582063](https://doi.org/10.1007/BF01582063)
+"""
+function lbfgs_inverse_hessian!(cache::LBFGSInverseHessianCache, history::LBFGSHistory)
+    (; B, D, diag_invH0) = cache
+    return lbfgs_inverse_hessian!(B, D, diag_invH0, history)
+end
+function lbfgs_inverse_hessian!(B_cache, D_cache, diag_invH0, history)
+    S, Y = _history_matrices(history)
+    J = history.history_length
+    B = @view B_cache[:, 1:(2J)]
+    D = @view D_cache[1:(2J), 1:(2J)]
+    invH0 = Diagonal(diag_invH0)
+    iszero(J) && return WoodburyPDMat(invH0, B, D)
+
+    @views begin
+        B₁ = B[:, 1:J]
+        B₂ = B[:, (J + 1):(2J)]
+        D₁₁ = D[1:J, 1:J]
+        D₁₂ = D[1:J, (J + 1):(2J)]
+        D₂₁ = D[(J + 1):(2J), 1:J]
+        D₂₂ = D[(J + 1):(2J), (J + 1):(2J)]
+    end
+
+    fill!(D₁₁, false)
+    mul!(B₁, invH0, Y)
+    copyto!(B₂, S)
+    mul!(D₂₂, S', Y)
+    triu!(D₂₂)
+    R = UpperTriangular(D₂₂)
+    nRinv = UpperTriangular(D₁₂)
+    copyto!(nRinv, -I)
+    ldiv!(R, nRinv)
+    nRinv′ = LowerTriangular(copyto!(D₂₁, nRinv'))
+    tril!(D₂₂) # eliminate all but diagonal
+    mul!(D₂₂, Y', B₁, true, true)
+    LinearAlgebra.copytri!(D₂₂, 'U', false, false)
+    rmul!(D₂₂, nRinv)
+    lmul!(nRinv′, D₂₂)
+
+    return WoodburyPDMat(invH0, B, D)
+end
diff --git a/src/mvnormal.jl b/src/mvnormal.jl
index c89f978c..e6ab6a22 100644
--- a/src/mvnormal.jl
+++ b/src/mvnormal.jl
@@ -11,8 +11,8 @@
 point are returned.
 
 The 2nd returned value is the number of BFGS updates to the inverse Hessian matrices that
 were rejected due to keeping the inverse Hessian positive definite.
 """
-function fit_mvnormals(θs, ∇logpθs; kwargs...)
-    Σs, num_bfgs_updates_rejected = lbfgs_inverse_hessians(θs, ∇logpθs; kwargs...)
+function fit_mvnormals(θs, logpθs, ∇logpθs; kwargs...)
+    Σs, num_bfgs_updates_rejected = lbfgs_inverse_hessians(θs, logpθs, ∇logpθs; kwargs...)
     trans = Transducers.MapSplat() do Σ, ∇logpθ, θ
         μ = muladd(Σ, ∇logpθ, θ)
         return Distributions.MvNormal(μ, Σ)
@@ -22,16 +22,15 @@
     return dists, num_bfgs_updates_rejected
 end
 
-# faster than computing `logpdf` and `rand` independently
-function rand_and_logpdf(rng, dist::Distributions.MvNormal, ndraws)
-    μ = dist.μ
-    Σ = dist.Σ
-    N = length(μ)
+# faster than computing `logpdf` and `rand!` independently
+function rand_and_logpdf!(rng, dist::Distributions.MvNormal, x)
+    (; μ, Σ) = dist
+    N = length(dist)
 
     # draw points
-    u = Random.randn!(rng, similar(μ, N, ndraws))
-    unormsq = vec(sum(abs2, u; dims=1))
-    x = PDMats.unwhiten!(u, Σ, u)
+    Random.randn!(rng, x)
+    unormsq = vec(sum(abs2, x; dims=1))
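+    # for x = μ + Lu with Σ = LLᵀ, logpdf(dist, x) = -(N*log(2π) + logdet(Σ) + ‖u‖²)/2,
+    # so caching ‖u‖² for each draw avoids re-whitening the draws afterwards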
+    PDMats.unwhiten!(x, Σ, x)
     x .+= μ
 
     # compute log density at each point
diff --git a/src/optimize.jl b/src/optimize.jl
index 84e6677d..4ce508bf 100644
--- a/src/optimize.jl
+++ b/src/optimize.jl
@@ -101,7 +101,7 @@ end
 @static if isdefined(Optimization, :OptimizationState)
     # Optimization v3.21.0 and later
     function (cb::OptimizationCallback)(state::Optimization.OptimizationState, args...)
-        @unpack (
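+        # NamedTuple destructuring `(; ...) = cb` replaces `UnPack.@unpack`; the
+        # syntax requires Julia ≥ 1.7, within the package's new lower bound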
+        (;
             xs, fxs, ∇fxs, progress_name, progress_id, maxiters, callback, fail_on_nonfinite
         ) = cb
         ret = callback !== nothing && callback(state, args...)
@@ -127,7 +127,7 @@
 else
     # Optimization v3.20.X and earlier
     function (cb::OptimizationCallback)(x, nfx, args...)
-        @unpack (
+        (;
             xs, fxs, ∇fxs, progress_name, progress_id, maxiters, callback, fail_on_nonfinite
         ) = cb
         ret = callback !== nothing && callback(x, nfx, args...)
diff --git a/src/singlepath.jl b/src/singlepath.jl
index c126af10..41ae4a0f 100644
--- a/src/singlepath.jl
+++ b/src/singlepath.jl
@@ -199,7 +199,7 @@ function pathfinder(
             kwargs...,
         )
     end
-    @unpack (
+    (;
         itry,
         success,
         optim_prob,
@@ -302,7 +302,7 @@ function _pathfinder(
     # fit mv-normal distributions to trajectory
     fit_distributions, num_bfgs_updates_rejected = fit_mvnormals(
-        optim_trace.points, optim_trace.gradients; history_length
+        optim_trace.points, optim_trace.log_densities, optim_trace.gradients; history_length
     )
 
     # find ELBO-maximizing distribution
diff --git a/test/elbo.jl b/test/elbo.jl
index 0d24e210..25b7a45f 100644
--- a/test/elbo.jl
+++ b/test/elbo.jl
@@ -5,7 +5,7 @@ using Test
 using Transducers
 
 @testset "ELBO estimation" begin
-    @testset "elbo_and_samples" begin
+    @testset "elbo_and_samples!" begin
         σ_target = 0.08
         target_dist = Normal(0, σ_target)
         logp(x) = logpdf(target_dist, x[1])
@@ -13,18 +13,28 @@
         rng = Random.seed!(Random.default_rng(), 42)
         @testset for σ in σs
             dist = Normal(0, σ)
-            est = @inferred Pathfinder.elbo_and_samples(rng, logp, dist, 1_000_000)
+            draws = Matrix{Float64}(undef, length(dist), 1_000_000)
+            Random.seed!(rng, 42)
+            est = @inferred Pathfinder.elbo_and_samples!(draws, rng, logp, dist)
             @test est isa Pathfinder.ELBOEstimate
             # explicit elbo calculation
             r = σ / σ_target
             elbo_exp = (1 - r^2) / 2 + log(r)
             @test est.value ≈ elbo_exp atol = 3 * est.std_err
+            @test est.draws !== draws
             @test est.log_densities_target ≈ logp.(eachcol(est.draws))
             @test est.log_densities_fit ≈ logpdf.(dist, first.(eachcol(est.draws)))
             @test est.log_density_ratios == est.log_densities_target - est.log_densities_fit
             @test est.value ≈ mean(est.log_density_ratios)
             @test est.std_err ≈ std(est.log_density_ratios) / sqrt(1_000_000)
+
+            # rerunning from the same seed with `save_samples=false` must reproduce the
+            # estimate exactly while storing only an empty draws matrix
+            fill!(draws, 0)
+            Random.seed!(rng, 42)
+            est_nosave = @inferred Pathfinder.elbo_and_samples!(
+                draws, rng, logp, dist; save_samples=false
+            )
+            @test est_nosave.value == est.value
+            @test est_nosave.std_err == est.std_err
+            @test est_nosave.draws !== draws
+            @test isempty(est_nosave.draws)
         end
     end
 
@@ -34,11 +44,7 @@ using Transducers
         logp(x) = logpdf(target_dist, x[1])
         σs = [1e-3, 0.05, σ_target, 1.0, 1.1, 1.2, 5.0, 10.0]
         dists = Normal.(0, σs)
-        if VERSION ≥ v"1.7.0"
-            executors = [SequentialEx(), ThreadedEx()]
-        else
-            executors = [SequentialEx()]
-        end
+        executors = [SequentialEx()]
         @testset "$executor" for executor in executors
             rng = Random.seed!(Random.default_rng(), 42)
             lopt, estimates = @inferred Pathfinder.maximize_elbo(
diff --git a/test/inverse_hessian.jl b/test/inverse_hessian.jl
deleted file mode 100644
index d9185fb4..00000000
--- a/test/inverse_hessian.jl
+++ /dev/null
@@ -1,77 +0,0 @@
-using LinearAlgebra
-using LogDensityProblems
-using Optim
-using Pathfinder
-using SciMLBase
-using Test
-
-function lbfgs_inverse_hessian_explicit(H₀, S, Y)
-    B = [H₀ * Y S]
-    R = triu(S'Y)
-    E = Diagonal(R)
-    D = [0*I -inv(R); -inv(R)' R' \ (E + Y' * H₀ * Y)/R]
-    return H₀ + B * D * B'
-end
-
-@testset "L-BFGS inverse Hessian construction" begin
-    @testset "lbfgs_inverse_hessian" begin
-        #! format: off
-        S0 = [[1.719573, 3.294037, -2.008877, 3.901275, 0.214324, 0.400382, 0.113598, -1.804262, 0.465563, 2.465748], [-0.445476, -0.514915, 1.069617, -1.505506, -0.036535, -0.11386, -0.029737, 0.579662, -0.118694, -1.726258], [-0.013354, 0.1981, 0.081886, -0.194172, 0.010499, -0.007944, -0.001039, 0.061604, -0.002621, 0.11145], [0.00648, 0.255961, -0.011901, -0.059042, 0.013587, -0.002064, 0.000302, 0.023457, 0.00257, -0.002343], [-0.016408, 0.015005, 0.009654, 0.016735, -0.000245, -0.002825, -0.00106, 0.004272, -0.004578, 0.002275]]
-        Y0 = -[[-2.357935, -3.343312, 4.659008, -7.065433, -0.228045, -0.584164, -0.156837, 2.863289, -0.631753, -7.152021], [0.610851, 0.522617, -2.480668, 2.726559, 0.038874, 0.166123, 0.041055, -0.9199, 0.161064, 5.007094], [0.018312, -0.201064, -0.189911, 0.351657, -0.011171, 0.011591, 0.001434, -0.097763, 0.003557, -0.323265], [-0.008886, -0.25979, 0.027601, 0.106929, -0.014456, 0.003011, -0.000418, -0.037225, -0.003488, 0.006795], [0.022499, -0.015229, -0.02239, -0.030308, 0.00026, 0.004122, 0.001463, -0.00678, 0.006212, -0.006598]]
-        #! format: on
-        S = reduce(hcat, S0)
-        Y = reduce(hcat, Y0)
-        N, history_length = size(S)
-        α = rand(N)
-        H₀ = Diagonal(α)
-
-        @test @inferred(Pathfinder.lbfgs_inverse_hessian(H₀, S, Y, 0, 0)) ≈ H₀
-
-        H = Pathfinder.lbfgs_inverse_hessian(H₀, S, Y, 3, 3)
-        Hexp = lbfgs_inverse_hessian_explicit(H₀, S[:, 1:3], Y[:, 1:3])
-        @test H ≈ Hexp
-
-        S2 = S[:, [4:history_length; 1:3]]
-        Y2 = Y[:, [4:history_length; 1:3]]
-        ilast = argmax(axes(S, 2)[[4:history_length; 1:3]])
-        H = Pathfinder.lbfgs_inverse_hessian(H₀, S2, Y2, ilast, history_length)
-        Hexp = lbfgs_inverse_hessian_explicit(H₀, S, Y)
-        @test H ≈ Hexp
-
-        H = Pathfinder.lbfgs_inverse_hessian(H₀, S, Y, history_length, history_length)
-        Hexp = lbfgs_inverse_hessian_explicit(H₀, S, Y)
-        @test H ≈ Hexp
-    end
-
-    @testset "lbfgs_inverse_hessians" begin
-        n = 10
-        history_length = 5
-        logp(x) = logp_banana(x)
-        nocedal_wright_scaling(α, s, y) = fill!(similar(α), dot(y, s) / sum(abs2, y))
-        θ₀ = 10 * randn(n)
-
-        ℓ = build_logdensityproblem(logp, n, 2)
-        fun = Pathfinder.build_optim_function(
-            ℓ, SciMLBase.NoAD(), LogDensityProblems.capabilities(ℓ)
-        )
-        prob = SciMLBase.OptimizationProblem(fun, θ₀)
-        optimizer = Optim.LBFGS(;
-            m=history_length, linesearch=Optim.LineSearches.MoreThuente()
-        )
-        sol, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
-
-        # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
-        Hs, num_bfgs_updates_rejected = Pathfinder.lbfgs_inverse_hessians(
-            optim_trace.points,
-            optim_trace.gradients;
-            history_length,
-            Hinit=nocedal_wright_scaling,
-        )
-        ss = diff(optim_trace.points)
-        ps = (Hs .* optim_trace.gradients)[1:(end - 1)]
-        # check that next direction computed from Hessian is the same as the actual
-        # direction that was taken
-        @test all(≈(1), dot.(ps, ss) ./ norm.(ss) ./ norm.(ps))
-        @test num_bfgs_updates_rejected == 0
-    end
-end
diff --git a/test/lbfgs.jl b/test/lbfgs.jl
new file mode 100644
index 00000000..ade1d658
--- /dev/null
+++ b/test/lbfgs.jl
@@ -0,0 +1,98 @@
+using LinearAlgebra
+using LogDensityProblems
+using Optim
+using Pathfinder
+using SciMLBase
+using Test
+
+function lbfgs_inverse_hessian_explicit(H₀, S, Y)
+    B = [H₀ * Y S]
+    R = triu(S'Y)
+    E = Diagonal(R)
+    D = [0*I -inv(R); -inv(R)' R' \ (E + Y' * H₀ * Y)/R]
+    return H₀ + B * D * B'
+end
+
+@testset "L-BFGS inverse Hessian construction" begin
+    @testset "lbfgs_inverse_hessian" begin
+        #! format: off
+        S_vecs = [[1.719573, 3.294037, -2.008877, 3.901275, 0.214324, 0.400382, 0.113598, -1.804262, 0.465563, 2.465748], [-0.445476, -0.514915, 1.069617, -1.505506, -0.036535, -0.11386, -0.029737, 0.579662, -0.118694, -1.726258], [-0.013354, 0.1981, 0.081886, -0.194172, 0.010499, -0.007944, -0.001039, 0.061604, -0.002621, 0.11145], [0.00648, 0.255961, -0.011901, -0.059042, 0.013587, -0.002064, 0.000302, 0.023457, 0.00257, -0.002343], [-0.016408, 0.015005, 0.009654, 0.016735, -0.000245, -0.002825, -0.00106, 0.004272, -0.004578, 0.002275]]
+        Y_vecs = -[[-2.357935, -3.343312, 4.659008, -7.065433, -0.228045, -0.584164, -0.156837, 2.863289, -0.631753, -7.152021], [0.610851, 0.522617, -2.480668, 2.726559, 0.038874, 0.166123, 0.041055, -0.9199, 0.161064, 5.007094], [0.018312, -0.201064, -0.189911, 0.351657, -0.011171, 0.011591, 0.001434, -0.097763, 0.003557, -0.323265], [-0.008886, -0.25979, 0.027601, 0.106929, -0.014456, 0.003011, -0.000418, -0.037225, -0.003488, 0.006795], [0.022499, -0.015229, -0.02239, -0.030308, 0.00026, 0.004122, 0.001463, -0.00678, 0.006212, -0.006598]]
+        #! format: on
+        S = reduce(hcat, S_vecs)
+        Y = reduce(hcat, Y_vecs)
+        history_length = length(S_vecs)
+        N = length(first(S_vecs))
+        history = Pathfinder.LBFGSHistory{Float64}(N, history_length)
+        cache = Pathfinder.LBFGSInverseHessianCache{Float64}(N, history_length)
+        @. cache.diag_invH0 = rand()
+        invH0 = Diagonal(cache.diag_invH0)
+
+        @test @inferred(Pathfinder.lbfgs_inverse_hessian!(cache, history)) ≈ invH0
+
+        for i in 1:3
+            Pathfinder._propose_history_update!(
+                history, zero(S_vecs[i]), S_vecs[i], Y_vecs[i], zero(Y_vecs[i])
+            )
+            Pathfinder._accept_history_update!(history)
+        end
+        invH = Pathfinder.lbfgs_inverse_hessian!(cache, history)
+        invH_expected = lbfgs_inverse_hessian_explicit(invH0, S[:, 1:3], Y[:, 1:3])
+        @test invH ≈ invH_expected
+
+        S2 = S[:, [4:history_length; 1:3]]
+        Y2 = Y[:, [4:history_length; 1:3]]
+        for i in vcat(4:history_length, 1:3)
+            Pathfinder._propose_history_update!(
+                history, zero(S_vecs[i]), S_vecs[i], Y_vecs[i], zero(Y_vecs[i])
+            )
+            Pathfinder._accept_history_update!(history)
+        end
+        invH = Pathfinder.lbfgs_inverse_hessian!(cache, history)
+        invH_expected = lbfgs_inverse_hessian_explicit(invH0, S2, Y2)
+        @test invH ≈ invH_expected
+
+        for i in 1:history_length
+            Pathfinder._propose_history_update!(
+                history, zero(S_vecs[i]), S_vecs[i], Y_vecs[i], zero(Y_vecs[i])
+            )
+            Pathfinder._accept_history_update!(history)
+        end
+        invH = Pathfinder.lbfgs_inverse_hessian!(cache, history)
+        invH_expected = lbfgs_inverse_hessian_explicit(invH0, S, Y)
+        @test invH ≈ invH_expected
+    end
+
+    @testset "lbfgs_inverse_hessians" begin
+        n = 10
+        history_length = 5
+        logp(x) = logp_banana(x)
+        nocedal_wright_invH_init!(α, s, y) = fill!(α, dot(y, s) / sum(abs2, y))
+        θ₀ = 10 * randn(n)
+
+        ℓ = build_logdensityproblem(logp, n, 2)
+        fun = Pathfinder.build_optim_function(
+            ℓ, SciMLBase.NoAD(), LogDensityProblems.capabilities(ℓ)
+        )
+        prob = SciMLBase.OptimizationProblem(fun, θ₀)
+        optimizer = Optim.LBFGS(;
+            m=history_length, linesearch=Optim.LineSearches.MoreThuente()
+        )
+        sol, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
+
+        # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
+        invHs, num_bfgs_updates_rejected = Pathfinder.lbfgs_inverse_hessians(
+            optim_trace.points,
+            optim_trace.log_densities,
+            optim_trace.gradients;
+            history_length,
+            (invH_init!)=nocedal_wright_invH_init!,
+        )
+        ss = diff(optim_trace.points)
+        ps = (invHs .* optim_trace.gradients)[1:(end - 1)]
+        # check that next direction computed from Hessian is the same as the actual
+        # direction that was taken
+        @test all(≈(1), dot.(ps, ss) ./ norm.(ss) ./ norm.(ps))
+        @test num_bfgs_updates_rejected == 0
+    end
+end
diff --git a/test/multipath.jl b/test/multipath.jl
index 42ab3c93..7088be77 100644
--- a/test/multipath.jl
+++ b/test/multipath.jl
@@ -22,11 +22,7 @@ using Transducers
     d = MvNormal(μ, Σ)
    logp(x) = logpdf(d, x)
     ℓ = build_logdensityproblem(logp, dim, 2)
-    rngs = if VERSION ≥ v"1.7"
-        [MersenneTwister(), Random.default_rng()]
-    else
-        [MersenneTwister()]
-    end
+    rngs = [MersenneTwister(), Random.default_rng()]
     seed = 76
     @testset for rng in rngs
         executor = rng isa MersenneTwister ? SequentialEx() : ThreadedEx()
diff --git a/test/mvnormal.jl b/test/mvnormal.jl
index c2604d6a..0deb8afb 100644
--- a/test/mvnormal.jl
+++ b/test/mvnormal.jl
@@ -2,6 +2,7 @@ using Distributions
 using Optim
 using Pathfinder
 using Random
+using SciMLBase
 using Test
 
 @testset "MvNormal functions" begin
@@ -17,10 +18,16 @@
         history_length = optimizer.m
         _, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
         Σs, num_bfgs_updates_rejected1 = Pathfinder.lbfgs_inverse_hessians(
-            optim_trace.points, optim_trace.gradients; history_length
+            optim_trace.points,
+            optim_trace.log_densities,
+            optim_trace.gradients;
+            history_length,
         )
         dists, num_bfgs_updates_rejected2 = @inferred Pathfinder.fit_mvnormals(
-            optim_trace.points, optim_trace.gradients; history_length
+            optim_trace.points,
+            optim_trace.log_densities,
+            optim_trace.gradients;
+            history_length,
         )
         @test dists isa Vector{<:MvNormal{Float64,<:Pathfinder.WoodburyPDMat}}
         @test num_bfgs_updates_rejected2 == num_bfgs_updates_rejected1
@@ -28,17 +35,19 @@
         @test optim_trace.points .+ Σs .* optim_trace.gradients ≈ getproperty.(dists, :μ)
     end
 
-    @testset "rand_and_logpdf" begin
-        ndraws = 20
+    @testset "rand_and_logpdf!" begin
         @testset "MvNormal" begin
             n = 10
+            ndraws = 20
+            draws = Matrix{Float64}(undef, n, ndraws)
             Σ = rand_pd_mat(Float64, 10)
             μ = randn(n)
             dist = MvNormal(μ, Σ)
 
             seed = 42
             rng = Random.seed!(Random.default_rng(), seed)
-            x, logpx = @inferred Pathfinder.rand_and_logpdf(rng, dist, ndraws)
+            x, logpx = @inferred Pathfinder.rand_and_logpdf!(rng, dist, draws)
+            @test x === draws
             Random.seed!(rng, seed)
             x2 = rand(rng, dist, ndraws)
             logpx2 = logpdf(dist, x2)
@@ -51,7 +60,8 @@
             n = 10
             ndraws = 20
             nhist = 4
-            A = rand_pd_diag_mat(Float64, 10)
+            draws = Matrix{Float64}(undef, n, ndraws)
+            A = rand_pd_diag_mat(Float64, n)
             D = rand_pd_mat(Float64, 2nhist)
             B = randn(n, 2nhist)
             Σ = Pathfinder.WoodburyPDMat(A, B, D)
@@ -60,7 +70,8 @@
 
             seed = 42
             rng = Random.seed!(Random.default_rng(), seed)
-            x, logpx = @inferred Pathfinder.rand_and_logpdf(rng, dist, ndraws)
+            x, logpx = @inferred Pathfinder.rand_and_logpdf!(rng, dist, draws)
+            @test x === draws
             Random.seed!(rng, seed)
             x2 = rand(rng, dist, ndraws)
             logpx2 = logpdf(dist, x2)
@@ -72,7 +83,7 @@
             n = 10
             ndraws = 300_000
             nhist = 4
-            A = rand_pd_diag_mat(Float64, 10)
+            A = rand_pd_diag_mat(Float64, n)
             D = rand_pd_mat(Float64, 2nhist)
             B = randn(n, 2nhist)
 
@@ -108,13 +119,16 @@
         end
 
         @testset "Normal" begin
+            ndraws = 20
             σ = rand() * 10
             μ = randn()
             dist = Normal(μ, σ)
 
             seed = 42
             rng = Random.seed!(Random.default_rng(), seed)
-            x, logpx = @inferred Pathfinder.rand_and_logpdf(rng, dist, ndraws)
+            draws = Matrix{Float64}(undef, 1, ndraws)
+            x, logpx = @inferred Pathfinder.rand_and_logpdf!(rng, dist, draws)
+            @test x === draws
             Random.seed!(rng, seed)
             x2 = rand(rng, dist, ndraws)
             logpx2 = logpdf.(dist, x2)
diff --git a/test/runtests.jl b/test/runtests.jl
index b03bccbf..5cd8ecda 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,7 +9,7 @@ Random.seed!(0)
     include("transducers.jl")
     include("woodbury.jl")
     include("optimize.jl")
-    include("inverse_hessian.jl")
+    include("lbfgs.jl")
     include("mvnormal.jl")
     include("elbo.jl")
     include("resample.jl")
diff --git a/test/singlepath.jl b/test/singlepath.jl
index 0e21ec69..aa03fcea 100644
--- a/test/singlepath.jl
+++ b/test/singlepath.jl
@@ -16,11 +16,7 @@ using Transducers
     # here pathfinder finds the exact solution after 1 iteration
     logp(x) = -sum(abs2, x) / 2
     ndraws = 100
-    rngs = if VERSION ≥ v"1.7"
-        [MersenneTwister(), Random.default_rng()]
-    else
-        [MersenneTwister()]
-    end
+    rngs = [MersenneTwister(), Random.default_rng()]
     seed = 42
     @testset for dim in [1, 5, 10, 100], rng in rngs
         executor = rng isa MersenneTwister ? SequentialEx() : ThreadedEx()
@@ -83,11 +79,7 @@ using Transducers
         dim = 5
         ℓ = build_logdensityproblem(logp, dim, 2)
         ndraws_elbo = 100
-        rngs = if VERSION ≥ v"1.7"
-            [MersenneTwister(), Random.default_rng()]
-        else
-            [MersenneTwister()]
-        end
+        rngs = [MersenneTwister(), Random.default_rng()]
         x = randn(dim)
         seed = 38
         optimizer = Optim.LBFGS(; m=6)
diff --git a/test/test_utils.jl b/test/test_utils.jl
index f2120029..80cacdba 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -14,16 +14,10 @@ rand_pd_diag_mat(rng, T, n) = Diagonal(rand(rng, T, n))
 rand_pd_diag_mat(T, n) = rand_pd_diag_mat(Random.GLOBAL_RNG, T, n)
 
 # defined for testing purposes
-function Pathfinder.rand_and_logpdf(rng, dist, ndraws)
-    x = rand(rng, dist, ndraws)
-    if x isa AbstractVector
-        xmat = permutedims(x)
-        logpx = Distributions.logpdf.(dist, x)
-    else
-        xmat = x
-        logpx = Distributions.logpdf(dist, x)
-    end
-    return xmat, logpx
+# (generic fallback for univariate `dist` only: draws fill `x` elementwise)
+function Pathfinder.rand_and_logpdf!(rng, dist, x)
+    rand!(rng, dist, x)
+    logpx = Distributions.logpdf.(dist, vec(x))
+    return x, logpx
 end
 
 # banana distribution