diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index d836c333..6e07c96d 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -20,7 +20,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.6'
+          - '1.8'
           - '1'
           - 'pre'
         os:
@@ -68,7 +68,7 @@ jobs:
         include:
           - version: '1'
             downgrade: false
-          - version: '1.7'
+          - version: '1.8'
             downgrade: true
     steps:
       - uses: actions/checkout@v4
diff --git a/Project.toml b/Project.toml
index f6ef830d..61c5a4d8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "Pathfinder"
 uuid = "b1d3bc72-d0e7-4279-b92f-7fa5d6d2d454"
 authors = ["Seth Axen and contributors"]
-version = "0.9.5"
+version = "0.9.6"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -23,7 +23,6 @@ SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 Transducers = "28d57a85-8fef-5791-bfe6-a80928e7c999"
-UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
 [weakdeps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
@@ -63,8 +62,7 @@ Statistics = "1.6"
 StatsBase = "0.33.7, 0.34"
 Transducers = "0.4.81"
 Turing = "0.31.4, 0.32, 0.33, 0.34, 0.35"
-UnPack = "1"
-julia = "1.6"
+julia = "1.8"
 
 [extras]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
diff --git a/src/Pathfinder.jl b/src/Pathfinder.jl
index 5ba23927..d79bfc1c 100644
--- a/src/Pathfinder.jl
+++ b/src/Pathfinder.jl
@@ -19,7 +19,6 @@ using SciMLBase: SciMLBase
 using Statistics: Statistics
 using StatsBase: StatsBase
 using Transducers: Transducers
-using UnPack: @unpack
 
 export PathfinderResult, MultiPathfinderResult
 export pathfinder, multipathfinder
@@ -43,7 +42,7 @@ default_ad() = ADTypes.AutoForwardDiff()
 include("transducers.jl")
 include("woodbury.jl")
 include("optimize.jl")
-include("inverse_hessian.jl")
+include("lbfgs.jl")
 include("mvnormal.jl")
 include("elbo.jl")
 include("resample.jl")
diff --git a/src/elbo.jl b/src/elbo.jl
index e7190e57..6d5c0cd0 100644
--- a/src/elbo.jl
+++ b/src/elbo.jl
@@ -1,24 +1,30 @@
-function maximize_elbo(rng, logp, dists, ndraws, executor)
+function maximize_elbo(rng, logp, dists, ndraws, executor; save_samples::Bool=true)
+    dim = isempty(dists) ? 0 : length(first(dists))
+    Tdraws = Matrix{eltype(eltype(dists))}
+    draws = save_samples ? nothing : Tdraws(undef, (dim, ndraws))
     EE = Core.Compiler.return_type(
-        elbo_and_samples, Tuple{typeof(rng),typeof(logp),eltype(dists),Int}
+        elbo_and_samples!, Tuple{Tdraws,typeof(rng),typeof(logp),eltype(dists)}
     )
     estimates = similar(dists, EE)
     isempty(estimates) && return 0, estimates
+
     Folds.map!(estimates, dists, executor) do dist
-        return elbo_and_samples(rng, logp, dist, ndraws)
+        _draws = draws === nothing ? Tdraws(undef, (dim, ndraws)) : draws
+        return elbo_and_samples!(_draws, rng, logp, dist; save_samples)
     end
     _, iteration_opt = _findmax(estimates |> Transducers.Map(est -> est.value))
     return iteration_opt, estimates
 end
 
-function elbo_and_samples(rng, logp, dist, ndraws)
-    ϕ, logqϕ = rand_and_logpdf(rng, dist, ndraws)
+function elbo_and_samples!(ϕ, rng, logp, dist; save_samples::Bool=true)
+    ϕ, logqϕ = rand_and_logpdf!(rng, dist, ϕ)
     logpϕ = similar(logqϕ)
     logpϕ .= logp.(eachcol(ϕ))
     logr = logpϕ - logqϕ
     elbo = Statistics.mean(logr)
     elbo_se = sqrt(Statistics.var(logr) / length(logr))
-    return ELBOEstimate(elbo, elbo_se, ϕ, logpϕ, logqϕ, logr)
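+    # with `save_samples=false`, keep only an empty matrix of matching type so the
+    # (possibly shared) scratch buffer `ϕ` is not captured in the returned estimate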
+    ϕ_save = save_samples ? copyto!(similar(ϕ), ϕ) : similar(ϕ, map(zero, size(ϕ)))
+    return ELBOEstimate(elbo, elbo_se, ϕ_save, logpϕ, logqϕ, logr)
 end
 
 struct ELBOEstimate{T,P,L<:AbstractVector{T}}
diff --git a/src/inverse_hessian.jl b/src/inverse_hessian.jl
deleted file mode 100644
index 273e5948..00000000
--- a/src/inverse_hessian.jl
+++ /dev/null
@@ -1,134 +0,0 @@
-
-# eq 4.9
-# Gilbert, J.C., Lemaréchal, C. Some numerical experiments with variable-storage quasi-Newton algorithms.
-# Mathematical Programming 45, 407–435 (1989). https://doi.org/10.1007/BF01589113
-function gilbert_init(α, s, y)
-    a = dot(y, Diagonal(α), y)
-    b = dot(y, s)
-    c = dot(s, inv(Diagonal(α)), s)
-    return @. b / (a / α + y^2 - (a / c) * (s / α)^2)
-end
-
-"""
-    lbfgs_inverse_hessians(
-        θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12
-    ) -> Tuple{Vector{WoodburyPDMat},Int}
-
-From an L-BFGS trajectory and gradients, compute the inverse Hessian approximations at each point.
-
-Given positions `θs` with gradients `∇logpθs`, construct LBFGS inverse Hessian
-approximations with the provided `history_length`.
-
-The 2nd returned value is the number of BFGS updates to the inverse Hessian matrices that
-were rejected due to keeping the inverse Hessian positive definite.
-"""
-function lbfgs_inverse_hessians(θs, ∇logpθs; Hinit=gilbert_init, history_length=5, ϵ=1e-12)
-    L = length(θs) - 1
-    θ = θs[1]
-    ∇logpθ = ∇logpθs[1]
-    n = length(θ)
-
-    # allocate caches/containers
-    history_ind = 0 # index of last set history entry
-    history_length_effective = 0 # length of history so far
-    s = similar(θ) # cache for BFGS update, i.e. sₗ = θₗ₊₁ - θₗ = -λ Hₗ ∇logpθₗ
-    y = similar(∇logpθ) # cache for yₗ = ∇logpθₗ₊₁ - ∇logpθₗ = Hₗ₊₁ \ s₁ (secant equation)
-    S = similar(s, n, min(history_length, L)) # history of s
-    Y = similar(y, n, min(history_length, L)) # history of y
-    α = fill!(similar(θ), true) # diag(H₀)
-    H = lbfgs_inverse_hessian(Diagonal(α), S, Y, history_ind, history_length_effective) # H₀ = I
-    Hs = [H] # trace of H
-
-    num_bfgs_updates_rejected = 0
-    for l in 1:L
-        θlp1, ∇logpθlp1 = θs[l + 1], ∇logpθs[l + 1]
-        s .= θlp1 .- θ
-        y .= ∇logpθ .- ∇logpθlp1
-        if dot(y, s) > ϵ * sum(abs2, y) # curvature is positive, safe to update inverse Hessian
-            # add s and y to history
-            history_ind = mod1(history_ind + 1, history_length)
-            history_length_effective = max(history_ind, history_length_effective)
-            S[1:n, history_ind] .= s
-            Y[1:n, history_ind] .= y
-
-            # initial diagonal estimate of H
-            α = Hinit(α, s, y)
-        else
-            num_bfgs_updates_rejected += 1
-        end
-
-        θ, ∇logpθ = θlp1, ∇logpθlp1
-        H = lbfgs_inverse_hessian(Diagonal(α), S, Y, history_ind, history_length_effective)
-        push!(Hs, H)
-    end
-
-    return Hs, num_bfgs_updates_rejected
-end
-
-"""
-    lbfgs_inverse_hessian(H₀, S₀, Y₀, history_ind, history_length) -> WoodburyPDMat
-
-Compute approximate inverse Hessian initialized from `H₀` from history stored in `S₀` and `Y₀`.
-
-`history_ind` indicates the column in `S₀` and `Y₀` that was most recently added to the
-history, while `history_length` indicates the number of first columns in `S₀` and `Y₀`
-currently being used for storing history.
-`S = S₀[:, history_ind+1:history_length; 1:history_ind]` reorders the columns of `S₀` so that the
-oldest is first and newest is last.
-
-From Theorem 2.2 of [^Byrd1994], the expression for the inverse Hessian ``H`` is
-
-```math
-\\begin{align}
-B &= \\begin{pmatrix}H_0 Y & S\\end{pmatrix}\\\\
-R &= \\operatorname{triu}(S^\\mathrm{T} Y)\\\\
-E &= I \\circ R\\\\
-D &= \\begin{pmatrix}
-    0 & -R^{-1}\\\\
-    -R^{-\\mathrm{T}} & R^\\mathrm{-T} (E + Y^\\mathrm{T} H_0 Y ) R^\\mathrm{-1}\\\\
-\\end{pmatrix}\\
-H &= H_0 + B D B^\\mathrm{T}
-\\end{align}
-```
-
-[^Byrd1994]: Byrd, R.H., Nocedal, J. & Schnabel, R.B.
-    Representations of quasi-Newton matrices and their use in limited memory methods.
-    Mathematical Programming 63, 129–156 (1994).
-    doi: [10.1007/BF01582063](https://doi.org/10.1007/BF01582063)
-"""
-function lbfgs_inverse_hessian(H₀::Diagonal, S0, Y0, history_ind, history_length)
-    J = history_length
-    α = H₀.diag
-    B = similar(α, size(α, 1), 2J)
-    D = fill!(similar(α, 2J, 2J), false)
-    iszero(J) && return WoodburyPDMat(H₀, B, D)
-
-    hist_inds = [(history_ind + 1):history_length; 1:history_ind]
-    @views begin
-        S = S0[:, hist_inds]
-        Y = Y0[:, hist_inds]
-        B₁ = B[:, 1:J]
-        B₂ = B[:, (J + 1):(2J)]
-        D₁₁ = D[1:J, 1:J]
-        D₁₂ = D[1:J, (J + 1):(2J)]
-        D₂₁ = D[(J + 1):(2J), 1:J]
-        D₂₂ = D[(J + 1):(2J), (J + 1):(2J)]
-    end
-
-    mul!(B₁, Diagonal(α), Y)
-    copyto!(B₂, S)
-    mul!(D₂₂, S', Y)
-    triu!(D₂₂)
-    R = UpperTriangular(D₂₂)
-    nRinv = UpperTriangular(D₁₂)
-    copyto!(nRinv, -I)
-    ldiv!(R, nRinv)
-    nRinv′ = LowerTriangular(copyto!(D₂₁, nRinv'))
-    tril!(D₂₂) # eliminate all but diagonal
-    mul!(D₂₂, Y', B₁, true, true)
-    LinearAlgebra.copytri!(D₂₂, 'U', false, false)
-    rmul!(D₂₂, nRinv)
-    lmul!(nRinv′, D₂₂)
-
-    return WoodburyPDMat(H₀, B, D)
-end
diff --git a/src/lbfgs.jl b/src/lbfgs.jl
new file mode 100644
index 00000000..0c2dd697
--- /dev/null
+++ b/src/lbfgs.jl
@@ -0,0 +1,208 @@
+# eq 4.9
+# Gilbert, J.C., Lemaréchal, C. Some numerical experiments with variable-storage quasi-Newton algorithms.
+# Mathematical Programming 45, 407–435 (1989). https://doi.org/10.1007/BF01589113
+function gilbert_invH_init!(α, s, y)
+    a = dot(y, Diagonal(α), y)
+    b = dot(y, s)
+    c = dot(s, inv(Diagonal(α)), s)
+    @. α = b / (a / α + y^2 - (a / c) * (s / α)^2)
+    return α
+end
+
+# history storage for L-BFGS and accessor methods
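+# (the `const` fields in the mutable structs below require Julia ≥ 1.8, which is
+# why the compat lower bound is raised to `julia = "1.8"`)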
+mutable struct LBFGSHistory{T<:Real}
+    const position_diffs::Matrix{T}
+    const gradient_diffs::Matrix{T}
+    const history_perm::Vector{Int}
+    history_length::Int
+end
+
+function LBFGSHistory{T}(n::Int, history_length::Int) where {T<:Real}
+    position_diffs = Matrix{T}(undef, n, history_length + 1)
+    gradient_diffs = Matrix{T}(undef, n, history_length + 1)
+    history_perm = collect(1:(history_length + 1))
+    return LBFGSHistory{T}(position_diffs, gradient_diffs, history_perm, 0)
+end
+
+function _history_matrices(history::LBFGSHistory)
+    (; position_diffs, gradient_diffs, history_perm, history_length) = history
+    history_inds = @view history_perm[(end - history_length + 1):end]
+    return @views (position_diffs[:, history_inds], gradient_diffs[:, history_inds])
+end
+
+function _propose_history_update!(
+    history::LBFGSHistory, position, position_new, gradient, gradient_new
+)
+    (; history_perm) = history
+    queue_ind = first(history_perm)
+    history.position_diffs[:, queue_ind] .= position_new .- position
+    history.gradient_diffs[:, queue_ind] .= gradient .- gradient_new
+    return history
+end
+
+function _proposed_history_updates(history::LBFGSHistory)
+    (; position_diffs, gradient_diffs, history_perm) = history
+    queue_ind = first(history_perm)
+    return @views position_diffs[:, queue_ind], gradient_diffs[:, queue_ind]
+end
+
+function _accept_history_update!(history::LBFGSHistory)
+    (; history_perm) = history
+    circshift!(history_perm, -1)
+    history.history_length = min(history.history_length + 1, length(history_perm) - 1)
+    return history
+end
+
+function _has_positive_curvature(pos_diff, grad_diff, ϵ)
+    return dot(grad_diff, pos_diff) > ϵ * sum(abs2, grad_diff)
+end
+
+# cache for L-BFGS inverse Hessian approximations
+struct LBFGSInverseHessianCache{T<:Real}
+    diag_invH0::Vector{T}
+    B::Matrix{T}
+    D::Matrix{T}
+end
+
+function LBFGSInverseHessianCache{T}(n::Int, history_length::Int) where {T<:Real}
+    diag_invH0 = ones(T, n)
+    B = Matrix{T}(undef, n, 2 * history_length)
+    D = zeros(T, 2 * history_length, 2 * history_length)
+    return LBFGSInverseHessianCache(diag_invH0, B, D)
+end
+
+# state for each iteration of L-BFGS optimization
+mutable struct LBFGSState{T<:Real,IH<:WoodburyPDMat{T}}
+    const x::Vector{T}
+    fx::T
+    const ∇fx::Vector{T}
+    const history::LBFGSHistory{T}
+    const cache::LBFGSInverseHessianCache{T}
+    invH::IH
+    num_bfgs_updates_rejected::Int
+end
+function LBFGSState(x, fx, ∇fx, history_length::Int)
+    T = Base.promote_eltype(x, fx, ∇fx)
+    n = length(x)
+    history = LBFGSHistory{T}(n, history_length)
+    cache = LBFGSInverseHessianCache{T}(n, history_length)
+    invH = lbfgs_inverse_hessian!(cache, history) # H₀ = I
+    return LBFGSState{T,typeof(invH)}(copy(x), fx, copy(∇fx), history, cache, invH, 0)
+end
+
+function _update_state!(state::LBFGSState, x, fx, ∇fx, invH_init!, ϵ)
+    _propose_history_update!(state.history, state.x, x, state.∇fx, ∇fx)
+    pos_diff, grad_diff = _proposed_history_updates(state.history)
+
+    # only update the inverse Hessian if the update preserves positive definiteness
+    if _has_positive_curvature(pos_diff, grad_diff, ϵ)
+        _accept_history_update!(state.history)
+        invH_init!(state.cache.diag_invH0, pos_diff, grad_diff)
+        state.invH = lbfgs_inverse_hessian!(state.cache, state.history)
+    else
+        state.num_bfgs_updates_rejected += 1
+    end
+
+    copyto!(state.x, x)
+    state.fx = fx
+    copyto!(state.∇fx, ∇fx)
+    return state
+end
+
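+# Illustrative sketch of the cycle `_update_state!` runs for each iterate
+# (hypothetical inputs; the public entry point is `lbfgs_inverse_hessians` below):
+#
+#     state = LBFGSState(x0, fx0, ∇fx0, history_length)
+#     for (x, fx, ∇fx) in trajectory
+#         _update_state!(state, x, fx, ∇fx, gilbert_invH_init!, 1e-12)
+#         state.invH  # WoodburyPDMat approximation of H⁻¹ at this iterate
+#     end
+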
+"""
+    lbfgs_inverse_hessians(
+        θs, logpθs, ∇logpθs; (invH_init!)=gilbert_invH_init!, history_length=5, ϵ=1e-12
+    ) -> Tuple{Vector{WoodburyPDMat},Int}
+
+From an L-BFGS trajectory, log-densities, and gradients, compute the inverse Hessian
+approximations at each point.
+
+Given positions `θs` with log-densities `logpθs` and gradients `∇logpθs`, construct
+L-BFGS inverse Hessian approximations with the provided `history_length`.
+
+The second returned value is the number of BFGS updates that were rejected in order to
+keep the inverse Hessian approximations positive definite.
+"""
+function lbfgs_inverse_hessians(
+    θs, logpθs, ∇logpθs; (invH_init!)=gilbert_invH_init!, history_length=5, ϵ=1e-12
+)
+    L = length(θs) - 1
+    history_length = min(history_length, L)
+    state = LBFGSState(first(θs), first(logpθs), first(∇logpθs), history_length)
+    invHs = [deepcopy(state.invH)] # trace of invH
+
+    for (θ, logpθ, ∇logpθ) in Iterators.drop(zip(θs, logpθs, ∇logpθs), 1)
+        _update_state!(state, θ, logpθ, ∇logpθ, invH_init!, ϵ)
+        push!(invHs, deepcopy(state.invH))
+    end
+
+    return invHs, state.num_bfgs_updates_rejected
+end
+
+"""
+    lbfgs_inverse_hessian!(cache::LBFGSInverseHessianCache, history::LBFGSHistory) -> WoodburyPDMat
+
+Compute an approximate inverse Hessian from the update history stored in `history`,
+using the preallocated storage in `cache`.
+
+`cache` stores the diagonal of the initial inverse Hessian ``H₀^{-1}`` and the matrices
+``B`` and ``D``, which are overwritten here and used to construct the returned
+approximate inverse Hessian ``H^{-1}``.
+
+From Theorem 2.2 of [^Byrd1994], the expression for the inverse Hessian ``H^{-1}`` is
+
+```math
+\\begin{align}
+B &= \\begin{pmatrix}H_0^{-1} Y & S\\end{pmatrix}\\\\
+R &= \\operatorname{triu}(S^\\mathrm{T} Y)\\\\
+E &= I \\circ R\\\\
+D &= \\begin{pmatrix}
+    0 & -R^{-1}\\\\
+    -R^{-\\mathrm{T}} & R^{-\\mathrm{T}} (E + Y^\\mathrm{T} H_0^{-1} Y) R^{-1}\\\\
+\\end{pmatrix}\\\\
+H^{-1} &= H_0^{-1} + B D B^\\mathrm{T}
+\\end{align}
+```
+
+[^Byrd1994]: Byrd, R.H., Nocedal, J. & Schnabel, R.B.
+    Representations of quasi-Newton matrices and their use in limited memory methods.
+    Mathematical Programming 63, 129–156 (1994).
+    doi: [10.1007/BF01582063](https://doi.org/10.1007/BF01582063)
+"""
+function lbfgs_inverse_hessian!(cache::LBFGSInverseHessianCache, history::LBFGSHistory)
+    (; B, D, diag_invH0) = cache
+    return lbfgs_inverse_hessian!(B, D, diag_invH0, history)
+end
+function lbfgs_inverse_hessian!(B_cache, D_cache, diag_invH0, history)
+    S, Y = _history_matrices(history)
+    J = history.history_length
+    B = @view B_cache[:, 1:(2J)]
+    D = @view D_cache[1:(2J), 1:(2J)]
+    invH0 = Diagonal(diag_invH0)
+    iszero(J) && return WoodburyPDMat(invH0, B, D)
+
+    @views begin
+        B₁ = B[:, 1:J]
+        B₂ = B[:, (J + 1):(2J)]
+        D₁₁ = D[1:J, 1:J]
+        D₁₂ = D[1:J, (J + 1):(2J)]
+        D₂₁ = D[(J + 1):(2J), 1:J]
+        D₂₂ = D[(J + 1):(2J), (J + 1):(2J)]
+    end
+
+    fill!(D₁₁, false)
+    mul!(B₁, invH0, Y)
+    copyto!(B₂, S)
+    mul!(D₂₂, S', Y)
+    triu!(D₂₂)
+    R = UpperTriangular(D₂₂)
+    nRinv = UpperTriangular(D₁₂)
+    copyto!(nRinv, -I)
+    ldiv!(R, nRinv)
+    nRinv′ = LowerTriangular(copyto!(D₂₁, nRinv'))
+    tril!(D₂₂) # eliminate all but diagonal
+    mul!(D₂₂, Y', B₁, true, true)
+    LinearAlgebra.copytri!(D₂₂, 'U', false, false)
+    rmul!(D₂₂, nRinv)
+    lmul!(nRinv′, D₂₂)
+
+    return WoodburyPDMat(invH0, B, D)
+end
diff --git a/src/mvnormal.jl b/src/mvnormal.jl
index c89f978c..e6ab6a22 100644
--- a/src/mvnormal.jl
+++ b/src/mvnormal.jl
@@ -11,8 +11,8 @@
 point are returned.
 
 The 2nd returned value is the number of BFGS updates to the inverse Hessian matrices that
 were rejected due to keeping the inverse Hessian positive definite.
 """
-function fit_mvnormals(θs, ∇logpθs; kwargs...)
-    Σs, num_bfgs_updates_rejected = lbfgs_inverse_hessians(θs, ∇logpθs; kwargs...)
+function fit_mvnormals(θs, logpθs, ∇logpθs; kwargs...)
+    Σs, num_bfgs_updates_rejected = lbfgs_inverse_hessians(θs, logpθs, ∇logpθs; kwargs...)
     trans = Transducers.MapSplat() do Σ, ∇logpθ, θ
         μ = muladd(Σ, ∇logpθ, θ)
         return Distributions.MvNormal(μ, Σ)
@@ -22,16 +22,15 @@
     return dists, num_bfgs_updates_rejected
 end
 
-# faster than computing `logpdf` and `rand` independently
-function rand_and_logpdf(rng, dist::Distributions.MvNormal, ndraws)
-    μ = dist.μ
-    Σ = dist.Σ
-    N = length(μ)
+# faster than computing `logpdf` and `rand!` independently
+function rand_and_logpdf!(rng, dist::Distributions.MvNormal, x)
+    (; μ, Σ) = dist
+    N = length(dist)
 
     # draw points
-    u = Random.randn!(rng, similar(μ, N, ndraws))
-    unormsq = vec(sum(abs2, u; dims=1))
-    x = PDMats.unwhiten!(u, Σ, u)
+    Random.randn!(rng, x)
+    unormsq = vec(sum(abs2, x; dims=1))
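+    # for x = μ + Lu with Σ = LLᵀ, logpdf(dist, x) = -(N*log(2π) + logdet(Σ) + ‖u‖²)/2,
+    # so caching ‖u‖² for each draw avoids re-whitening the draws afterwards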
+    PDMats.unwhiten!(x, Σ, x)
     x .+= μ
 
     # compute log density at each point
diff --git a/src/optimize.jl b/src/optimize.jl
index 84e6677d..4ce508bf 100644
--- a/src/optimize.jl
+++ b/src/optimize.jl
@@ -101,7 +101,7 @@ end
 @static if isdefined(Optimization, :OptimizationState)
     # Optimization v3.21.0 and later
     function (cb::OptimizationCallback)(state::Optimization.OptimizationState, args...)
-        @unpack (
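+        # NamedTuple destructuring `(; ...) = cb` replaces `UnPack.@unpack`; the
+        # syntax requires Julia ≥ 1.7, within the package's new lower bound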
+        (;
             xs, fxs, ∇fxs, progress_name, progress_id, maxiters, callback, fail_on_nonfinite
         ) = cb
         ret = callback !== nothing && callback(state, args...)
@@ -127,7 +127,7 @@
 else
     # Optimization v3.20.X and earlier
     function (cb::OptimizationCallback)(x, nfx, args...)
-        @unpack (
+        (;
             xs, fxs, ∇fxs, progress_name, progress_id, maxiters, callback, fail_on_nonfinite
         ) = cb
         ret = callback !== nothing && callback(x, nfx, args...)
diff --git a/src/singlepath.jl b/src/singlepath.jl
index c126af10..41ae4a0f 100644
--- a/src/singlepath.jl
+++ b/src/singlepath.jl
@@ -199,7 +199,7 @@ function pathfinder(
             kwargs...,
         )
     end
-    @unpack (
+    (;
         itry,
         success,
         optim_prob,
@@ -302,7 +302,7 @@ function _pathfinder(
     # fit mv-normal distributions to trajectory
     fit_distributions, num_bfgs_updates_rejected = fit_mvnormals(
-        optim_trace.points, optim_trace.gradients; history_length
+        optim_trace.points, optim_trace.log_densities, optim_trace.gradients; history_length
     )
 
     # find ELBO-maximizing distribution
diff --git a/test/elbo.jl b/test/elbo.jl
index 0d24e210..25b7a45f 100644
--- a/test/elbo.jl
+++ b/test/elbo.jl
@@ -5,7 +5,7 @@ using Test
 using Transducers
 
 @testset "ELBO estimation" begin
-    @testset "elbo_and_samples" begin
+    @testset "elbo_and_samples!" begin
         σ_target = 0.08
         target_dist = Normal(0, σ_target)
         logp(x) = logpdf(target_dist, x[1])
@@ -13,18 +13,28 @@
         rng = Random.seed!(Random.default_rng(), 42)
         @testset for σ in σs
             dist = Normal(0, σ)
-            est = @inferred Pathfinder.elbo_and_samples(rng, logp, dist, 1_000_000)
+            draws = Matrix{Float64}(undef, length(dist), 1_000_000)
+            Random.seed!(rng, 42)
+            est = @inferred Pathfinder.elbo_and_samples!(draws, rng, logp, dist)
             @test est isa Pathfinder.ELBOEstimate
             # explicit elbo calculation
             r = σ / σ_target
             elbo_exp = (1 - r^2) / 2 + log(r)
             @test est.value ≈ elbo_exp atol = 3 * est.std_err
+            @test est.draws !== draws
             @test est.log_densities_target ≈ logp.(eachcol(est.draws))
             @test est.log_densities_fit ≈ logpdf.(dist, first.(eachcol(est.draws)))
             @test est.log_density_ratios == est.log_densities_target - est.log_densities_fit
             @test est.value ≈ mean(est.log_density_ratios)
             @test est.std_err ≈ std(est.log_density_ratios) / sqrt(1_000_000)
+
+            # rerunning from the same seed with `save_samples=false` must reproduce the
+            # estimate exactly while storing only an empty draws matrix
+            fill!(draws, 0)
+            Random.seed!(rng, 42)
+            est_nosave = @inferred Pathfinder.elbo_and_samples!(
+                draws, rng, logp, dist; save_samples=false
+            )
+            @test est_nosave.value == est.value
+            @test est_nosave.std_err == est.std_err
+            @test est_nosave.draws !== draws
+            @test isempty(est_nosave.draws)
         end
     end
 
@@ -34,11 +44,7 @@ using Transducers
         logp(x) = logpdf(target_dist, x[1])
         σs = [1e-3, 0.05, σ_target, 1.0, 1.1, 1.2, 5.0, 10.0]
         dists = Normal.(0, σs)
-        if VERSION ≥ v"1.7.0"
-            executors = [SequentialEx(), ThreadedEx()]
-        else
-            executors = [SequentialEx()]
-        end
+        executors = [SequentialEx()]
         @testset "$executor" for executor in executors
             rng = Random.seed!(Random.default_rng(), 42)
             lopt, estimates = @inferred Pathfinder.maximize_elbo(
diff --git a/test/inverse_hessian.jl b/test/inverse_hessian.jl
deleted file mode 100644
index d9185fb4..00000000
--- a/test/inverse_hessian.jl
+++ /dev/null
@@ -1,77 +0,0 @@
-using LinearAlgebra
-using LogDensityProblems
-using Optim
-using Pathfinder
-using SciMLBase
-using Test
-
-function lbfgs_inverse_hessian_explicit(H₀, S, Y)
-    B = [H₀ * Y S]
-    R = triu(S'Y)
-    E = Diagonal(R)
-    D = [0*I -inv(R); -inv(R)' R' \ (E + Y' * H₀ * Y)/R]
-    return H₀ + B * D * B'
-end
-
-@testset "L-BFGS inverse Hessian construction" begin
-    @testset "lbfgs_inverse_hessian" begin
-        #! format: off
-        S0 = [[1.719573, 3.294037, -2.008877, 3.901275, 0.214324, 0.400382, 0.113598, -1.804262, 0.465563, 2.465748], [-0.445476, -0.514915, 1.069617, -1.505506, -0.036535, -0.11386, -0.029737, 0.579662, -0.118694, -1.726258], [-0.013354, 0.1981, 0.081886, -0.194172, 0.010499, -0.007944, -0.001039, 0.061604, -0.002621, 0.11145], [0.00648, 0.255961, -0.011901, -0.059042, 0.013587, -0.002064, 0.000302, 0.023457, 0.00257, -0.002343], [-0.016408, 0.015005, 0.009654, 0.016735, -0.000245, -0.002825, -0.00106, 0.004272, -0.004578, 0.002275]]
-        Y0 = -[[-2.357935, -3.343312, 4.659008, -7.065433, -0.228045, -0.584164, -0.156837, 2.863289, -0.631753, -7.152021], [0.610851, 0.522617, -2.480668, 2.726559, 0.038874, 0.166123, 0.041055, -0.9199, 0.161064, 5.007094], [0.018312, -0.201064, -0.189911, 0.351657, -0.011171, 0.011591, 0.001434, -0.097763, 0.003557, -0.323265], [-0.008886, -0.25979, 0.027601, 0.106929, -0.014456, 0.003011, -0.000418, -0.037225, -0.003488, 0.006795], [0.022499, -0.015229, -0.02239, -0.030308, 0.00026, 0.004122, 0.001463, -0.00678, 0.006212, -0.006598]]
-        #! format: on
-        S = reduce(hcat, S0)
-        Y = reduce(hcat, Y0)
-        N, history_length = size(S)
-        α = rand(N)
-        H₀ = Diagonal(α)
-
-        @test @inferred(Pathfinder.lbfgs_inverse_hessian(H₀, S, Y, 0, 0)) ≈ H₀
-
-        H = Pathfinder.lbfgs_inverse_hessian(H₀, S, Y, 3, 3)
-        Hexp = lbfgs_inverse_hessian_explicit(H₀, S[:, 1:3], Y[:, 1:3])
-        @test H ≈ Hexp
-
-        S2 = S[:, [4:history_length; 1:3]]
-        Y2 = Y[:, [4:history_length; 1:3]]
-        ilast = argmax(axes(S, 2)[[4:history_length; 1:3]])
-        H = Pathfinder.lbfgs_inverse_hessian(H₀, S2, Y2, ilast, history_length)
-        Hexp = lbfgs_inverse_hessian_explicit(H₀, S, Y)
-        @test H ≈ Hexp
-
-        H = Pathfinder.lbfgs_inverse_hessian(H₀, S, Y, history_length, history_length)
-        Hexp = lbfgs_inverse_hessian_explicit(H₀, S, Y)
-        @test H ≈ Hexp
-    end
-
-    @testset "lbfgs_inverse_hessians" begin
-        n = 10
-        history_length = 5
-        logp(x) = logp_banana(x)
-        nocedal_wright_scaling(α, s, y) = fill!(similar(α), dot(y, s) / sum(abs2, y))
-        θ₀ = 10 * randn(n)
-
-        ℓ = build_logdensityproblem(logp, n, 2)
-        fun = Pathfinder.build_optim_function(
-            ℓ, SciMLBase.NoAD(), LogDensityProblems.capabilities(ℓ)
-        )
-        prob = SciMLBase.OptimizationProblem(fun, θ₀)
-        optimizer = Optim.LBFGS(;
-            m=history_length, linesearch=Optim.LineSearches.MoreThuente()
-        )
-        sol, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
-
-        # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
-        Hs, num_bfgs_updates_rejected = Pathfinder.lbfgs_inverse_hessians(
-            optim_trace.points,
-            optim_trace.gradients;
-            history_length,
-            Hinit=nocedal_wright_scaling,
-        )
-        ss = diff(optim_trace.points)
-        ps = (Hs .* optim_trace.gradients)[1:(end - 1)]
-        # check that next direction computed from Hessian is the same as the actual
-        # direction that was taken
-        @test all(≈(1), dot.(ps, ss) ./ norm.(ss) ./ norm.(ps))
-        @test num_bfgs_updates_rejected == 0
-    end
-end
diff --git a/test/lbfgs.jl b/test/lbfgs.jl
new file mode 100644
index 00000000..ade1d658
--- /dev/null
+++ b/test/lbfgs.jl
@@ -0,0 +1,98 @@
+using LinearAlgebra
+using LogDensityProblems
+using Optim
+using Pathfinder
+using SciMLBase
+using Test
+
+function lbfgs_inverse_hessian_explicit(H₀, S, Y)
+    B = [H₀ * Y S]
+    R = triu(S'Y)
+    E = Diagonal(R)
+    D = [0*I -inv(R); -inv(R)' R' \ (E + Y' * H₀ * Y)/R]
+    return H₀ + B * D * B'
+end
+
+@testset "L-BFGS inverse Hessian construction" begin
+    @testset "lbfgs_inverse_hessian" begin
+        #! format: off
+        S_vecs = [[1.719573, 3.294037, -2.008877, 3.901275, 0.214324, 0.400382, 0.113598, -1.804262, 0.465563, 2.465748], [-0.445476, -0.514915, 1.069617, -1.505506, -0.036535, -0.11386, -0.029737, 0.579662, -0.118694, -1.726258], [-0.013354, 0.1981, 0.081886, -0.194172, 0.010499, -0.007944, -0.001039, 0.061604, -0.002621, 0.11145], [0.00648, 0.255961, -0.011901, -0.059042, 0.013587, -0.002064, 0.000302, 0.023457, 0.00257, -0.002343], [-0.016408, 0.015005, 0.009654, 0.016735, -0.000245, -0.002825, -0.00106, 0.004272, -0.004578, 0.002275]]
+        Y_vecs = -[[-2.357935, -3.343312, 4.659008, -7.065433, -0.228045, -0.584164, -0.156837, 2.863289, -0.631753, -7.152021], [0.610851, 0.522617, -2.480668, 2.726559, 0.038874, 0.166123, 0.041055, -0.9199, 0.161064, 5.007094], [0.018312, -0.201064, -0.189911, 0.351657, -0.011171, 0.011591, 0.001434, -0.097763, 0.003557, -0.323265], [-0.008886, -0.25979, 0.027601, 0.106929, -0.014456, 0.003011, -0.000418, -0.037225, -0.003488, 0.006795], [0.022499, -0.015229, -0.02239, -0.030308, 0.00026, 0.004122, 0.001463, -0.00678, 0.006212, -0.006598]]
+        #! format: on
+        S = reduce(hcat, S_vecs)
+        Y = reduce(hcat, Y_vecs)
+        history_length = length(S_vecs)
+        N = length(first(S_vecs))
+        history = Pathfinder.LBFGSHistory{Float64}(N, history_length)
+        cache = Pathfinder.LBFGSInverseHessianCache{Float64}(N, history_length)
+        @. cache.diag_invH0 = rand()
+        invH0 = Diagonal(cache.diag_invH0)
+
+        @test @inferred(Pathfinder.lbfgs_inverse_hessian!(cache, history)) ≈ invH0
+
+        for i in 1:3
+            Pathfinder._propose_history_update!(
+                history, zero(S_vecs[i]), S_vecs[i], Y_vecs[i], zero(Y_vecs[i])
+            )
+            Pathfinder._accept_history_update!(history)
+        end
+        invH = Pathfinder.lbfgs_inverse_hessian!(cache, history)
+        invH_expected = lbfgs_inverse_hessian_explicit(invH0, S[:, 1:3], Y[:, 1:3])
+        @test invH ≈ invH_expected
+
+        S2 = S[:, [4:history_length; 1:3]]
+        Y2 = Y[:, [4:history_length; 1:3]]
+        for i in vcat(4:history_length, 1:3)
+            Pathfinder._propose_history_update!(
+                history, zero(S_vecs[i]), S_vecs[i], Y_vecs[i], zero(Y_vecs[i])
+            )
+            Pathfinder._accept_history_update!(history)
+        end
+        invH = Pathfinder.lbfgs_inverse_hessian!(cache, history)
+        invH_expected = lbfgs_inverse_hessian_explicit(invH0, S2, Y2)
+        @test invH ≈ invH_expected
+
+        for i in 1:history_length
+            Pathfinder._propose_history_update!(
+                history, zero(S_vecs[i]), S_vecs[i], Y_vecs[i], zero(Y_vecs[i])
+            )
+            Pathfinder._accept_history_update!(history)
+        end
+        invH = Pathfinder.lbfgs_inverse_hessian!(cache, history)
+        invH_expected = lbfgs_inverse_hessian_explicit(invH0, S, Y)
+        @test invH ≈ invH_expected
+    end
+
+    @testset "lbfgs_inverse_hessians" begin
+        n = 10
+        history_length = 5
+        logp(x) = logp_banana(x)
+        nocedal_wright_invH_init!(α, s, y) = fill!(α, dot(y, s) / sum(abs2, y))
+        θ₀ = 10 * randn(n)
+
+        ℓ = build_logdensityproblem(logp, n, 2)
+        fun = Pathfinder.build_optim_function(
+            ℓ, SciMLBase.NoAD(), LogDensityProblems.capabilities(ℓ)
+        )
+        prob = SciMLBase.OptimizationProblem(fun, θ₀)
+        optimizer = Optim.LBFGS(;
+            m=history_length, linesearch=Optim.LineSearches.MoreThuente()
+        )
+        sol, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
+
+        # run lbfgs_inverse_hessians with the same initialization as Optim.LBFGS
+        invHs, num_bfgs_updates_rejected = Pathfinder.lbfgs_inverse_hessians(
+            optim_trace.points,
+            optim_trace.log_densities,
+            optim_trace.gradients;
+            history_length,
+            (invH_init!)=nocedal_wright_invH_init!,
+        )
+        ss = diff(optim_trace.points)
+        ps = (invHs .* optim_trace.gradients)[1:(end - 1)]
+        # check that next direction computed from Hessian is the same as the actual
+        # direction that was taken
+        @test all(≈(1), dot.(ps, ss) ./ norm.(ss) ./ norm.(ps))
+        @test num_bfgs_updates_rejected == 0
+    end
+end
diff --git a/test/multipath.jl b/test/multipath.jl
index 42ab3c93..7088be77 100644
--- a/test/multipath.jl
+++ b/test/multipath.jl
@@ -22,11 +22,7 @@ using Transducers
     d = MvNormal(μ, Σ)
    logp(x) = logpdf(d, x)
     ℓ = build_logdensityproblem(logp, dim, 2)
-    rngs = if VERSION ≥ v"1.7"
-        [MersenneTwister(), Random.default_rng()]
-    else
-        [MersenneTwister()]
-    end
+    rngs = [MersenneTwister(), Random.default_rng()]
     seed = 76
     @testset for rng in rngs
         executor = rng isa MersenneTwister ? SequentialEx() : ThreadedEx()
diff --git a/test/mvnormal.jl b/test/mvnormal.jl
index c2604d6a..0deb8afb 100644
--- a/test/mvnormal.jl
+++ b/test/mvnormal.jl
@@ -2,6 +2,7 @@ using Distributions
 using Optim
 using Pathfinder
 using Random
+using SciMLBase
 using Test
 
 @testset "MvNormal functions" begin
@@ -17,10 +18,16 @@
         history_length = optimizer.m
         _, optim_trace = Pathfinder.optimize_with_trace(prob, optimizer)
         Σs, num_bfgs_updates_rejected1 = Pathfinder.lbfgs_inverse_hessians(
-            optim_trace.points, optim_trace.gradients; history_length
+            optim_trace.points,
+            optim_trace.log_densities,
+            optim_trace.gradients;
+            history_length,
         )
         dists, num_bfgs_updates_rejected2 = @inferred Pathfinder.fit_mvnormals(
-            optim_trace.points, optim_trace.gradients; history_length
+            optim_trace.points,
+            optim_trace.log_densities,
+            optim_trace.gradients;
+            history_length,
         )
         @test dists isa Vector{<:MvNormal{Float64,<:Pathfinder.WoodburyPDMat}}
         @test num_bfgs_updates_rejected2 == num_bfgs_updates_rejected1
@@ -28,17 +35,19 @@
         @test optim_trace.points .+ Σs .* optim_trace.gradients ≈ getproperty.(dists, :μ)
     end
 
-    @testset "rand_and_logpdf" begin
-        ndraws = 20
+    @testset "rand_and_logpdf!" begin
         @testset "MvNormal" begin
             n = 10
+            ndraws = 20
+            draws = Matrix{Float64}(undef, n, ndraws)
             Σ = rand_pd_mat(Float64, 10)
             μ = randn(n)
             dist = MvNormal(μ, Σ)
 
             seed = 42
             rng = Random.seed!(Random.default_rng(), seed)
-            x, logpx = @inferred Pathfinder.rand_and_logpdf(rng, dist, ndraws)
+            x, logpx = @inferred Pathfinder.rand_and_logpdf!(rng, dist, draws)
+            @test x === draws
             Random.seed!(rng, seed)
             x2 = rand(rng, dist, ndraws)
             logpx2 = logpdf(dist, x2)
@@ -51,7 +60,8 @@
             n = 10
             ndraws = 20
             nhist = 4
-            A = rand_pd_diag_mat(Float64, 10)
+            draws = Matrix{Float64}(undef, n, ndraws)
+            A = rand_pd_diag_mat(Float64, n)
             D = rand_pd_mat(Float64, 2nhist)
             B = randn(n, 2nhist)
             Σ = Pathfinder.WoodburyPDMat(A, B, D)
@@ -60,7 +70,8 @@
 
             seed = 42
             rng = Random.seed!(Random.default_rng(), seed)
-            x, logpx = @inferred Pathfinder.rand_and_logpdf(rng, dist, ndraws)
+            x, logpx = @inferred Pathfinder.rand_and_logpdf!(rng, dist, draws)
+            @test x === draws
             Random.seed!(rng, seed)
             x2 = rand(rng, dist, ndraws)
             logpx2 = logpdf(dist, x2)
@@ -72,7 +83,7 @@
             n = 10
             ndraws = 300_000
             nhist = 4
-            A = rand_pd_diag_mat(Float64, 10)
+            A = rand_pd_diag_mat(Float64, n)
             D = rand_pd_mat(Float64, 2nhist)
             B = randn(n, 2nhist)
 
@@ -108,13 +119,16 @@
         end
 
         @testset "Normal" begin
+            ndraws = 20
             σ = rand() * 10
             μ = randn()
             dist = Normal(μ, σ)
 
             seed = 42
             rng = Random.seed!(Random.default_rng(), seed)
-            x, logpx = @inferred Pathfinder.rand_and_logpdf(rng, dist, ndraws)
+            draws = Matrix{Float64}(undef, 1, ndraws)
+            x, logpx = @inferred Pathfinder.rand_and_logpdf!(rng, dist, draws)
+            @test x === draws
             Random.seed!(rng, seed)
             x2 = rand(rng, dist, ndraws)
             logpx2 = logpdf.(dist, x2)
diff --git a/test/runtests.jl b/test/runtests.jl
index b03bccbf..5cd8ecda 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -9,7 +9,7 @@ Random.seed!(0)
     include("transducers.jl")
     include("woodbury.jl")
     include("optimize.jl")
-    include("inverse_hessian.jl")
+    include("lbfgs.jl")
     include("mvnormal.jl")
     include("elbo.jl")
     include("resample.jl")
diff --git a/test/singlepath.jl b/test/singlepath.jl
index 0e21ec69..aa03fcea 100644
--- a/test/singlepath.jl
+++ b/test/singlepath.jl
@@ -16,11 +16,7 @@ using Transducers
     # here pathfinder finds the exact solution after 1 iteration
     logp(x) = -sum(abs2, x) / 2
     ndraws = 100
-    rngs = if VERSION ≥ v"1.7"
-        [MersenneTwister(), Random.default_rng()]
-    else
-        [MersenneTwister()]
-    end
+    rngs = [MersenneTwister(), Random.default_rng()]
     seed = 42
     @testset for dim in [1, 5, 10, 100], rng in rngs
         executor = rng isa MersenneTwister ? SequentialEx() : ThreadedEx()
@@ -83,11 +79,7 @@ using Transducers
         dim = 5
         ℓ = build_logdensityproblem(logp, dim, 2)
         ndraws_elbo = 100
-        rngs = if VERSION ≥ v"1.7"
-            [MersenneTwister(), Random.default_rng()]
-        else
-            [MersenneTwister()]
-        end
+        rngs = [MersenneTwister(), Random.default_rng()]
         x = randn(dim)
         seed = 38
         optimizer = Optim.LBFGS(; m=6)
diff --git a/test/test_utils.jl b/test/test_utils.jl
index f2120029..80cacdba 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -14,16 +14,10 @@ rand_pd_diag_mat(rng, T, n) = Diagonal(rand(rng, T, n))
 rand_pd_diag_mat(T, n) = rand_pd_diag_mat(Random.GLOBAL_RNG, T, n)
 
 # defined for testing purposes
-function Pathfinder.rand_and_logpdf(rng, dist, ndraws)
-    x = rand(rng, dist, ndraws)
-    if x isa AbstractVector
-        xmat = permutedims(x)
-        logpx = Distributions.logpdf.(dist, x)
-    else
-        xmat = x
-        logpx = Distributions.logpdf(dist, x)
-    end
-    return xmat, logpx
+# (generic fallback for univariate `dist` only: draws fill `x` elementwise)
+function Pathfinder.rand_and_logpdf!(rng, dist, x)
+    rand!(rng, dist, x)
+    logpx = Distributions.logpdf.(dist, vec(x))
+    return x, logpx
 end
 
 # banana distribution