
refactor interface for projections/proximal operators #147

Merged: 23 commits, Dec 30, 2024
Changes from 7 commits

Commits
238128e
refactor make scale projection operator its own optimization rule
Red-Portal Nov 17, 2024
03338d6
add docs for `ProjectScale`
Red-Portal Nov 17, 2024
233cffa
refactor change of type parameter order for `LocationScaleLowRank`
Red-Portal Nov 17, 2024
960d77d
apply formatter
Red-Portal Nov 17, 2024
db42115
apply formatter
Red-Portal Nov 17, 2024
6dd0fd6
apply formatter
Red-Portal Nov 17, 2024
074218a
update README
Red-Portal Nov 17, 2024
a11e5ce
Merge branch 'master' of github.com:TuringLang/AdvancedVI.jl into pro…
Red-Portal Dec 9, 2024
a3ce1d1
fix formatting
Red-Portal Dec 9, 2024
ee36164
fix outdated type parameters in `LocationScale`
Red-Portal Dec 10, 2024
cd35e4e
rename averaging function
Red-Portal Dec 24, 2024
f40df75
fix projection/proximal operator interface
Red-Portal Dec 24, 2024
97f64e1
update documentation
Red-Portal Dec 24, 2024
9f1a549
fix formatting
Red-Portal Dec 24, 2024
ebe0637
fix benchmark
Red-Portal Dec 24, 2024
dcf21db
add missing test file
Red-Portal Dec 24, 2024
7868317
fix documentation
Red-Portal Dec 24, 2024
04db344
fix documentation
Red-Portal Dec 24, 2024
f731bdc
fix ambiguous specialization error for `operate`
Red-Portal Dec 24, 2024
86e1ab3
update documentation
Red-Portal Dec 27, 2024
1b3b734
refactor `average` and `operate` to specializations of `apply`
Red-Portal Dec 29, 2024
9887bb4
Merge branch 'projected_proximal_location_scale' of github.com:Turing…
Red-Portal Dec 29, 2024
635ea4e
Update docs/src/optimization.md
yebai Dec 29, 2024
2 changes: 1 addition & 1 deletion README.md
@@ -109,7 +109,7 @@ q_avg, _, stats, _ = AdvancedVI.optimize(
q_transformed,
max_iter;
adtype=ADTypes.AutoForwardDiff(),
optimizer=Optimisers.Adam(1e-3),
optimizer=ProjectScale(Optimisers.Adam(1e-3)),
)

# Evaluate final ELBO with 10^3 Monte Carlo samples
2 changes: 1 addition & 1 deletion bench/benchmarks.jl
@@ -40,7 +40,7 @@ begin
]
max_iter = 10^4
d = LogDensityProblems.dimension(prob)
optimizer = Optimisers.Adam(T(1e-3))
optimizer = ProjectScale(Optimisers.Adam(T(1e-3)))

for (objname, obj) in [
("RepGradELBO", RepGradELBO(10)),
6 changes: 3 additions & 3 deletions docs/src/elbo/repgradelbo.md
@@ -219,7 +219,7 @@ _, _, stats_cfe, _ = AdvancedVI.optimize(
max_iter;
show_progress = false,
adtype = AutoForwardDiff(),
optimizer = Optimisers.Adam(3e-3),
optimizer = ProjectScale(Optimisers.Adam(3e-3)),
callback = callback,
);

@@ -230,7 +230,7 @@ _, _, stats_stl, _ = AdvancedVI.optimize(
max_iter;
show_progress = false,
adtype = AutoForwardDiff(),
optimizer = Optimisers.Adam(3e-3),
optimizer = ProjectScale(Optimisers.Adam(3e-3)),
callback = callback,
);

@@ -317,7 +317,7 @@ _, _, stats_qmc, _ = AdvancedVI.optimize(
max_iter;
show_progress = false,
adtype = AutoForwardDiff(),
optimizer = Optimisers.Adam(3e-3),
optimizer = ProjectScale(Optimisers.Adam(3e-3)),
callback = callback,
);

5 changes: 4 additions & 1 deletion docs/src/examples.md
@@ -118,11 +118,14 @@ q_avg_trans, q_trans, stats, _ = AdvancedVI.optimize(
n_max_iter;
show_progress=false,
adtype=AutoForwardDiff(),
optimizer=Optimisers.Adam(1e-3),
optimizer=ProjectScale(Optimisers.Adam(1e-3)),
);
nothing
```

`ProjectScale` is a wrapper around an optimization rule that keeps the variational approximation within a stable region of the variational family.
For more information, see [this section](@ref projectscale).

`q_avg_trans` is the final output of the optimization procedure.
If a parameter averaging strategy is used through the keyword argument `averager`, `q_avg_trans` is the output of the averaging strategy, while `q_trans` is the last iterate.

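Not part of the diff: a minimal usage sketch of the wrapper described above, based on the `ProjectScale(rule, scale_eps)` constructor introduced later in this PR (the second argument defaults to `1e-5`).

```julia
using Optimisers
using AdvancedVI

# Wrap any Optimisers.jl rule; the projection clamps the diagonal of the scale
# matrix after each update so it never drops below `scale_eps`.
optimizer       = ProjectScale(Optimisers.Adam(1e-3))        # default scale_eps = 1e-5
optimizer_tight = ProjectScale(Optimisers.Adam(1e-3), 1e-4)  # explicit lower bound

# Either value is passed to `AdvancedVI.optimize` through the `optimizer` keyword,
# exactly as in the README and examples diffs above.
```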
10 changes: 10 additions & 0 deletions docs/src/families.md
@@ -56,6 +56,16 @@ FullRankGaussian
MeanFieldGaussian
```

### [Scale Projection Operator](@id projectscale)

For location-scale families, optimization is often stable only when the smallest eigenvalue of the scale matrix is strictly positive[^D2020].
To ensure this, we provide the following wrapper around an optimization rule:

```@docs
ProjectScale
```

[^D2020]: Domke, J. (2020). Provable smoothness guarantees for black-box variational inference. In *International Conference on Machine Learning*.
### Gaussian Variational Families

```julia
23 changes: 21 additions & 2 deletions ext/AdvancedVIBijectorsExt.jl
@@ -16,6 +16,7 @@ else
end

function AdvancedVI.update_variational_params!(
proj::ProjectScale,
::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScale}},
opt_st,
params,
@@ -24,9 +25,8 @@ function AdvancedVI.update_variational_params!(
)
opt_st, params = Optimisers.update!(opt_st, params, grad)
q = restructure(params)
ϵ = q.dist.scale_eps
ϵ = proj.scale_eps

# Project the scale matrix to the set of positive definite triangular matrices
diag_idx = diagind(q.dist.scale)
@. q.dist.scale[diag_idx] = max(q.dist.scale[diag_idx], ϵ)

@@ -35,6 +35,25 @@ function AdvancedVI.update_variational_params!(
return opt_st, params
end

function AdvancedVI.update_variational_params!(
proj::ProjectScale,
::Type{<:Bijectors.TransformedDistribution{<:AdvancedVI.MvLocationScaleLowRank}},
opt_st,
params,
restructure,
grad,
)
opt_st, params = Optimisers.update!(opt_st, params, grad)
q = restructure(params)
ϵ = proj.scale_eps

@. q.dist.scale_diag = max(q.dist.scale_diag, ϵ)

params, _ = Optimisers.destructure(q)

return opt_st, params
end

function AdvancedVI.reparam_with_entropy(
rng::Random.AbstractRNG,
q::Bijectors.TransformedDistribution,
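The two methods above apply the post-update projection that motivates the `ProjectScale` docs in `docs/src/families.md`: after `Optimisers.update!`, the diagonal of the scale (or `scale_diag` for the low-rank family) is clamped from below. A standalone sketch of that clamp follows; `project_scale_diag!` is an illustrative helper, not an AdvancedVI function.

```julia
using LinearAlgebra

# Illustrative helper (not part of AdvancedVI): clamp the diagonal of a
# triangular scale factor so its eigenvalues stay at or above scale_eps.
function project_scale_diag!(scale::AbstractMatrix, scale_eps::Real)
    idx = diagind(scale)
    @. scale[idx] = max(scale[idx], scale_eps)
    return scale
end

L = LowerTriangular([0.5 0.0; -0.3 1e-8])  # factor with a nearly-zero diagonal entry
project_scale_diag!(L, 1e-5)               # the (2, 2) entry is now clamped to 1e-5
```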
13 changes: 7 additions & 6 deletions src/AdvancedVI.jl
@@ -63,16 +63,17 @@ restructure_ad_forward(::ADTypes.AbstractADType, restructure, params) = restruct

# Update for gradient descent step
"""
update_variational_params!(family_type, opt_st, params, restructure, grad)
update_variational_params!(rule, family_type, opt_st, params, restructure, grad)

Update variational distribution according to the update rule in the optimizer state `opt_st` and the variational family `family_type`.
Update variational distribution according to the update rule in the optimizer state `opt_st`, the optimizer given by `rule`, and the variational family type `family_type`.

This is a wrapper around `Optimisers.update!` to provide some indirection.
For example, depending on the optimizer and the variational family, this may do additional things such as applying projection or proximal mappings.
As with the default behavior of `Optimisers.update!`, `params` and `opt_st` may be mutated by the routine and are no longer valid after calling this function.
Instead, the return values should be used.

# Arguments
- `rule`: Optimization rule.
- `family_type::Type`: Type of the variational family `typeof(restructure(params))`.
- `opt_st`: Optimizer state returned by `Optimisers.setup`.
- `params`: Current set of parameters to be updated.
Expand All @@ -83,9 +84,9 @@ Instead, the return values should be used.
- `opt_st`: Updated optimizer state.
- `params`: Updated parameters.
"""
function update_variational_params! end

function update_variational_params!(::Type, opt_st, params, restructure, grad)
function update_variational_params!(
::Optimisers.AbstractRule, family_type, opt_st, params, restructure, grad
)
return Optimisers.update!(opt_st, params, grad)
end

@@ -186,7 +187,7 @@ include("objectives/elbo/repgradelbo.jl")
include("objectives/elbo/scoregradelbo.jl")

# Variational Families
export MvLocationScale, MeanFieldGaussian, FullRankGaussian
export MvLocationScale, MeanFieldGaussian, FullRankGaussian, ProjectScale

include("families/location_scale.jl")

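The docstring above describes `update_variational_params!` as an indirection point around `Optimisers.update!`. As a hypothetical sketch (the names `SoftThresholdRule` and `MyFamily` are illustrative and not part of AdvancedVI), a custom rule/family pair could hook into this interface by mirroring the pattern `ProjectScale` uses in this PR:

```julia
using Optimisers
using AdvancedVI

struct MyFamily end  # stand-in for a custom variational family type

# A rule that composes an inner Optimisers.jl rule with a proximal step.
struct SoftThresholdRule{R<:Optimisers.AbstractRule,T<:Real} <: Optimisers.AbstractRule
    rule::R
    shrinkage::T
end

# Delegate state handling to the wrapped rule, as ProjectScale does.
Optimisers.setup(p::SoftThresholdRule, x) = Optimisers.setup(p.rule, x)
Optimisers.init(p::SoftThresholdRule, x) = Optimisers.init(p.rule, x)

function AdvancedVI.update_variational_params!(
    prox::SoftThresholdRule, ::Type{<:MyFamily}, opt_st, params, restructure, grad
)
    # Plain gradient step first, then an L1 proximal map (soft-thresholding)
    # applied to the flat parameter vector.
    opt_st, params = Optimisers.update!(opt_st, params, grad)
    @. params = sign(params) * max(abs(params) - prox.shrinkage, 0)
    return opt_st, params
end
```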
84 changes: 39 additions & 45 deletions src/families/location_scale.jl
@@ -1,14 +1,6 @@

struct MvLocationScale{S,D<:ContinuousDistribution,L,E<:Real} <:
ContinuousMultivariateDistribution
location::L
scale::S
dist::D
scale_eps::E
end

"""
MvLocationScale(location, scale, dist; scale_eps)
MvLocationScale(location, scale, dist)

The location scale variational family broadly represents various variational
families using `location` and `scale` variational parameters.
@@ -20,21 +12,11 @@ represented as follows:
u = rand(dist, d)
z = scale*u + location
```

`scale_eps` sets a constraint on the smallest value of `scale` to be enforced during optimization.
This is necessary to guarantee stable convergence.

# Keyword Arguments
- `scale_eps`: Lower bound constraint for the diagonal of the scale. (default: `1e-4`).
"""
function MvLocationScale(
location::AbstractVector{T},
scale::AbstractMatrix{T},
dist::ContinuousUnivariateDistribution;
scale_eps::T=T(1e-4),
) where {T<:Real}
@assert minimum(diag(scale)) ≥ scale_eps "Initial scale is too small (smallest diagonal value is $(minimum(diag(scale)))). This might result in unstable optimization behavior."
return MvLocationScale(location, scale, dist, scale_eps)
struct MvLocationScale{S,D<:ContinuousDistribution,L} <: ContinuousMultivariateDistribution
location::L
scale::S
dist::D
end

Functors.@functor MvLocationScale (location, scale)
@@ -44,18 +26,18 @@ Functors.@functor MvLocationScale (location, scale)
# `scale <: Diagonal`, which is not the default behavior. Otherwise, forward-mode AD
# is very inefficient.
# begin
struct RestructureMeanField{S<:Diagonal,D,L,E}
model::MvLocationScale{S,D,L,E}
struct RestructureMeanField{S<:Diagonal,D,L}
model::MvLocationScale{S,D,L}
end

function (re::RestructureMeanField)(flat::AbstractVector)
n_dims = div(length(flat), 2)
location = first(flat, n_dims)
scale = Diagonal(last(flat, n_dims))
return MvLocationScale(location, scale, re.model.dist, re.model.scale_eps)
return MvLocationScale(location, scale, re.model.dist)
end

function Optimisers.destructure(q::MvLocationScale{<:Diagonal,D,L,E}) where {D,L,E}
function Optimisers.destructure(q::MvLocationScale{<:Diagonal,D,L}) where {D,L}
@unpack location, scale, dist = q
flat = vcat(location, diag(scale))
return flat, RestructureMeanField(q)
@@ -66,7 +48,7 @@ Base.length(q::MvLocationScale) = length(q.location)

Base.size(q::MvLocationScale) = size(q.location)

Base.eltype(::Type{<:MvLocationScale{S,D,L,E}}) where {S,D,L,E} = eltype(D)
Base.eltype(::Type{<:MvLocationScale{S,D,L}}) where {S,D,L} = eltype(D)

function StatsBase.entropy(q::MvLocationScale)
@unpack location, scale, dist = q
@@ -131,49 +113,61 @@ function Distributions.cov(q::MvLocationScale)
end

"""
FullRankGaussian(μ, L; scale_eps)
FullRankGaussian(μ, L)

Construct a Gaussian variational approximation with a dense covariance matrix.

# Arguments
- `μ::AbstractVector{T}`: Mean of the Gaussian.
- `L::LinearAlgebra.AbstractTriangular{T}`: Cholesky factor of the covariance of the Gaussian.

# Keyword Arguments
- `scale_eps`: Smallest value allowed for the diagonal of the scale. (default: `1e-4`).
"""
function FullRankGaussian(
μ::AbstractVector{T}, L::LinearAlgebra.AbstractTriangular{T}; scale_eps::T=T(1e-4)
μ::AbstractVector{T}, L::LinearAlgebra.AbstractTriangular{T}
) where {T<:Real}
q_base = Normal{T}(zero(T), one(T))
return MvLocationScale(μ, L, q_base, scale_eps)
return MvLocationScale(μ, L, Normal{T}(zero(T), one(T)))
end

"""
MeanFieldGaussian(μ, L; scale_eps)
MeanFieldGaussian(μ, L)

Construct a Gaussian variational approximation with a diagonal covariance matrix.

# Arguments
- `μ::AbstractVector{T}`: Mean of the Gaussian.
- `L::Diagonal{T}`: Diagonal Cholesky factor of the covariance of the Gaussian.
"""
function MeanFieldGaussian(μ::AbstractVector{T}, L::Diagonal{T}) where {T<:Real}
return MvLocationScale(μ, L, Normal{T}(zero(T), one(T)))
end

# Keyword Arguments
- `scale_eps`: Smallest value allowed for the diagonal of the scale. (default: `1e-4`).
"""
function MeanFieldGaussian(
μ::AbstractVector{T}, L::Diagonal{T}; scale_eps::T=T(1e-4)
) where {T<:Real}
q_base = Normal{T}(zero(T), one(T))
return MvLocationScale(μ, L, q_base, scale_eps)
ProjectScale(rule, scale_eps)

Compose an optimization `rule` with a projection, where the projection ensures that the scale of an `MvLocationScale` or `MvLocationScaleLowRank` has eigenvalues of at least `scale_eps`.

# Arguments
- `rule::Optimisers.AbstractRule`: Optimization rule to compose with the projection.
- `scale_eps::Real`: Lower bound enforced by the projection on the eigenvalues of the scale matrix.
"""
struct ProjectScale{Rule<:Optimisers.AbstractRule,F<:Real} <: Optimisers.AbstractRule
rule::Rule
scale_eps::F
end

function ProjectScale(rule, scale_eps::Real=1e-5)
return ProjectScale{typeof(rule),typeof(scale_eps)}(rule, scale_eps)
end

Optimisers.setup(proj::ProjectScale, x) = Optimisers.setup(proj.rule, x)

Optimisers.init(proj::ProjectScale, x) = Optimisers.init(proj.rule, x)

function update_variational_params!(
::Type{<:MvLocationScale}, opt_st, params, restructure, grad
proj::ProjectScale, ::Type{<:MvLocationScale}, opt_st, params, restructure, grad
)
opt_st, params = Optimisers.update!(opt_st, params, grad)
q = restructure(params)
ϵ = q.scale_eps
ϵ = convert(eltype(params), proj.scale_eps)

# Project the scale matrix to the set of positive definite triangular matrices
diag_idx = diagind(q.scale)
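Putting the refactor together, here is a short sketch (not part of the diff) of the new construction pattern: the `scale_eps` keyword is gone from the family constructors, and the lower bound now travels with the `ProjectScale` rule instead.

```julia
using LinearAlgebra
using Optimisers
using AdvancedVI

d = 5

# Families no longer take a scale_eps keyword argument.
q_full = FullRankGaussian(zeros(d), LowerTriangular(Matrix{Float64}(I, d, d)))
q_mf   = MeanFieldGaussian(zeros(d), Diagonal(ones(d)))

# The lower bound on the scale is now carried by the optimization rule.
rule = ProjectScale(Optimisers.Adam(1e-3), 1e-4)
```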