-
Notifications
You must be signed in to change notification settings - Fork 46
Description
I recently played around in my internal lab with the RDoG optimizer https://arxiv.org/pdf/2406.02296 (more methods are proposed in the paper; I just do not have time to try them all), which is basically a Riemannian generalization of the DoG optimizer https://arxiv.org/abs/2302.12022. First, it is very easy to understand, and second, it is not computationally intensive compared to Armijo. My cost functions are usually very expensive to evaluate—surprisingly, the gradients are sometimes comparatively cheap (at least the stochastic gradients)—so it would be nice to have something that is not prohibitively slow but at the same time free of learning-rate schedules. If you find it valuable to have this in the repo, I can file a PR.
My current implementation is the following one:
@doc raw"""
    RDoGStepsize{R<:Real,P} <: Stepsize

State for the Riemannian Distance-over-Gradients (RDoG) step size, a
learning-rate-free step size rule. The step size at iteration ``k`` is
(up to an optional curvature correction)

```math
η_k = \frac{\bar r_k}{\sqrt{\sum_{i \le k} \lVert \operatorname{grad} f(p_i) \rVert^2}},
```

where ``\bar r_k`` is the maximal distance of any iterate so far from the
initial point.

# Fields

* `initial_distance`: initial (small) estimate ``r_ε`` of the distance to the optimizer
* `max_distance`: running maximum distance from `initial_point` to any iterate seen
* `gradient_sum`: running sum of squared gradient norms
* `initial_point`: the starting point, stored to measure distances against
* `use_curvature`: whether to apply the curvature correction ``ζ_κ``
* `sectional_curvature_bound`: lower bound ``κ`` on the sectional curvature, used when `use_curvature` is set
* `last_stepsize`: the most recently computed step size (`NaN` before the first call)
"""
mutable struct RDoGStepsize{R<:Real,P} <: Stepsize
    initial_distance::R
    max_distance::R
    gradient_sum::R
    initial_point::P
    use_curvature::Bool
    sectional_curvature_bound::R
    last_stepsize::R
end
"""
    RDoGStepsize(M::AbstractManifold; kwargs...)

Construct an [`RDoGStepsize`](@ref) for the manifold `M`.

# Keyword arguments

* `p=rand(M)`: a point on `M`, only used to fix the point type `P`
* `initial_distance=1e-3`: initial distance estimate ``r_ε``; its type fixes `R`
* `use_curvature=false`: whether to apply the curvature correction
* `sectional_curvature_bound=zero(initial_distance)`: lower bound on the
  sectional curvature; converted to `R`, so it no longer has to be passed in
  exactly the type of `initial_distance`
"""
function RDoGStepsize(
    M::AbstractManifold;
    p=rand(M),
    initial_distance::R=1e-3,
    use_curvature::Bool=false,
    # `::Real` (not `::R`) so the Float64 default does not clash with, e.g.,
    # a Float32 `initial_distance` when the bound is left unspecified.
    sectional_curvature_bound::Real=zero(initial_distance),
) where {R<:Real}
    return RDoGStepsize{R,typeof(p)}(
        initial_distance,
        initial_distance,                   # max_distance starts at initial_distance
        zero(R),                            # gradient_sum starts at 0
        copy(M, p),                         # store initial point
        use_curvature,
        convert(R, sectional_curvature_bound),  # unify eltype with R
        R(NaN),                             # last_stepsize: nothing computed yet
    )
end
"""
geometric_curvature_function(κ::Real, d::Real)
"""
function geometric_curvature_function(κ::Real, d::Real)
if κ < 0 && d > 0
sqrt_abs_κ = sqrt(abs(κ))
arg = sqrt_abs_κ * d
# Avoid numerical issues for small arguments
if arg < 1e-8
return 1.0 + arg^2 / 3.0 # Taylor expansion
else
return arg / tanh(arg)
end
else
return 1.0
end
end
"""
    (rdog::RDoGStepsize)(mp::AbstractManoptProblem, s::AbstractManoptSolverState, i, args...; gradient=nothing, kwargs...)

Compute the RDoG step size at the current iterate of `s`.

On the first call (`i == 0`) the running state (initial point, gradient sum,
maximal distance) is (re)initialized. On subsequent calls the squared gradient
norm is accumulated and the maximal distance from the initial point updated;
the step size is `max_distance / sqrt(gradient_sum)`, optionally divided by
`sqrt(ζ)` where `ζ` is the curvature correction of
[`geometric_curvature_function`](@ref).

A `gradient` keyword can be passed to reuse an already evaluated gradient
instead of calling `get_gradient`.
"""
function (rdog::RDoGStepsize)(
    mp::AbstractManoptProblem,
    s::AbstractManoptSolverState,
    i,
    args...;
    gradient=nothing,
    kwargs...,
)
    M = get_manifold(mp)
    p = get_iterate(s)
    grad = isnothing(gradient) ? get_gradient(mp, p) : gradient
    grad_norm_sq = norm(M, p, grad)^2
    if i == 0
        # First call: (re)initialize the running state.
        rdog.gradient_sum = grad_norm_sq
        rdog.initial_point = copy(M, p)
        rdog.max_distance = rdog.initial_distance
        numerator = rdog.initial_distance
    else
        rdog.gradient_sum += grad_norm_sq
        rdog.max_distance = max(rdog.max_distance, distance(M, rdog.initial_point, p))
        numerator = rdog.max_distance
    end
    # Guard against division by zero when all gradients so far vanished
    # (the original only guarded the i == 0 branch, yielding Inf/NaN later).
    denominator = sqrt(max(rdog.gradient_sum, eps()))
    if rdog.use_curvature
        ζ = geometric_curvature_function(rdog.sectional_curvature_bound, rdog.max_distance)
        denominator *= sqrt(ζ)
    end
    stepsize = numerator / denominator
    rdog.last_stepsize = stepsize
    return stepsize
end
"""
    get_initial_stepsize(rdog::RDoGStepsize)

Return the most recently computed step size (`NaN` until the functor has run).
"""
function get_initial_stepsize(rdog::RDoGStepsize)
    return rdog.last_stepsize
end

"""
    get_last_stepsize(rdog::RDoGStepsize)

Return the most recently computed step size (`NaN` until the functor has run).
"""
function get_last_stepsize(rdog::RDoGStepsize)
    return rdog.last_stepsize
end
# Pretty-print the configuration (constructor keywords) and the mutable
# running state of an RDoGStepsize.
# NOTE(review): this extends `show` unqualified — assumes `Base.show` is
# imported in the enclosing module (as Manopt.jl does); confirm, otherwise
# prefix with `Base.`.
function show(io::IO, rdog::RDoGStepsize)
    s = """
    RDoGStepsize(;
    initial_distance=$(rdog.initial_distance),
    use_curvature=$(rdog.use_curvature),
    sectional_curvature_bound=$(rdog.sectional_curvature_bound)
    )
    Current state:
    max_distance = $(rdog.max_distance)
    gradient_sum = $(rdog.gradient_sum)
    last_stepsize = $(rdog.last_stepsize)
    """
    return print(io, s)
end
"""
RDoG(; kwargs...)
RDoG(M::AbstractManifold; kwargs...)
Creates a factory for the [`RDoGStepsize`](@ref).
## Keyword arguments
* `initial_distance=1e-3`: initial distance estimate
* `use_curvature=false`: whether to use sectional curvature
* `sectional_curvature_bound=0.0`: lower bound on sectional curvature
$(_note(:ManifoldDefaultFactory, "RDoGStepsize"))
"""
function RDoG(args...; kwargs...)
return ManifoldDefaultsFactory(Manopt.RDoGStepsize, args...; kwargs...)
end
I forked the repo and made an example for the Rayleigh quotient https://github.com/Nimrais/Manopt.jl/blob/master/examples/rdog_example.jl.
If you run it, you will obtain the following result.
True optimal value (negative largest eigenvalue): -103.56226794615476
RDoG Example: Rayleigh Quotient Minimization
==================================================
Manifold: Sphere(29)
True optimal value: -103.56226794615476
Initial cost: -26.050530058561083
Initial optimality gap: 77.51173788759368
1. Basic RDoG (no curvature)
------------------------------
Initial | f(x) = -26.050530058561 | | # 50 | f(x) = -108.764589219706 | ||grad f|| = 5.46144063e+00 | step = 7.14544209e-03
# 100 | f(x) = -111.794090201414 | ||grad f|| = 1.94207092e+01 | step = 4.83908048e-03
# 150 | f(x) = -105.322098782001 | ||grad f|| = 3.73581804e+00 | step = 4.75430550e-03
# 200 | f(x) = -103.914644314998 | ||grad f|| = 7.31942441e-01 | step = 4.75104968e-03
# 250 | f(x) = -103.631999324778 | ||grad f|| = 1.44217419e-01 | step = 4.75092292e-03
# 300 | f(x) = -103.576035552189 | ||grad f|| = 2.84495965e-02 | step = 4.75091798e-03
# 350 | f(x) = -103.564984971476 | ||grad f|| = 5.61355494e-03 | step = 4.75091779e-03
# 400 | f(x) = -103.562804101333 | ||grad f|| = 1.10769530e-03 | step = 4.75091778e-03
# 450 | f(x) = -103.562373744711 | ||grad f|| = 2.18578122e-04 | step = 4.75091778e-03
# 500 | f(x) = -103.562288823125 | ||grad f|| = 4.31314309e-05 | step = 4.75091778e-03
# 550 | f(x) = -103.562272065753 | ||grad f|| = 8.51101171e-06 | step = 4.75091778e-03
# 600 | f(x) = -103.562268759064 | ||grad f|| = 1.67945553e-06 | step = 4.75091778e-03
Final cost: -103.56226842977263
Final optimality gap: -4.836178675304836e-7
Gap reduction factor: 1.6027476049095297e8
2. RDoG with curvature
------------------------------
Initial | f(x) = -26.050530058561 | | # 50 | f(x) = -72.095369960602 | ||grad f|| = 3.70251339e+01 | step = 5.96437367e-03
# 100 | f(x) = -99.259701666648 | ||grad f|| = 8.38849476e+00 | step = 4.75172205e-03
# 150 | f(x) = -103.000821847999 | ||grad f|| = 1.15867074e+00 | step = 4.73360379e-03
# 200 | f(x) = -103.486796766971 | ||grad f|| = 1.56925069e-01 | step = 4.73327489e-03
# 250 | f(x) = -103.552080484853 | ||grad f|| = 2.12039156e-02 | step = 4.73326889e-03
# 300 | f(x) = -103.560892014525 | ||grad f|| = 2.86421892e-03 | step = 4.73326878e-03
# 350 | f(x) = -103.562082096827 | ||grad f|| = 3.86881848e-04 | step = 4.73326878e-03
# 400 | f(x) = -103.562242842924 | ||grad f|| = 5.22574324e-05 | step = 4.73326878e-03
# 450 | f(x) = -103.562264555380 | ||grad f|| = 7.05858209e-06 | step = 4.73326878e-03
Final cost: -103.5622684228676
Final optimality gap: -4.767128416460764e-7
Gap reduction factor: 1.6259628672881514e8
3. Fixed stepsize for comparison
------------------------------
Initial | f(x) = -26.050530058561 | | # 50 | f(x) = -12.164350964363 | ||grad f|| = 6.14247524e+01 | step = 1.00000000e-02
# 100 | f(x) = -154.102934108739 | ||grad f|| = 6.36359713e+01 | step = 1.00000000e-02
# 150 | f(x) = -27.826871769201 | ||grad f|| = 1.21849235e+02 | step = 1.00000000e-02
# 200 | f(x) = -142.833567182374 | ||grad f|| = 7.66632630e+01 | step = 1.00000000e-02
# 250 | f(x) = -11.489914079089 | ||grad f|| = 6.21234688e+01 | step = 1.00000000e-02
# 300 | f(x) = -73.826859643045 | ||grad f|| = 6.10347964e+01 | step = 1.00000000e-02
# 350 | f(x) = -47.379687572027 | ||grad f|| = 4.76704604e+01 | step = 1.00000000e-02
# 400 | f(x) = -129.266585427988 | ||grad f|| = 1.80502748e+01 | step = 1.00000000e-02
# 450 | f(x) = -145.390758234062 | ||grad f|| = 7.51024945e+01 | step = 1.00000000e-02
# 500 | f(x) = -32.820973565456 | ||grad f|| = 1.24955303e+02 | step = 1.00000000e-02
# 550 | f(x) = -90.420049394265 | ||grad f|| = 8.14205164e+00 | step = 1.00000000e-02
# 600 | f(x) = -143.388053266504 | ||grad f|| = 7.63607133e+01 | step = 1.00000000e-02
# 650 | f(x) = -77.428020925510 | ||grad f|| = 1.59325019e+01 | step = 1.00000000e-02
# 700 | f(x) = -13.069167334947 | ||grad f|| = 6.04488467e+01 | step = 1.00000000e-02
# 750 | f(x) = -15.525608524218 | ||grad f|| = 1.12794817e+02 | step = 1.00000000e-02
# 800 | f(x) = -138.759517588186 | ||grad f|| = 2.63777228e+01 | step = 1.00000000e-02
# 850 | f(x) = -71.881686168002 | ||grad f|| = 6.01177083e+01 | step = 1.00000000e-02
# 900 | f(x) = -133.778383487263 | ||grad f|| = 2.18157716e+01 | step = 1.00000000e-02
# 950 | f(x) = -125.908532239189 | ||grad f|| = 7.95445936e+01 | step = 1.00000000e-02
# 1000 | f(x) = -114.380405799096 | ||grad f|| = 7.72801332e+01 | step = 1.00000000e-02
Final cost: -114.38040579909566
Final optimality gap: -10.818137852940893
Gap reduction factor: 7.164979679614846
4. Sensitivity to initial distance
------------------------------
Running shorter tests (200 iterations) with different initial distances:
ε = 1.0e-5: Final gap = 0.2792590185977133, Reduction = 277.56216532169816x
ε = 0.001: Final gap = -0.037547838633443575, Reduction = 2064.3461969753585x
ε = 0.1: Final gap = 0.30871421932334897, Reduction = 251.07926048073432x