Skip to content

Adding RDoG #502

@Nimrais

Description

@Nimrais

I recently experimented in my internal lab with the RDoG optimizer https://arxiv.org/pdf/2406.02296 (more methods are proposed in the paper; I just have not had time to try them all), which is basically a Riemannian generalization of the DoG optimizer https://arxiv.org/abs/2302.12022. First, it is very easy to understand, and second, it is not computationally intensive compared to Armijo. My cost functions are usually very expensive to evaluate—surprisingly, sometimes gradients are comparatively cheap (at least the stochastic gradients)—so it would be nice to have something that is not prohibitively slow but at the same time free of a learning-rate schedule. If you find it valuable to have in the repo, I can file a PR.

My current implementation is the following one:

@doc raw"""
    RDoGStepsize{R<:Real} <: Stepsize

Stateful stepsize for the Riemannian Distance-over-Gradients (RDoG) rule,
a learning-rate-free stepsize that tracks the maximal distance travelled
from the initial point and the accumulated squared gradient norms.

# Fields

* `initial_distance::R`: initial distance estimate ``ε``; also the starting
  value for `max_distance`
* `max_distance::R`: running maximum of the distance from `initial_point`
  to the iterates seen so far
* `gradient_sum::R`: running sum of squared gradient norms
* `initial_point::P`: the point the run started from; distances are measured
  from here
* `use_curvature::Bool`: whether to apply the sectional-curvature correction
  term when computing the stepsize
* `sectional_curvature_bound::R`: (lower) bound on the sectional curvature
  used by the correction term
* `last_stepsize::R`: the most recently computed stepsize; `NaN` before the
  first evaluation
"""
mutable struct RDoGStepsize{R<:Real,P} <: Stepsize
    initial_distance::R
    max_distance::R
    gradient_sum::R
    initial_point::P
    use_curvature::Bool
    sectional_curvature_bound::R
    last_stepsize::R
end

"""
    RDoGStepsize(M::AbstractManifold; kwargs...)

Construct an [`RDoGStepsize`](@ref) for the manifold `M`.

# Keyword arguments

* `p=rand(M)`: the point to store as the initial point
* `initial_distance=1e-3`: initial distance estimate ``ε``
* `use_curvature=false`: whether to use the sectional-curvature correction
* `sectional_curvature_bound=0.0`: (lower) bound on the sectional curvature
"""
function RDoGStepsize(
    M::AbstractManifold;
    p=rand(M),
    initial_distance::Real=1e-3,
    use_curvature::Bool=false,
    sectional_curvature_bound::Real=0.0,
)
    # Promote both real-valued keywords to one common float type: the previous
    # shared `::R` annotation raised a MethodError for, e.g., a Float32
    # `initial_distance` combined with the Float64 default curvature bound,
    # and the NaN sentinel below must be representable in the field type.
    d, κ = promote(float(initial_distance), float(sectional_curvature_bound))
    R = typeof(d)
    return RDoGStepsize{R,typeof(p)}(
        d,           # initial_distance (ε)
        d,           # max_distance starts at the initial distance estimate
        zero(R),     # gradient_sum starts at 0
        copy(M, p),  # store a copy of the initial point
        use_curvature,
        κ,
        R(NaN),      # last_stepsize: sentinel until the first evaluation
    )
end

"""
    geometric_curvature_function(κ::Real, d::Real)
"""
function geometric_curvature_function::Real, d::Real)
    if κ < 0 && d > 0
        sqrt_abs_κ = sqrt(abs(κ))
        arg = sqrt_abs_κ * d
        # Avoid numerical issues for small arguments
        if arg < 1e-8
            return 1.0 + arg^2 / 3.0  # Taylor expansion
        else
            return arg / tanh(arg)
        end
    else
        return 1.0
    end
end

"""
    (rdog::RDoGStepsize)(mp, s, k, args...; gradient=nothing, kwargs...)

Evaluate the RDoG stepsize at iteration `k`. On the first call (`k == 0`) the
internal state is (re)initialized from the current iterate; afterwards the
squared gradient norms are accumulated and the maximal distance to the
initial point is tracked.
"""
function (rdog::RDoGStepsize)(
    mp::AbstractManoptProblem,
    s::AbstractManoptSolverState,
    k,
    args...;
    gradient=nothing,
    kwargs...,
)
    M = get_manifold(mp)
    q = get_iterate(s)
    X = isnothing(gradient) ? get_gradient(mp, q) : gradient
    X_norm2 = norm(M, q, X)^2
    if k == 0
        # First call: reset all running state to the current iterate.
        rdog.gradient_sum = X_norm2
        rdog.initial_point = copy(M, q)
        rdog.max_distance = rdog.initial_distance
        numerator = rdog.initial_distance
        # Guard against a zero gradient at the starting point.
        denominator2 = max(X_norm2, eps())
    else
        # Accumulate squared gradient norms and track the farthest distance
        # travelled from the initial point so far.
        rdog.gradient_sum += X_norm2
        rdog.max_distance = max(rdog.max_distance, distance(M, rdog.initial_point, q))
        numerator = rdog.max_distance
        denominator2 = rdog.gradient_sum
    end
    # Curvature correction factor; ζ = 1.0 disables it.
    ζ = if rdog.use_curvature
        geometric_curvature_function(rdog.sectional_curvature_bound, rdog.max_distance)
    else
        1.0
    end
    step = numerator / (sqrt(ζ) * sqrt(denominator2))
    rdog.last_stepsize = step
    return step
end

# Both accessors report the most recently computed stepsize; before the first
# evaluation this is the NaN sentinel set by the constructor.
# NOTE(review): `get_initial_stepsize` returning the *last* stepsize appears
# intentional (RDoG has no fixed initial stepsize) — confirm against the
# Manopt `Stepsize` interface.
get_initial_stepsize(rdog::RDoGStepsize) = rdog.last_stepsize
get_last_stepsize(rdog::RDoGStepsize) = rdog.last_stepsize

"""
    show(io::IO, rdog::RDoGStepsize)

Print a multi-line summary of `rdog`: its construction parameters followed by
its current mutable state (`max_distance`, `gradient_sum`, `last_stepsize`).
"""
# NOTE(review): this extends `show` — presumably `Base.show` via an
# `import Base: show` at the top of the file; confirm, otherwise qualify as
# `Base.show`.
function show(io::IO, rdog::RDoGStepsize)
    s = """
    RDoGStepsize(;
      initial_distance=$(rdog.initial_distance),
      use_curvature=$(rdog.use_curvature),
      sectional_curvature_bound=$(rdog.sectional_curvature_bound)
    )
    
    Current state:
      max_distance = $(rdog.max_distance)
      gradient_sum = $(rdog.gradient_sum)
      last_stepsize = $(rdog.last_stepsize)
    """
    return print(io, s)
end

"""
    RDoG(; kwargs...)
    RDoG(M::AbstractManifold; kwargs...)

Creates a factory for the [`RDoGStepsize`](@ref).

## Keyword arguments

* `initial_distance=1e-3`: initial distance estimate
* `use_curvature=false`: whether to use sectional curvature
* `sectional_curvature_bound=0.0`: lower bound on sectional curvature

$(_note(:ManifoldDefaultFactory, "RDoGStepsize"))
"""
function RDoG(args...; kwargs...)
    return ManifoldDefaultsFactory(Manopt.RDoGStepsize, args...; kwargs...)
end

I forked the repo and made an example for the Rayleigh quotient https://github.com/Nimrais/Manopt.jl/blob/master/examples/rdog_example.jl.

If you run it, you will obtain the following result.

True optimal value (negative largest eigenvalue): -103.56226794615476
RDoG Example: Rayleigh Quotient Minimization
==================================================
Manifold: Sphere(29)
True optimal value: -103.56226794615476
Initial cost: -26.050530058561083
Initial optimality gap: 77.51173788759368

1. Basic RDoG (no curvature)
------------------------------
Initial  | f(x) = -26.050530058561 |  | # 50     | f(x) = -108.764589219706 | ||grad f|| = 5.46144063e+00 | step = 7.14544209e-03
# 100    | f(x) = -111.794090201414 | ||grad f|| = 1.94207092e+01 | step = 4.83908048e-03
# 150    | f(x) = -105.322098782001 | ||grad f|| = 3.73581804e+00 | step = 4.75430550e-03
# 200    | f(x) = -103.914644314998 | ||grad f|| = 7.31942441e-01 | step = 4.75104968e-03
# 250    | f(x) = -103.631999324778 | ||grad f|| = 1.44217419e-01 | step = 4.75092292e-03
# 300    | f(x) = -103.576035552189 | ||grad f|| = 2.84495965e-02 | step = 4.75091798e-03
# 350    | f(x) = -103.564984971476 | ||grad f|| = 5.61355494e-03 | step = 4.75091779e-03
# 400    | f(x) = -103.562804101333 | ||grad f|| = 1.10769530e-03 | step = 4.75091778e-03
# 450    | f(x) = -103.562373744711 | ||grad f|| = 2.18578122e-04 | step = 4.75091778e-03
# 500    | f(x) = -103.562288823125 | ||grad f|| = 4.31314309e-05 | step = 4.75091778e-03
# 550    | f(x) = -103.562272065753 | ||grad f|| = 8.51101171e-06 | step = 4.75091778e-03
# 600    | f(x) = -103.562268759064 | ||grad f|| = 1.67945553e-06 | step = 4.75091778e-03
Final cost: -103.56226842977263
Final optimality gap: -4.836178675304836e-7
Gap reduction factor: 1.6027476049095297e8

2. RDoG with curvature
------------------------------
Initial  | f(x) = -26.050530058561 |  | # 50     | f(x) = -72.095369960602 | ||grad f|| = 3.70251339e+01 | step = 5.96437367e-03
# 100    | f(x) = -99.259701666648 | ||grad f|| = 8.38849476e+00 | step = 4.75172205e-03
# 150    | f(x) = -103.000821847999 | ||grad f|| = 1.15867074e+00 | step = 4.73360379e-03
# 200    | f(x) = -103.486796766971 | ||grad f|| = 1.56925069e-01 | step = 4.73327489e-03
# 250    | f(x) = -103.552080484853 | ||grad f|| = 2.12039156e-02 | step = 4.73326889e-03
# 300    | f(x) = -103.560892014525 | ||grad f|| = 2.86421892e-03 | step = 4.73326878e-03
# 350    | f(x) = -103.562082096827 | ||grad f|| = 3.86881848e-04 | step = 4.73326878e-03
# 400    | f(x) = -103.562242842924 | ||grad f|| = 5.22574324e-05 | step = 4.73326878e-03
# 450    | f(x) = -103.562264555380 | ||grad f|| = 7.05858209e-06 | step = 4.73326878e-03
Final cost: -103.5622684228676
Final optimality gap: -4.767128416460764e-7
Gap reduction factor: 1.6259628672881514e8

3. Fixed stepsize for comparison
------------------------------
Initial  | f(x) = -26.050530058561 |  | # 50     | f(x) = -12.164350964363 | ||grad f|| = 6.14247524e+01 | step = 1.00000000e-02
# 100    | f(x) = -154.102934108739 | ||grad f|| = 6.36359713e+01 | step = 1.00000000e-02
# 150    | f(x) = -27.826871769201 | ||grad f|| = 1.21849235e+02 | step = 1.00000000e-02
# 200    | f(x) = -142.833567182374 | ||grad f|| = 7.66632630e+01 | step = 1.00000000e-02
# 250    | f(x) = -11.489914079089 | ||grad f|| = 6.21234688e+01 | step = 1.00000000e-02
# 300    | f(x) = -73.826859643045 | ||grad f|| = 6.10347964e+01 | step = 1.00000000e-02
# 350    | f(x) = -47.379687572027 | ||grad f|| = 4.76704604e+01 | step = 1.00000000e-02
# 400    | f(x) = -129.266585427988 | ||grad f|| = 1.80502748e+01 | step = 1.00000000e-02
# 450    | f(x) = -145.390758234062 | ||grad f|| = 7.51024945e+01 | step = 1.00000000e-02
# 500    | f(x) = -32.820973565456 | ||grad f|| = 1.24955303e+02 | step = 1.00000000e-02
# 550    | f(x) = -90.420049394265 | ||grad f|| = 8.14205164e+00 | step = 1.00000000e-02
# 600    | f(x) = -143.388053266504 | ||grad f|| = 7.63607133e+01 | step = 1.00000000e-02
# 650    | f(x) = -77.428020925510 | ||grad f|| = 1.59325019e+01 | step = 1.00000000e-02
# 700    | f(x) = -13.069167334947 | ||grad f|| = 6.04488467e+01 | step = 1.00000000e-02
# 750    | f(x) = -15.525608524218 | ||grad f|| = 1.12794817e+02 | step = 1.00000000e-02
# 800    | f(x) = -138.759517588186 | ||grad f|| = 2.63777228e+01 | step = 1.00000000e-02
# 850    | f(x) = -71.881686168002 | ||grad f|| = 6.01177083e+01 | step = 1.00000000e-02
# 900    | f(x) = -133.778383487263 | ||grad f|| = 2.18157716e+01 | step = 1.00000000e-02
# 950    | f(x) = -125.908532239189 | ||grad f|| = 7.95445936e+01 | step = 1.00000000e-02
# 1000   | f(x) = -114.380405799096 | ||grad f|| = 7.72801332e+01 | step = 1.00000000e-02
Final cost: -114.38040579909566
Final optimality gap: -10.818137852940893
Gap reduction factor: 7.164979679614846

4. Sensitivity to initial distance
------------------------------
Running shorter tests (200 iterations) with different initial distances:
ε = 1.0e-5: Final gap = 0.2792590185977133, Reduction = 277.56216532169816x
ε = 0.001: Final gap = -0.037547838633443575, Reduction = 2064.3461969753585x
ε = 0.1: Final gap = 0.30871421932334897, Reduction = 251.07926048073432x

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions