@@ -1687,3 +1687,218 @@ end
# Return the stepsize stored by the last evaluation of the Wolfe-Powell binary
# linesearch; any further positional arguments are accepted and ignored.
get_last_stepsize(step::WolfePowellBinaryLinesearchStepsize, ::Any...) = step.last_stepsize
1690+
@doc raw"""
    DistanceOverGradientsStepsize{R<:Real,P} <: Stepsize

Mutable state of the Riemannian Distance over Gradients (RDoG) stepsize.
`R` is the number type of all scalar fields, `P` the type of the stored start point.

# Fields

* `initial_distance::R`: initial distance estimate ``ϵ>0``
* `max_distance::R`: tracked maximum distance ``\bar r_t``
* `gradient_sum::R`: accumulated sum ``G_t``
* `initial_point::P`: stored start point ``p_0``
* `use_curvature::Bool`: toggle curvature correction ``ζ_κ``
* `sectional_curvature_bound::R`: lower bound ``κ`` used in ``ζ_κ`` when `use_curvature=true`
* `last_stepsize::R`: last computed stepsize

# Constructor

    DistanceOverGradientsStepsize(M::AbstractManifold; kwargs...)

## Keyword arguments

* `initial_distance=1e-3`: initial estimate ``ϵ``
* `use_curvature=false`: whether to use ``ζ_κ``
* `sectional_curvature_bound=0.0`: lower curvature bound ``κ`` (if known)
* `p`: initial point, used to track distance

# References

[DoddSharrockNemeth:2024](@cite): Learning-Rate-Free Stochastic Optimization over
Riemannian Manifolds (RDoG).
"""
mutable struct DistanceOverGradientsStepsize{R <: Real, P} <: Stepsize
    # user-provided estimate ϵ; also the value `max_distance` is reset to
    initial_distance::R
    # running maximum of the distance to `initial_point`
    max_distance::R
    # running sum of squared gradient norms
    gradient_sum::R
    # reference point the distances are measured from
    initial_point::P
    # whether the stepsize is divided by sqrt(ζ_κ(max_distance))
    use_curvature::Bool
    # curvature bound κ handed to `geometric_curvature_function`
    sectional_curvature_bound::R
    # result of the most recent stepsize evaluation
    last_stepsize::R
end
1729+
# Keyword constructor for the RDoG stepsize state on the manifold `M`.
#
# * `p`: point whose copy is stored as the initial point (defaults to `rand(M)`)
# * `initial_distance`: initial distance estimate ϵ
# * `use_curvature`: whether the curvature correction ζ_κ is applied
# * `sectional_curvature_bound`: curvature bound κ used by ζ_κ
#
# The scalar type `R` of the state is the floating-point type obtained from
# promoting the types of `initial_distance` and `sectional_curvature_bound`.
function DistanceOverGradientsStepsize(
        M::AbstractManifold;
        p = rand(M),
        initial_distance::R1 = 1.0e-3,
        use_curvature::Bool = false,
        sectional_curvature_bound::R2 = 0.0,
    ) where {R1 <: Real, R2 <: Real}
    # Force a floating-point type: `last_stepsize` starts as `NaN` and the stepsize
    # evaluation uses `eps(R)`; both fail for integer `R` (for example when both
    # keywords are passed as integer literals).
    R = float(promote_type(R1, R2))
    ϵ = convert(R, initial_distance)
    κ = convert(R, sectional_curvature_bound)
    # Store a private copy so later changes to `p` by the caller do not move the
    # reference point; use the type of the copy for the `P` parameter.
    p0 = copy(M, p)
    return DistanceOverGradientsStepsize{R, typeof(p0)}(
        ϵ,
        ϵ,                # max_distance starts at initial_distance
        zero(R),          # gradient_sum starts at 0
        p0,
        use_curvature,
        κ,
        convert(R, NaN),  # no stepsize has been computed yet
    )
end
1750+
@doc raw"""
    geometric_curvature_function(κ::Real, d::Real)

Compute the geometric curvature function ``ζ_κ(d)`` used by the RDoG stepsize:

```math
ζ_κ(d) =
\begin{cases}
    1, & \text{if } κ \ge 0,\\[4pt]
    \dfrac{\sqrt{|κ|}\, d}{\tanh(\sqrt{|κ|}\, d)}, & \text{if } κ < 0.
\end{cases}
```

For non-positive `d` the value `1` is returned as well.
The result has the floating-point type obtained from promoting the types of `κ` and `d`.

For small arguments ``x = \sqrt{|κ|}\, d`` the second-order Taylor expansion
``1 + x^2/3`` is used for numerical stability; this also avoids the `0/0`
that arises when ``x`` underflows to zero.
"""
function geometric_curvature_function(κ::Real, d::Real)
    T = float(promote_type(typeof(κ), typeof(d)))
    # Flat or positively bounded curvature (or no distance travelled): no correction.
    (κ < 0 && d > 0) || return one(T)
    x = convert(T, sqrt(abs(κ)) * d)
    # x / tanh(x) = 1 + x^2/3 - x^4/45 + …; below eps^(1/4) the x^4 term is
    # smaller than machine precision, so the truncated series is exact to rounding.
    x < sqrt(sqrt(eps(T))) && return one(T) + x^2 / 3
    return x / tanh(x)
end
1775+
# Evaluate the RDoG stepsize at iteration `i` for the current iterate of `s`.
#
# The gradient is taken from the `gradient` keyword if given, otherwise it is
# computed with `get_gradient(mp, p)`. The call updates `gradient_sum`,
# `max_distance` (and on `i == 0` also `initial_point`) and stores the result in
# `last_stepsize` before returning it.
function (rdog::DistanceOverGradientsStepsize{R, P})(
        mp::AbstractManoptProblem,
        s::AbstractManoptSolverState,
        i,
        args...;
        gradient = nothing,
        kwargs...,
    ) where {R, P}
    M = get_manifold(mp)
    q = get_iterate(s)
    X = gradient === nothing ? get_gradient(mp, q) : gradient

    # Squared gradient norm, bounded below by eps(R) so the division is safe.
    sq_norm = clamp(norm(M, q, X)^2, eps(R), typemax(R))

    if i == 0
        # (Re)start: the sum consists of the first gradient only, the current
        # iterate becomes the reference point, the distance is reset to ϵ.
        rdog.gradient_sum = sq_norm
        rdog.initial_point = copy(M, q)
        rdog.max_distance = rdog.initial_distance
        root_sum = sqrt(sq_norm)
    else
        # Accumulate the gradient norm and track the farthest distance so far.
        rdog.gradient_sum += sq_norm
        rdog.max_distance = max(rdog.max_distance, distance(M, rdog.initial_point, q))
        root_sum = sqrt(rdog.gradient_sum)
    end

    radius = rdog.max_distance
    η = if rdog.use_curvature
        ζ = geometric_curvature_function(rdog.sectional_curvature_bound, radius)
        radius / (sqrt(ζ) * root_sum)
    else
        radius / root_sum
    end

    rdog.last_stepsize = η
    return η
end
1827+
# RDoG has no separate initial stepsize field; this reports the stored
# `last_stepsize`.
function get_initial_stepsize(rdog::DistanceOverGradientsStepsize)
    return rdog.last_stepsize
end
# Return the stepsize computed by the most recent evaluation. Trailing arguments
# are accepted and ignored, matching the signature of the `get_last_stepsize`
# method for `WolfePowellBinaryLinesearchStepsize`, so calls that pass extra
# arguments still reach this method.
function get_last_stepsize(rdog::DistanceOverGradientsStepsize, ::Any...)
    return rdog.last_stepsize
end
1830+
# Print the configuration (as a `DistanceOverGradients(; ...)` call) followed by
# the current values of the mutable state.
function show(io::IO, rdog::DistanceOverGradientsStepsize)
    return print(
        io,
        """
        DistanceOverGradients(;
        initial_distance=$(rdog.initial_distance),
        use_curvature=$(rdog.use_curvature),
        sectional_curvature_bound=$(rdog.sectional_curvature_bound)
        )

        Current state:
        max_distance = $(rdog.max_distance)
        gradient_sum = $(rdog.gradient_sum)
        last_stepsize = $(rdog.last_stepsize)
        """,
    )
end
1846+
# Note: this docstring is a regular (interpolating) string, not `raw`, so that the
# `$(_note(...))` at its end is actually expanded; LaTeX backslashes are escaped.
@doc """
    DistanceOverGradients(; kwargs...)
    DistanceOverGradients(M::AbstractManifold; kwargs...)

Create a factory for the [`DistanceOverGradientsStepsize`](@ref), the
Riemannian Distance over Gradients (RDoG) learning-rate-free stepsize schedule
introduced by [DoddSharrockNemeth:2024](@cite). The schedule adapts without manual
tuning by combining the maximum distance from the start point with the accumulated
gradient norms, optionally corrected by the geometric curvature term ``ζ_κ``.

Definitions used by the implementation:

* ``\\bar r_t := \\max(\\, ϵ,\\, \\max_{0\\le s\\le t} d(p_0, p_s)\\, )`` tracks the
  maximum geodesic distance from the initial point ``p_0`` using the current
  iterate ``p_t``.
* ``G_t := \\sum_{s=0}^t \\lVert g_s \\rVert^2``, where ``g_s = \\operatorname{grad} f(p_s)``.

At iteration ``t`` the stepsize used here is

```math
η_t =
\\begin{cases}
    \\dfrac{\\bar r_t}{\\sqrt{G_t}}, & \\text{if `use_curvature = false`,}\\\\[6pt]
    \\dfrac{\\bar r_t}{\\sqrt{\\, ζ_κ(\\bar r_t)\\, }\\,\\sqrt{G_t}}, & \\text{if `use_curvature = true`,}
\\end{cases}
```

with the geometric curvature function ``ζ_κ(d)`` defined in
[`geometric_curvature_function`](@ref). The initialization in this
implementation follows the paper: on the first call (``t=0``), we set
``G_0=\\lVert g_0\\rVert^2``, ``\\bar r_0 = ϵ`` and take

```math
η_0 =
\\begin{cases}
    \\dfrac{ϵ}{\\lVert g_0\\rVert}, & \\text{if `use_curvature = false`,}\\\\[6pt]
    \\dfrac{ϵ}{\\sqrt{\\, ζ_κ(ϵ)\\, }\\,\\lVert g_0\\rVert}, & \\text{if `use_curvature = true`.}
\\end{cases}
```

On subsequent calls, the state is updated as implemented:
``G_t \\leftarrow G_{t-1} + \\lVert g_t\\rVert^2`` and
``\\bar r_t \\leftarrow \\max(\\bar r_{t-1}, d(p_0,p_t))``.

## Keyword arguments

* `initial_distance=1e-3`: initial distance estimate ``ϵ``
* `use_curvature=false`: whether to include ``ζ_κ``
* `sectional_curvature_bound=0.0`: curvature lower bound ``κ`` (if known)

$(_note(:ManifoldDefaultFactory, "DistanceOverGradientsStepsize"))
"""
function DistanceOverGradients(args...; kwargs...)
    # Defer construction: all positional and keyword arguments are forwarded
    # unchanged to the factory wrapping the stepsize type.
    return ManifoldDefaultsFactory(Manopt.DistanceOverGradientsStepsize, args...; kwargs...)
end
0 commit comments