Skip to content

Commit ff46b85

Browse files
authored
refactor!: update package (#27)
* Transition to ImplicitDiff v0.8 * Add projection correctness test * Clean up * Fixes * Allow specifying x0 * Different sizes * Fix * Show for debugging * Show * Make tests pass
1 parent 9fc82fc commit ff46b85

File tree

11 files changed

+255
-111
lines changed

11 files changed

+255
-111
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@
55
/docs/build/
66
/docs/Manifest.toml
77
/docs/src/tutorial.md
8+
playground.jl

Project.toml

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "DifferentiableFrankWolfe"
22
uuid = "b383313e-5450-4164-a800-befbd27b574d"
33
authors = ["Guillaume Dalle"]
4-
version = "0.4.1"
4+
version = "0.5.0"
55

66
[deps]
77
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
@@ -11,8 +11,8 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
1111

1212
[compat]
1313
ChainRulesCore = "1.15"
14-
FrankWolfe = "0.3, 0.4, 0.5"
15-
ImplicitDifferentiation = "0.7"
14+
FrankWolfe = "0.5"
15+
ImplicitDifferentiation = "0.8"
1616
LinearAlgebra = "1"
1717
julia = "1.10"
1818

@@ -25,10 +25,31 @@ FrankWolfe = "f55ce6ea-fdc5-4628-88c5-0087fe54bd30"
2525
ImplicitDifferentiation = "57b37032-215b-411a-8a7c-41a003a55207"
2626
JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
2727
JuliaFormatter = "98e50ef6-434e-11e9-1051-2b60c6c9e899"
28+
ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537"
2829
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
30+
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
2931
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
3032
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
33+
TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
34+
TestItems = "1c621080-faea-4a02-84b6-bbd5e436b8fe"
3135
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
3236

3337
[targets]
34-
test = ["Aqua", "ChainRulesCore", "Documenter", "ForwardDiff", "FrankWolfe", "ImplicitDifferentiation", "JET", "JuliaFormatter", "Random", "Statistics", "Test", "Zygote"]
38+
test = [
39+
"Aqua",
40+
"ChainRulesCore",
41+
"Documenter",
42+
"ForwardDiff",
43+
"FrankWolfe",
44+
"ImplicitDifferentiation",
45+
"JET",
46+
"JuliaFormatter",
47+
"ProximalOperators",
48+
"Random",
49+
"StableRNGs",
50+
"Statistics",
51+
"Test",
52+
"TestItems",
53+
"TestItemRunner",
54+
"Zygote",
55+
]

docs/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
44
ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
55
FrankWolfe = "f55ce6ea-fdc5-4628-88c5-0087fe54bd30"
66
Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
7+
ProximalOperators = "a725b495-10eb-56fe-b38b-717eba820537"
78
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
89

910
[compat]

docs/src/index.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,3 @@
1-
```@meta
2-
CurrentModule = DifferentiableFrankWolfe
3-
```
4-
51
# DifferentiableFrankWolfe
62

73
Documentation for [DifferentiableFrankWolfe.jl](https://github.com/gdalle/DifferentiableFrankWolfe.jl).

examples/tutorial.jl

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,40 +4,49 @@
44

55
using DifferentiableFrankWolfe: DiffFW, simplex_projection
66
using ForwardDiff: ForwardDiff
7-
using FrankWolfe: UnitSimplexOracle
7+
using FrankWolfe: ProbabilitySimplexOracle
8+
using ProximalOperators: ProximalOperators
89
using Test: @test
910
using Zygote: Zygote
1011

1112
# Constructing the wrapper
1213

1314
f(x, θ) = 0.5 * sum(abs2, x - θ) # minimizing the squared distance...
1415
f_grad1(x, θ) = x - θ
15-
lmo = UnitSimplexOracle(1.0) # ... to the probability simplex
16-
dfw = DiffFW(f, f_grad1, lmo); # ... is equivalent to a simplex projection
16+
lmo = ProbabilitySimplexOracle(1.0) # ... to the probability simplex
17+
dfw = DiffFW(f, f_grad1, lmo); # ... is equivalent to a simplex projection if we're not already in it
1718

1819
# Calling the wrapper
1920

20-
θ = rand(10)
21+
x0 = ones(3) ./ 3
22+
θ = [1.0, 1.5, 0.2]
2123

2224
#-
2325

2426
frank_wolfe_kwargs = (; max_iteration=100, epsilon=1e-4)
25-
y, stats = dfw(θ, frank_wolfe_kwargs)
26-
y
27+
y = dfw(θ, x0; frank_wolfe_kwargs...)
28+
29+
#- Comparing with the ground truth
30+
31+
true_simplex_projection(x) = ProximalOperators.prox(ProximalOperators.IndSimplex(1.0), x)[1]
2732

2833
#-
2934

30-
y_true = simplex_projection(θ)
35+
y_true = true_simplex_projection(θ)
3136
@test Vector(y) ≈ Vector(y_true) atol = 1e-3
3237

3338
# Differentiating the wrapper
3439

35-
J1 = Zygote.jacobian(_θ -> dfw(_θ, frank_wolfe_kwargs)[1], θ)[1]
36-
J1_true = Zygote.jacobian(simplex_projection, θ)[1]
37-
@test J1 J1_true atol = 1e-3
40+
#-
41+
42+
J_true = ForwardDiff.jacobian(true_simplex_projection, θ)
43+
44+
#-
45+
46+
J1 = Zygote.jacobian(_θ -> dfw(_θ, x0; frank_wolfe_kwargs...), θ)[1]
47+
@test J1 ≈ J_true atol = 1e-3
3848

3949
#-
4050

41-
J2 = ForwardDiff.jacobian(_θ -> dfw(_θ, frank_wolfe_kwargs)[1], θ)
42-
J2_true = ForwardDiff.jacobian(simplex_projection, θ)
43-
@test J2 J2_true atol = 1e-3
51+
J2 = ForwardDiff.jacobian(_θ -> dfw(_θ, x0; frank_wolfe_kwargs...), θ)
52+
@test J2 ≈ J_true atol = 1e-3

src/DifferentiableFrankWolfe.jl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,12 @@ module DifferentiableFrankWolfe
77

88
using ChainRulesCore: ChainRulesCore, NoTangent, ProjectTo, unthunk
99
using FrankWolfe: FrankWolfe, LinearMinimizationOracle
10-
using FrankWolfe: away_frank_wolfe, compute_extreme_point
10+
using FrankWolfe:
11+
away_frank_wolfe,
12+
blended_conditional_gradient,
13+
blended_pairwise_conditional_gradient,
14+
compute_extreme_point,
15+
pairwise_frank_wolfe
1116
using ImplicitDifferentiation: ImplicitFunction
1217
using LinearAlgebra: dot
1318

src/difffw.jl

Lines changed: 70 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,35 @@
11
"""
22
ForwardFW
33
4-
Underlying solver for [`DiffFW`](@ref), which relies on a variant of Frank-Wolfe.
4+
Underlying solver for [`DiffFW`](@ref), which relies on a variant of Frank-Wolfe with active set memorization.
55
"""
66
struct ForwardFW{F,G,M,A}
77
f::F
88
f_grad1::G
99
lmo::M
1010
alg::A
11+
12+
function ForwardFW(f, f_grad1, lmo, alg)
13+
@assert alg in (
14+
away_frank_wolfe,
15+
blended_conditional_gradient,
16+
blended_pairwise_conditional_gradient,
17+
pairwise_frank_wolfe,
18+
)
19+
return new{typeof(f),typeof(f_grad1),typeof(lmo),typeof(alg)}(f, f_grad1, lmo, alg)
20+
end
21+
end
22+
23+
function (forward::ForwardFW)(θ::AbstractArray, x0::AbstractArray, frank_wolfe_kwargs)
24+
f, f_grad1, lmo, alg = forward.f, forward.f_grad1, forward.lmo, forward.alg
25+
obj(x) = f(x, θ)
26+
grad!(g, x) = copyto!(g, f_grad1(x, θ))
27+
x_final, v_final, primal_value, dual_gap, traj_data, active_set = alg(
28+
obj, grad!, lmo, x0; frank_wolfe_kwargs...
29+
)
30+
stats = (; x_final, v_final, primal_value, dual_gap, traj_data, active_set)
31+
p = active_set.weights
32+
return p, stats
1133
end
1234

1335
"""
@@ -19,20 +41,42 @@ struct ConditionsFW{G}
1941
f_grad1::G
2042
end
2143

44+
function (conditions::ConditionsFW)(
45+
θ::AbstractArray,
46+
p::AbstractVector,
47+
stats::NamedTuple,
48+
_x0::AbstractArray,
49+
_frank_wolfe_kwargs,
50+
)
51+
V = stats.active_set.atoms
52+
f_grad1 = conditions.f_grad1
53+
V_mat = stack(V)
54+
x = V_mat * p
55+
∇ₓf = f_grad1(x, θ)
56+
∇ₚg = transpose(V_mat) * ∇ₓf
57+
T = simplex_projection(p .- ∇ₚg)
58+
return T .- p
59+
end
60+
2261
"""
2362
DiffFW
2463
25-
Callable parametrized wrapper for the Frank-Wolfe algorithm to solve `θ -> argmin_{x ∈ C} f(x, θ)`, which can be differentiated implicitly wrt `θ`.
64+
Callable parametrized wrapper for the Frank-Wolfe algorithm to solve `θ -> argmin_{x ∈ C} f(x, θ)` from a given starting point `x0`.
65+
The solution routine can be differentiated implicitly with respect to `θ`, but not with respect to `x0`.
66+
67+
# Constructor
68+
69+
DiffFW(f, f_grad1, lmo, alg=away_frank_wolfe; implicit_kwargs=(;))
2670
27-
Reference: <https://arxiv.org/abs/2105.15183> (section 2 + end of appendix A).
71+
- `f`: function `f(x, θ)` to minimize with respect to `x`
72+
- `f_grad1`: gradient `∇ₓf(x, θ)` of `f` with respect to `x`
73+
- `lmo`: linear minimization oracle `θ -> argmin_{x ∈ C} θᵀx` from [FrankWolfe.jl](https://github.com/ZIB-IOL/FrankWolfe.jl), implicitly defines the convex set `C`
74+
- `alg`: optimization algorithm from [FrankWolfe.jl](https://github.com/ZIB-IOL/FrankWolfe.jl), must return an `active_set`
75+
- `implicit_kwargs`: keyword arguments passed to the `ImplicitFunction` object from [ImplicitDifferentiation.jl](https://github.com/gdalle/ImplicitDifferentiation.jl)
2876
29-
# Fields
77+
# References
3078
31-
- `f`: function `f(x, θ)` to minimize wrt `x`
32-
- `f_grad1`: gradient `∇ₓf(x, θ)` of `f` wrt `x`
33-
- `lmo`: linear minimization oracle `θ -> argmin_{x ∈ C} θᵀx` from [FrankWolfe.jl], implicitly defines the convex set `C`
34-
- `alg`: optimization algorithm from [FrankWolfe.jl](https://github.com/ZIB-IOL/FrankWolfe.jl)
35-
- `implicit`: implicit function from [ImplicitDifferentiation.jl](https://github.com/gdalle/ImplicitDifferentiation.jl)
79+
> [Efficient and Modular Implicit Differentiation](https://proceedings.neurips.cc/paper_files/paper/2022/hash/228b9279ecf9bbafe582406850c57115-Abstract-Conference.html), Blondel et al. (2022)
3680
"""
3781
struct DiffFW{F,G,M<:LinearMinimizationOracle,A,I<:ImplicitFunction}
3882
f::F
@@ -42,11 +86,6 @@ struct DiffFW{F,G,M<:LinearMinimizationOracle,A,I<:ImplicitFunction}
4286
implicit::I
4387
end
4488

45-
"""
46-
DiffFW(f, f_grad1, lmo, alg=away_frank_wolfe; implicit_kwargs=(;))
47-
48-
Constructor for [`DiffFW`](@ref) which chooses a default algorithm and creates the implicit function automatically.
49-
"""
5089
function DiffFW(
5190
f::F, f_grad1::G, lmo::L, alg::A=away_frank_wolfe; implicit_kwargs=NamedTuple()
5291
) where {F,G,L,A}
@@ -57,40 +96,30 @@ function DiffFW(
5796
end
5897

5998
"""
60-
(dfw::DiffFW)(θ::AbstractArray, frank_wolfe_kwargs::NamedTuple)
99+
detailed_output(dfw::DiffFW, θ::AbstractArray, x0::AbstractArray; kwargs...)
61100
62-
Apply the Frank-Wolfe algorithm to `θ` with settings defined by the named tuple `frank_wolfe_kwargs` (given as a positional argument).
101+
Apply the differentiable Frank-Wolfe algorithm defined by `dfw` to parameter `θ` with starting point `x0`.
102+
Keyword arguments are passed on to the Frank-Wolfe algorithm inside `dfw`.
63103
64104
Return a couple (x, stats) where `x` is the solution and `stats` is a named tuple containing additional information (its contents are not covered by public API, and mostly useful for debugging).
65105
"""
66-
function (dfw::DiffFW)(θ::AbstractArray, frank_wolfe_kwargs=NamedTuple())
67-
p, stats = dfw.implicit(θ, frank_wolfe_kwargs)
106+
function detailed_output(dfw::DiffFW, θ::AbstractArray, x0::AbstractArray; kwargs...)
107+
p, stats = dfw.implicit(θ, x0, kwargs)
68108
V = stats.active_set.atoms
69-
x = mapreduce(*,+,p,V)
109+
V_mat = stack(V)
110+
x = V_mat * p
70111
return x, stats
71112
end
72113

73-
function (forward::ForwardFW)(θ::AbstractArray, frank_wolfe_kwargs::NamedTuple)
74-
f, f_grad1, lmo, alg = forward.f, forward.f_grad1, forward.lmo, forward.alg
75-
obj(x) = f(x, θ)
76-
grad!(g, x) = copyto!(g, f_grad1(x, θ))
77-
x0 = compute_extreme_point(lmo, θ)
78-
x_final, v_final, primal_value, dual_gap, traj_data, active_set = alg(
79-
obj, grad!, lmo, x0; frank_wolfe_kwargs...
80-
)
81-
stats = (; x_final, v_final, primal_value, dual_gap, traj_data, active_set)
82-
p = active_set.weights
83-
return p, stats
84-
end
114+
"""
115+
(dfw::DiffFW)(θ::AbstractArray, x0::AbstractArray; kwargs...)
85116
86-
function (conditions::ConditionsFW)(
87-
θ::AbstractArray, p::AbstractVector, stats::NamedTuple, frank_wolfe_kwargs::NamedTuple
88-
)
89-
V = stats.active_set.atoms
90-
x = mapreduce(*,+,p,V)
91-
f_grad1 = conditions.f_grad1
92-
∇ₓf = f_grad1(x, θ)
93-
∇ₚg = dot.(V, Ref(∇ₓf))
94-
T = simplex_projection(p .- ∇ₚg)
95-
return T .- p
117+
Apply the differentiable Frank-Wolfe algorithm defined by `dfw` to parameter `θ` with starting point `x0`.
118+
Keyword arguments are passed on to the Frank-Wolfe algorithm inside `dfw`.
119+
120+
Return the optimal solution `x`.
121+
"""
122+
function (dfw::DiffFW)(θ::AbstractArray, x0::AbstractArray; kwargs...)
123+
x, _ = detailed_output(dfw, θ, x0; kwargs...)
124+
return x
96125
end

src/simplex_projection.jl

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,29 +5,26 @@ Compute the Euclidean projection of the vector `z` onto the probability simplex.
55
66
This function is differentiable thanks to a custom chain rule.
77
8-
Reference: <https://arxiv.org/abs/1602.02068>.
8+
# References
9+
10+
> [From Softmax to Sparsemax: A Sparse Model of Attention and Multi-Label Classification](https://proceedings.mlr.press/v48/martins16.html), Martins and Astudillo (2016)
911
"""
1012
function simplex_projection(z::AbstractVector{<:Real}; kwargs...)
1113
p, _ = simplex_projection_and_support(z)
1214
return p
1315
end
1416

15-
"""
16-
simplex_projection_and_support(z)
17-
18-
Compute the Euclidean projection `p` of `z` on the probability simplex as well as the indicators `s` of its support, which are useful for differentiation.
17+
relu(x) = max(x, zero(typeof(x)))
1918

20-
Reference: <https://arxiv.org/abs/1602.02068>.
21-
"""
22-
function simplex_projection_and_support(z::AbstractVector{<:Real})
19+
function simplex_projection_and_support(z::AbstractVector{T}) where {T<:Real}
2320
d = length(z)
2421
z_sorted = sort(z; rev=true)
2522
z_sorted_cumsum = cumsum(z_sorted)
26-
k = maximum(j for j in 1:d if (1 + j * z_sorted[j]) > z_sorted_cumsum[j])
23+
ind_filter = 1 .+ (1:d) .* z_sorted .> z_sorted_cumsum
24+
k = findlast(ind_filter)
2725
τ = (z_sorted_cumsum[k] - 1) / k
28-
p = z .- τ
29-
p .= max.(p, zero(eltype(p)))
30-
s = [Int(p[i] > eps()) for i in 1:d]
26+
p = relu.(z .- τ)
27+
s = p .> eps(T)
3128
return p, s
3229
end
3330

0 commit comments

Comments
 (0)