Jpsl/update flux (#1086)
* Update Flux and GPUArrays compatibility in Project.toml; refactor FluxApproximator and TargetNetwork implementations

* Refactor target network optimization and update test assertions for consistency

* Simplify FluxApproximator's optimise! method by using a single-line function definition

* Bump version to 0.15.4 in Project.toml

* Update NEWS.md for v0.15.4: Upgrade Flux.jl to v0.16 and resolve deprecation warnings

* Add Conda dependency and update test environment setup

* Update test environment setup to use pip for gym installation

* Fix RLEnv tests

* Fix optimizer reference in stock trading environment example

* Fix optimizer reference in stock trading environment example

* Refactor optimizer implementation in DDPGPolicy to use OptimiserChain (see the migration sketch after the file summary below)
jeremiahpslewis authored Dec 17, 2024
1 parent 35c2092 commit d8c159d
Showing 10 changed files with 88 additions and 78 deletions.
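The change that recurs across the files below is the migration from Flux's implicit-parameter API, removed in Flux v0.15/v0.16, to the explicit style: gradients are taken with respect to the model itself, optimiser state is created with Flux.setup, and rule composition uses OptimiserChain instead of Flux.Optimise.Optimiser. A minimal sketch of the new pattern, assuming a throwaway Chain and mean-squared-error loss (none of these names come from the repository):

using Flux

model = Chain(Dense(10 => 5, relu), Dense(5 => 2))
x, y = rand(Float32, 10, 8), rand(Float32, 2, 8)

# Explicit optimiser state replaces the old implicit Flux.params/Optimiser pair.
opt_state = Flux.setup(OptimiserChain(ClipNorm(0.5), Adam(1e-2)), model)

# Gradient with respect to the model itself: a NamedTuple mirroring the model's structure.
grad = Flux.gradient(m -> Flux.mse(m(x), y), model)[1]
Flux.update!(opt_state, model, grad)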
4 changes: 2 additions & 2 deletions docs/homepage/blog/ospp_report_210370190/index.md
@@ -491,11 +491,11 @@ create_critic(critic_dim) = Chain(
create_policy(player) = DDPGPolicy(
behavior_actor = NeuralNetworkApproximator(
model = create_actor(player),
optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
),
behavior_critic = NeuralNetworkApproximator(
model = create_critic(critic_dim),
optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
),
target_actor = NeuralNetworkApproximator(
model = create_actor(player),
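A side note on the replacement above: OptimiserChain (from Optimisers.jl, re-exported by Flux) applies its rules left to right, so gradients are clipped to norm 0.5 before Adam sees them, matching the old Flux.Optimise.Optimiser chain. A hedged sketch with placeholder networks — the real create_actor/create_critic are defined elsewhere in the blog post; under the explicit API each approximator keeps its own optimiser state via Flux.setup:

using Flux

actor  = Chain(Dense(4 => 64, relu), Dense(64 => 2, tanh))   # placeholder for create_actor(player)
critic = Chain(Dense(6 => 64, relu), Dense(64 => 1))         # placeholder for create_critic(critic_dim)

clipped_adam() = OptimiserChain(ClipNorm(0.5), Adam(1e-2))   # clip first, then apply Adam

actor_state  = Flux.setup(clipped_adam(), actor)
critic_state = Flux.setup(clipped_adam(), critic)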
4 changes: 4 additions & 0 deletions src/ReinforcementLearningCore/NEWS.md
@@ -1,5 +1,9 @@
# ReinforcementLearningCore.jl Release Notes

#### v0.15.4

- Update `Flux.jl` to `v0.16` and fix deprecation warnings and method errors

#### v0.15.3

- Make `FluxApproximator` work with `QBasedPolicy`
6 changes: 3 additions & 3 deletions src/ReinforcementLearningCore/Project.toml
@@ -1,6 +1,6 @@
name = "ReinforcementLearningCore"
uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
version = "0.15.3"
version = "0.15.4"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -31,8 +31,8 @@ CircularArrayBuffers = "0.1.12"
Crayons = "4"
Distributions = "0.25"
FillArrays = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
Flux = "0.14"
GPUArrays = "8, 9, 10"
Flux = "0.14, 0.15, 0.16"
GPUArrays = "8, 9, 10, 11"
Metal = "1.0"
ProgressMeter = "1"
Reexport = "1"
@@ -43,5 +43,5 @@ Flux.@layer FluxApproximator trainable=(model,)
forward(A::FluxApproximator, args...; kwargs...) = A.model(args...; kwargs...)
forward(A::FluxApproximator, env::E, player::AbstractPlayer=current_player(env)) where {E <: AbstractEnv} = env |> (x -> state(x, player)) |> (x -> forward(A, x))

RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) =
Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) = Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
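The single-line optimise! above works because Zygote returns structural gradients: differentiating with respect to a wrapper struct yields a NamedTuple with one entry per field, which is why the approximator's gradient is indexed as grad.model. A small sketch under that assumption; the Approx struct here is illustrative, not the real FluxApproximator:

using Flux

struct Approx{M,O}
    model::M
    optimiser_state::O
end

m = Chain(Dense(10 => 5, relu), Dense(5 => 2))
A = Approx(m, Flux.setup(Adam(), m))

x = rand(Float32, 10, 4)
grad = Flux.gradient(a -> sum(a.model(x)), A)[1]   # (model = ..., optimiser_state = nothing)
Flux.update!(A.optimiser_state, A.model, grad.model)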

@@ -74,9 +74,12 @@ function RLBase.optimise!(tn::TargetNetwork, grad::NamedTuple)
tn.n_optimise += 1

if tn.n_optimise % tn.sync_freq == 0
# polyak averaging
for (dest, src) in zip(Flux.params(target(tn)), Flux.params(tn.network))
dest .= tn.ρ .* dest .+ (1 - tn.ρ) .* src
# Polyak averaging
src_layers = RLCore.model(tn)
dest_layers = RLCore.target(tn)
for i in 1:length(src_layers)
dest_layers[i].weight .= tn.ρ .* dest_layers[i].weight .+ (1 - tn.ρ) .* src_layers[i].weight
dest_layers[i].bias .= tn.ρ .* dest_layers[i].bias .+ (1 - tn.ρ) .* src_layers[i].bias
end
tn.n_optimise = 0
end
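The rewritten sync branch above performs Polyak (soft) averaging by walking the layers of the online and target models and blending their weight and bias fields directly, instead of iterating the removed Flux.params. A standalone sketch of the same idea on two plain Chains, assuming every layer is Dense-like and exposes weight and bias:

using Flux

online = Chain(Dense(10 => 5, relu), Dense(5 => 2))
target = deepcopy(online)
ρ = 0.99f0                      # fraction of the old target parameters to keep

for (dst, src) in zip(target.layers, online.layers)
    dst.weight .= ρ .* dst.weight .+ (1 - ρ) .* src.weight
    dst.bias   .= ρ .* dst.bias   .+ (1 - ρ) .* src.bias
end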
@@ -10,9 +10,9 @@ using ReinforcementLearningCore
@test_throws "AssertionError: `FluxApproximator` model is not on GPU." TargetNetwork(FluxApproximator(model, optimiser), use_gpu=true)
end
@test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
@test TargetNetwork(FluxApproximator(model, optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
@test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork

approx = FluxApproximator(model, optimiser, use_gpu=false)
approx = FluxApproximator(model=model, optimiser=optimiser, use_gpu=false)
target_network = TargetNetwork(approx, use_gpu=false)


@@ -38,7 +38,7 @@ using ReinforcementLearningCore
@testset "Optimise" begin
optimiser = Adam()
model = Chain(Dense(10, 5, relu), Dense(5, 2))
approximator = FluxApproximator(model, optimiser)
approximator = FluxApproximator(model=model, optimiser=optimiser)
target_network = TargetNetwork(approximator)
input = rand(Float32, 10)
grad = Flux.Zygote.gradient(target_network) do model
@@ -54,7 +54,7 @@

@testset "Sync" begin
optimiser = Adam()
model = FluxApproximator(Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser)
model = FluxApproximator(model=Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser=optimiser)
target_network = TargetNetwork(model, sync_freq=2, ρ=0.5)

input = rand(Float32, 10)
@@ -75,9 +75,9 @@ end
m = Chain(Dense(4,1))
app = FluxApproximator(model = m, optimiser = Flux.Adam(), use_gpu=true)
tn = TargetNetwork(app, sync_freq = 3, use_gpu=true)
@test typeof(model(tn)) == typeof(target(tn))
p1 = Flux.destructure(model(tn))[1]
pt1 = Flux.destructure(target(tn))[1]
@test typeof(RLCore.model(tn)) == typeof(RLCore.target(tn))
p1 = Flux.destructure(RLCore.model(tn))[1]
pt1 = Flux.destructure(RLCore.target(tn))[1]
@test p1 == pt1
input = gpu(ones(Float32, 4))
grad = Flux.Zygote.gradient(tn) do model
@@ -87,16 +87,16 @@ end
grad_model = grad[1]

RLCore.optimise!(tn, grad_model)
@test p1 != Flux.destructure(model(tn))[1]
@test p1 == Flux.destructure(target(tn))[1]
@test p1 != Flux.destructure(RLCore.model(tn))[1]
@test p1 == Flux.destructure(RLCore.target(tn))[1]
RLCore.optimise!(tn, grad_model)
@test p1 != Flux.destructure(model(tn))[1]
@test p1 != Flux.destructure(RLCore.model(tn))[1]
@test p1 == Flux.destructure(target(tn))[1]
RLCore.optimise!(tn, grad_model)
@test Flux.destructure(target(tn))[1] == Flux.destructure(model(tn))[1]
@test Flux.destructure(RLCore.target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
@test p1 != Flux.destructure(target(tn))[1]
p2 = Flux.destructure(model(tn))[1]
p2 = Flux.destructure(RLCore.model(tn))[1]
RLCore.optimise!(tn, grad_model)
@test p2 != Flux.destructure(model(tn))[1]
@test p2 == Flux.destructure(target(tn))[1]
@test p2 != Flux.destructure(RLCore.model(tn))[1]
@test p2 == Flux.destructure(RLCore.target(tn))[1]
end
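The assertions in this test lean on Flux.destructure, which flattens all of a model's parameter arrays into a single vector (plus a closure that rebuilds the model), so "did training move the parameters?" becomes a plain vector comparison. A tiny sketch of the idiom with an illustrative model:

using Flux

m = Chain(Dense(4 => 3, relu), Dense(3 => 1))
before, rebuild = Flux.destructure(m)          # flat parameter vector + reconstruction closure

opt_state = Flux.setup(Adam(), m)
g = Flux.gradient(model -> sum(model(ones(Float32, 4))), m)[1]
Flux.update!(opt_state, m, g)

after, _ = Flux.destructure(m)
@assert before != after                        # the update changed the parameters
restored = rebuild(before)                     # a copy of the model with the old parameters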
98 changes: 48 additions & 50 deletions src/ReinforcementLearningCore/test/utils/networks.jl
@@ -22,13 +22,13 @@ import ReinforcementLearningBase: RLBase
q_values = NN(rand(Float32, 2))
@test size(q_values) == (3,)
gs = gradient(params(NN)) do
gs = gradient(NN) do
sum(NN(rand(Float32, 2, 5)))
end
old_params = deepcopy(collect(params(NN).params))
old_params = deepcopy(collect(Flux.trainable(NN).params))
push!(NN, gs)
new_params = collect(params(NN).params)
new_params = collect(Flux.trainable(NN).params)
@test old_params != new_params
end
@@ -72,42 +72,40 @@ import ReinforcementLearningBase: RLBase
end
@testset "Correctness of gradients" begin
@testset "One action per state" begin
@test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
@test Flux.trainable(gn).pre == gn.pre
@test Flux.trainable(gn).μ == gn.μ
@test Flux.trainable(gn).σ == gn.σ
action_saver = Matrix[]
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(state, is_sampling = true, is_return_log_prob = true)
g = Flux.gradient(gn) do model
a, logp = model(state, is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
#Check that gradients are identical
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
@test g == g2
end
@testset "Multiple actions per state" begin
#Same with multiple actions sampled
action_saver = []
state = unsqueeze(state, dims = 2)
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(state, 3)
g1 = Flux.gradient(gn) do model
a, logp = model(state, 3)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
@test g1 == g2
end
end
end
@@ -117,7 +115,6 @@ import ReinforcementLearningBase: RLBase
gn = GaussianNetwork(Dense(20,15), Dense(15,10), Dense(15,10, softplus)) |> gpu
state = rand(Float32, 20,3) |> gpu #batch of 3 states
@testset "Forward pass compatibility" begin
@test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
m, L = gn(state)
@test size(m) == size(L) == (10,3)
a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
@@ -134,15 +131,15 @@ import ReinforcementLearningBase: RLBase
@testset "Backward pass compatibility" begin
@testset "One action sampling" begin
action_saver = CuMatrix[]
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
g = Flux.gradient(gn) do model
a, logp = model(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
#Check that gradients are identical
@@ -153,15 +150,15 @@ import ReinforcementLearningBase: RLBase
@testset "Multiple actions sampling" begin
action_saver = []
state = unsqueeze(state, dims = 2)
g = Flux.gradient(Flux.params(gn)) do
g = Flux.gradient(gn) do
a, logp = gn(CUDA.CURAND.RNG(), state, 3)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
sum(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(state, only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(state, only(action_saver))
sum(logp)
end
for (grad1, grad2) in zip(g,g2)
@@ -202,7 +199,10 @@ import ReinforcementLearningBase: RLBase
μ = Dense(15,10)
Σ = Dense(15,10*11÷2)
gn = CovGaussianNetwork(pre, μ, Σ)
@test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
@test Flux.trainable(gn).pre == pre
@test Flux.trainable(gn).μ == μ
@test Flux.trainable(gn).Σ == Σ

state = rand(Float32, 20,3) #batch of 3 states
#Check that it works in 2D
m, L = gn(state)
@@ -233,35 +233,34 @@ import ReinforcementLearningBase: RLBase
logp_truth = [logpdf(mvn, a) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
@test stack(logp_truth; dims=2) ≈ dropdims(logps,dims = 1) #test against ground truth
action_saver = []
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
g1 = Flux.gradient(gn) do model
a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end
g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
@test g1 == g2

empty!(action_saver)
g3 = Flux.gradient(Flux.params(gn)) do
a, logp = gn(Flux.unsqueeze(state,dims = 2), 3)

g3 = Flux.gradient(gn) do model
a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end
g4 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g4 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state, dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g4,g3)
@test grad1 ≈ grad2
end

@test g4 == g3
end
@testset "CUDA" begin
if (@isdefined CUDA) && CUDA.functional()
@@ -271,7 +270,6 @@ import ReinforcementLearningBase: RLBase
μ = Dense(15,10) |> gpu
Σ = Dense(15,10*11÷2) |> gpu
gn = CovGaussianNetwork(pre, μ, Σ)
@test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
state = rand(Float32, 20,3)|> gpu #batch of 3 states
m, L = gn(Flux.unsqueeze(state,dims = 2))
@test size(m) == (10,1,3)
Expand All @@ -292,31 +290,31 @@ import ReinforcementLearningBase: RLBase
logp_truth = [logpdf(mvn, cpu(a)) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
@test reduce(hcat, collect(logp_truth)) ≈ dropdims(cpu(logps); dims=1) #test against ground truth
action_saver = []
g = Flux.gradient(Flux.params(gn)) do
a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
g = Flux.gradient(gn) do model
a, logp = model(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end

g2 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g2 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g,g2)
@test grad1 ≈ grad2
end
empty!(action_saver)
g3 = Flux.gradient(Flux.params(gn)) do
a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), 3)
g3 = Flux.gradient(gn) do model
a, logp = model(rng, Flux.unsqueeze(state,dims = 2), 3)
ChainRulesCore.ignore_derivatives() do
push!(action_saver, a)
end
mean(logp)
end
g4 = Flux.gradient(Flux.params(gn)) do
logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
g4 = Flux.gradient(gn) do model
logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
mean(logp)
end
for (grad1, grad2) in zip(g4,g3)
@@ -26,7 +26,7 @@ end
Dense(ns, 64, relu),
Dense(64, na, relu),
),
Flux.Optimise.Optimiser(ClipNorm(0.5), ADAM(1e-5)),
OptimiserChain(ClipNorm(0.5), Adam(1e-5)),
),
explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
),
9 changes: 7 additions & 2 deletions src/ReinforcementLearningEnvironments/test/runtests.jl
@@ -14,8 +14,13 @@ using TimerOutputs
using Conda
using JLD2

Conda.add("gym")
Conda.add("numpy")
ENV["CONDA_JL_USE_MINIFORGE"] = "1"

Conda.add("python", Conda.ROOTENV)
Conda.add("numpy", Conda.ROOTENV)
Conda.pip_interop(true, Conda.ROOTENV)
Conda.pip("install", "gym", Conda.ROOTENV)


@testset "ReinforcementLearningEnvironments" begin
include("environments/environments.jl")
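A hedged sanity check, not part of the commit: if PyCall.jl is built against the same Conda root environment (for example by setting ENV["PYTHON"] = "" before building PyCall), the pip-installed gym should be importable from Julia. The environment id and the shape of reset's return value depend on the installed gym version.

using PyCall

gym = pyimport("gym")              # fails here if the pip install above did not succeed
env = gym.make("CartPole-v1")
obs = env.reset()                  # newer gym releases return (observation, info)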
2 changes: 1 addition & 1 deletion src/ReinforcementLearningFarm/Project.toml
@@ -13,7 +13,7 @@ ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"

[compat]
FillArrays = "1"
Flux = "0.14"
Flux = "0.14, 0.15, 0.16"
CircularArrayBuffers = "0.1.12"
Distributions = "0.25"
ReinforcementLearning = "0.11"