From d5ec5cc1d81cc5c764668548f6c05221003f30ae Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:04:02 +0100
Subject: [PATCH 01/13] Update Flux and GPUArrays compatibility in Project.toml; refactor FluxApproximator and TargetNetwork implementations

---
 src/ReinforcementLearningCore/Project.toml    |   4 +-
 .../policies/learners/flux_approximator.jl    |   5 +-
 .../src/policies/learners/target_network.jl   |   7 +-
 .../test/policies/learners/target_network.jl  |  28 ++---
 .../test/runtests.jl                          |   4 +-
 .../test/utils/networks.jl                    | 100 +++++++++---------
 src/ReinforcementLearningFarm/Project.toml    |   2 +-
 7 files changed, 78 insertions(+), 72 deletions(-)

diff --git a/src/ReinforcementLearningCore/Project.toml b/src/ReinforcementLearningCore/Project.toml
index 05d099c86..5209c0562 100644
--- a/src/ReinforcementLearningCore/Project.toml
+++ b/src/ReinforcementLearningCore/Project.toml
@@ -31,8 +31,8 @@ CircularArrayBuffers = "0.1.12"
 Crayons = "4"
 Distributions = "0.25"
 FillArrays = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
-Flux = "0.14"
-GPUArrays = "8, 9, 10"
+Flux = "0.14, 0.15, 0.16"
+GPUArrays = "8, 9, 10, 11"
 Metal = "1.0"
 ProgressMeter = "1"
 Reexport = "1"
diff --git a/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl b/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
index 2227e201f..6657e6bc2 100644
--- a/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
+++ b/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
@@ -43,5 +43,8 @@ Flux.@layer FluxApproximator trainable=(model,)
 forward(A::FluxApproximator, args...; kwargs...) = A.model(args...; kwargs...)
 forward(A::FluxApproximator, env::E, player::AbstractPlayer=current_player(env)) where {E <: AbstractEnv} = env |> (x -> state(x, player)) |> (x -> forward(A, x))
 
-RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) =
+function RLBase.optimise!(A::FluxApproximator, grad::NamedTuple)
+
     Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
+end
+
diff --git a/src/ReinforcementLearningCore/src/policies/learners/target_network.jl b/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
index 7a3b8490a..e673e5b51 100644
--- a/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
+++ b/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
@@ -75,8 +75,11 @@ function RLBase.optimise!(tn::TargetNetwork, grad::NamedTuple)
     if tn.n_optimise % tn.sync_freq == 0
         # polyak averaging
-        for (dest, src) in zip(Flux.params(target(tn)), Flux.params(tn.network))
-            dest .= tn.ρ .* dest .+ (1 - tn.ρ) .* src
+        zip(Flux.params(RLCore.model(tn)), Flux.params(RLCore.target(tn)))
+        src_layers = RLCore.model(tn)
+        dest_layers = RLCore.target(tn)
+        for i in 1:length(src_layers)
+            dest_layers[i].weight .= tn.ρ .* dest_layers[i].weight .+ (1 - tn.ρ) .* src_layers[i].weight
         end
         tn.n_optimise = 0
     end
diff --git a/src/ReinforcementLearningCore/test/policies/learners/target_network.jl b/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
index e9182ddaa..c95e025dd 100644
--- a/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
+++ b/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
@@ -10,9 +10,9 @@ using ReinforcementLearningCore
         @test_throws "AssertionError: `FluxApproximator` model is not on GPU." TargetNetwork(FluxApproximator(model, optimiser), use_gpu=true)
     end
     @test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
-    @test TargetNetwork(FluxApproximator(model, optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
+    @test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
 
-    approx = FluxApproximator(model, optimiser, use_gpu=false)
+    approx = FluxApproximator(model=model, optimiser=optimiser, use_gpu=false)
     target_network = TargetNetwork(approx, use_gpu=false)
@@ -38,7 +38,7 @@ using ReinforcementLearningCore
 @testset "Optimise" begin
     optimiser = Adam()
     model = Chain(Dense(10, 5, relu), Dense(5, 2))
-    approximator = FluxApproximator(model, optimiser)
+    approximator = FluxApproximator(model=model, optimiser=optimiser)
     target_network = TargetNetwork(approximator)
     input = rand(Float32, 10)
     grad = Flux.Zygote.gradient(target_network) do model
@@ -54,7 +54,7 @@ using ReinforcementLearningCore
 @testset "Sync" begin
     optimiser = Adam()
-    model = FluxApproximator(Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser)
+    model = FluxApproximator(model=Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser=optimiser)
     target_network = TargetNetwork(model, sync_freq=2, ρ=0.5)
     input = rand(Float32, 10)
@@ -75,9 +75,9 @@ end
     m = Chain(Dense(4,1))
     app = FluxApproximator(model = m, optimiser = Flux.Adam(), use_gpu=true)
     tn = TargetNetwork(app, sync_freq = 3, use_gpu=true)
-    @test typeof(model(tn)) == typeof(target(tn))
-    p1 = Flux.destructure(model(tn))[1]
-    pt1 = Flux.destructure(target(tn))[1]
+    @test typeof(RLCore.model(tn)) == typeof(RLCore.target(tn))
+    p1 = Flux.destructure(RLCore.model(tn))[1]
+    pt1 = Flux.destructure(RLCore.target(tn))[1]
     @test p1 == pt1
     input = gpu(ones(Float32, 4))
     grad = Flux.Zygote.gradient(tn) do model
@@ -87,16 +87,16 @@ end
     grad_model = grad[1]
     RLCore.optimise!(tn, grad_model)
-    @test p1 != Flux.destructure(model(tn))[1]
-    @test p1 == Flux.destructure(target(tn))[1]
+    @test p1 != Flux.destructure(RLCore.model(tn))[1]
+    @test p1 == Flux.destructure(RLCore.target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test p1 != Flux.destructure(model(tn))[1]
+    @test p1 != Flux.destructure(RLCore.model(tn))[1]
     @test p1 == Flux.destructure(target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test Flux.destructure(target(tn))[1] == Flux.destructure(model(tn))[1]
+    @test Flux.destructure(target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
     @test p1 != Flux.destructure(target(tn))[1]
-    p2 = Flux.destructure(model(tn))[1]
+    p2 = Flux.destructure(RLCore.model(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test p2 != Flux.destructure(model(tn))[1]
-    @test p2 == Flux.destructure(target(tn))[1]
+    @test p2 != Flux.destructure(RLCore.model(tn))[1]
+    @test p2 == Flux.destructure(RLCore.target(tn))[1]
 end
diff --git a/src/ReinforcementLearningCore/test/runtests.jl b/src/ReinforcementLearningCore/test/runtests.jl
index f3420f230..05c669ee3 100644
--- a/src/ReinforcementLearningCore/test/runtests.jl
+++ b/src/ReinforcementLearningCore/test/runtests.jl
@@ -4,9 +4,9 @@ using Preferences
 
 if Sys.isapple() && Sys.ARCH === :aarch64
     flux_uuid = UUID("587475ba-b771-5e3f-ad9e-33799f191a9c")
-    set_preferences!(flux_uuid, "gpu_backend" => "Metal")
+    # set_preferences!(flux_uuid, "gpu_backend" => "Metal")
 
-    using Metal
+    # using Metal
 else
     using CUDA, cuDNN
 end
diff --git a/src/ReinforcementLearningCore/test/utils/networks.jl b/src/ReinforcementLearningCore/test/utils/networks.jl
index f070dc75c..58b28eebf 100644
--- a/src/ReinforcementLearningCore/test/utils/networks.jl
+++ b/src/ReinforcementLearningCore/test/utils/networks.jl
@@ -22,13 +22,13 @@ import ReinforcementLearningBase: RLBase
     q_values = NN(rand(Float32, 2))
     @test size(q_values) == (3,)
-    gs = gradient(params(NN)) do
+    gs = gradient(NN) do
         sum(NN(rand(Float32, 2, 5)))
     end
-    old_params = deepcopy(collect(params(NN).params))
+    old_params = deepcopy(collect(Flux.trainable(NN).params))
     push!(NN, gs)
-    new_params = collect(params(NN).params)
+    new_params = collect(Flux.trainable(NN).params)
     @test old_params != new_params
 end
@@ -72,42 +72,40 @@ import ReinforcementLearningBase: RLBase
     end
     @testset "Correctness of gradients" begin
         @testset "One action per state" begin
-            @test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
+            @test Flux.trainable(gn).pre == gn.pre
+            @test Flux.trainable(gn).μ == gn.μ
+            @test Flux.trainable(gn).σ == gn.σ
             action_saver = Matrix[]
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(state, is_sampling = true, is_return_log_prob = true)
+            g = Flux.gradient(gn) do model
+                a, logp = model(state, is_sampling = true, is_return_log_prob = true)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 sum(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(state, only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(state, only(action_saver))
                 sum(logp)
             end
             #Check that gradients are identical
-            for (grad1, grad2) in zip(g,g2)
-                @test grad1 ≈ grad2
-            end
+            @test g == g2
         end
         @testset "Multiple actions per state" begin
            #Same with multiple actions sampled
            action_saver = []
            state = unsqueeze(state, dims = 2)
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(state, 3)
+            g1 = Flux.gradient(gn) do model
+                a, logp = model(state, 3)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 sum(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(state, only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(state, only(action_saver))
                 sum(logp)
             end
-            for (grad1, grad2) in zip(g,g2)
-                @test grad1 ≈ grad2
-            end
+            @test g1 == g2
         end
     end
 end
@@ -117,7 +115,7 @@ import ReinforcementLearningBase: RLBase
     gn = GaussianNetwork(Dense(20,15), Dense(15,10), Dense(15,10, softplus)) |> gpu
     state = rand(Float32, 20,3) |> gpu #batch of 3 states
     @testset "Forward pass compatibility" begin
-        @test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
+        @test Flux.trainable(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
         m, L = gn(state)
         @test size(m) == size(L) == (10,3)
         a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
@@ -134,15 +132,15 @@ import ReinforcementLearningBase: RLBase
     @testset "Backward pass compatibility" begin
         @testset "One action sampling" begin
            action_saver = CuMatrix[]
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
+            g = Flux.gradient(gn) do model
+                a, logp = model(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 sum(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(state, only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(state, only(action_saver))
                 sum(logp)
             end
             #Check that gradients are identical
@@ -153,15 +151,15 @@ import ReinforcementLearningBase: RLBase
         @testset "Multiple actions sampling" begin
            action_saver = []
            state = unsqueeze(state, dims = 2)
-            g = Flux.gradient(Flux.params(gn)) do
+            g = Flux.gradient(gn) do
                 a, logp = gn(CUDA.CURAND.RNG(), state, 3)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 sum(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(state, only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(state, only(action_saver))
                 sum(logp)
             end
             for (grad1, grad2) in zip(g,g2)
@@ -202,7 +200,10 @@ import ReinforcementLearningBase: RLBase
     μ = Dense(15,10)
     Σ = Dense(15,10*11÷2)
     gn = CovGaussianNetwork(pre, μ, Σ)
-    @test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
+    @test Flux.trainable(gn).pre == pre
+    @test Flux.trainable(gn).μ == μ
+    @test Flux.trainable(gn).Σ == Σ
+
     state = rand(Float32, 20,3) #batch of 3 states
     #Check that it works in 2D
     m, L = gn(state)
@@ -233,35 +234,34 @@ import ReinforcementLearningBase: RLBase
     logp_truth = [logpdf(mvn, a) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
     @test stack(logp_truth; dims=2) ≈ dropdims(logps,dims = 1) #test against ground truth
     action_saver = []
-    g = Flux.gradient(Flux.params(gn)) do
-        a, logp = gn(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
+    g1 = Flux.gradient(gn) do model
+        a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
         ChainRulesCore.ignore_derivatives() do
             push!(action_saver, a)
         end
         mean(logp)
     end
-    g2 = Flux.gradient(Flux.params(gn)) do
-        logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+    g2 = Flux.gradient(gn) do model
+        logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
         mean(logp)
     end
-    for (grad1, grad2) in zip(g,g2)
-        @test grad1 ≈ grad2
-    end
+    @test g1 == g2
+
     empty!(action_saver)
-    g3 = Flux.gradient(Flux.params(gn)) do
-        a, logp = gn(Flux.unsqueeze(state,dims = 2), 3)
+
+    g3 = Flux.gradient(gn) do model
+        a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
         ChainRulesCore.ignore_derivatives() do
             push!(action_saver, a)
         end
         mean(logp)
     end
-    g4 = Flux.gradient(Flux.params(gn)) do
-        logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+    g4 = Flux.gradient(gn) do model
+        logp = model(Flux.unsqueeze(state, dims = 2), only(action_saver))
        mean(logp)
     end
-    for (grad1, grad2) in zip(g4,g3)
-        @test grad1 ≈ grad2
-    end
+
+    @test g4 == g3
 end
 @testset "CUDA" begin
     if (@isdefined CUDA) && CUDA.functional()
@@ -271,7 +271,7 @@ import ReinforcementLearningBase: RLBase
     μ = Dense(15,10) |> gpu
     Σ = Dense(15,10*11÷2) |> gpu
     gn = CovGaussianNetwork(pre, μ, Σ)
-    @test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
+    @test Flux.trainable(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
     state = rand(Float32, 20,3)|> gpu #batch of 3 states
     m, L = gn(Flux.unsqueeze(state,dims = 2))
     @test size(m) == (10,1,3)
@@ -292,31 +292,31 @@ import ReinforcementLearningBase: RLBase
     logp_truth = [logpdf(mvn, cpu(a)) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
     @test reduce(hcat, collect(logp_truth)) ≈ dropdims(cpu(logps); dims=1) #test against ground truth
     action_saver = []
-    g = Flux.gradient(Flux.params(gn)) do
-        a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
+    g = Flux.gradient(gn) do model
+        a, logp = model(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
         ChainRulesCore.ignore_derivatives() do
             push!(action_saver, a)
         end
         mean(logp)
     end
-    g2 = Flux.gradient(Flux.params(gn)) do
-        logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+    g2 = Flux.gradient(gn) do model
+        logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
         mean(logp)
     end
     for (grad1, grad2) in zip(g,g2)
         @test grad1 ≈ grad2
     end
     empty!(action_saver)
-    g3 = Flux.gradient(Flux.params(gn)) do
-        a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), 3)
+    g3 = Flux.gradient(gn) do model
+        a, logp = model(rng, Flux.unsqueeze(state,dims = 2), 3)
         ChainRulesCore.ignore_derivatives() do
             push!(action_saver, a)
         end
         mean(logp)
     end
-    g4 = Flux.gradient(Flux.params(gn)) do
-        logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+    g4 = Flux.gradient(gn) do model
+        logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
         mean(logp)
     end
     for (grad1, grad2) in zip(g4,g3)
diff --git a/src/ReinforcementLearningFarm/Project.toml b/src/ReinforcementLearningFarm/Project.toml
index fd2c22bce..50297f670 100644
--- a/src/ReinforcementLearningFarm/Project.toml
+++ b/src/ReinforcementLearningFarm/Project.toml
@@ -13,7 +13,7 @@ ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"
 [compat]
 FillArrays = "1"
-Flux = "0.14"
+Flux = "0.14, 0.15, 0.16"
 CircularArrayBuffers = "0.1.12"
 Distributions = "0.25"
 ReinforcementLearning = "0.11"

From de3652d9794b089253e6acd535118ccfe4f4b7f7 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:04:06 +0100
Subject: [PATCH 02/13] Refactor target network optimization and update test assertions for consistency

---
 .../src/policies/learners/target_network.jl          | 4 ++--
 .../test/policies/learners/target_network.jl         | 2 +-
 src/ReinforcementLearningCore/test/runtests.jl       | 4 ++--
 src/ReinforcementLearningCore/test/utils/networks.jl | 2 --
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/ReinforcementLearningCore/src/policies/learners/target_network.jl b/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
index e673e5b51..1178ce527 100644
--- a/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
+++ b/src/ReinforcementLearningCore/src/policies/learners/target_network.jl
@@ -74,12 +74,12 @@ function RLBase.optimise!(tn::TargetNetwork, grad::NamedTuple)
     tn.n_optimise += 1
 
     if tn.n_optimise % tn.sync_freq == 0
-        # polyak averaging
-        zip(Flux.params(RLCore.model(tn)), Flux.params(RLCore.target(tn)))
+        # Polyak averaging
         src_layers = RLCore.model(tn)
         dest_layers = RLCore.target(tn)
         for i in 1:length(src_layers)
             dest_layers[i].weight .= tn.ρ .* dest_layers[i].weight .+ (1 - tn.ρ) .* src_layers[i].weight
+            dest_layers[i].bias .= tn.ρ .* dest_layers[i].bias .+ (1 - tn.ρ) .* src_layers[i].bias
         end
         tn.n_optimise = 0
     end
diff --git a/src/ReinforcementLearningCore/test/policies/learners/target_network.jl b/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
index c95e025dd..e4f05396b 100644
--- a/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
+++ b/src/ReinforcementLearningCore/test/policies/learners/target_network.jl
@@ -93,7 +93,7 @@ end
     @test p1 != Flux.destructure(RLCore.model(tn))[1]
     @test p1 == Flux.destructure(target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test Flux.destructure(target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
+    @test Flux.destructure(RLCore.target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
     @test p1 != Flux.destructure(target(tn))[1]
     p2 = Flux.destructure(RLCore.model(tn))[1]
     RLCore.optimise!(tn, grad_model)
diff --git a/src/ReinforcementLearningCore/test/runtests.jl b/src/ReinforcementLearningCore/test/runtests.jl
index 05c669ee3..f3420f230 100644
--- a/src/ReinforcementLearningCore/test/runtests.jl
+++ b/src/ReinforcementLearningCore/test/runtests.jl
@@ -4,9 +4,9 @@ using Preferences
 
 if Sys.isapple() && Sys.ARCH === :aarch64
     flux_uuid = UUID("587475ba-b771-5e3f-ad9e-33799f191a9c")
-    # set_preferences!(flux_uuid, "gpu_backend" => "Metal")
+    set_preferences!(flux_uuid, "gpu_backend" => "Metal")
 
-    # using Metal
+    using Metal
 else
     using CUDA, cuDNN
 end
diff --git a/src/ReinforcementLearningCore/test/utils/networks.jl b/src/ReinforcementLearningCore/test/utils/networks.jl
index 58b28eebf..d078928b7 100644
--- a/src/ReinforcementLearningCore/test/utils/networks.jl
+++ b/src/ReinforcementLearningCore/test/utils/networks.jl
@@ -115,7 +115,6 @@ import ReinforcementLearningBase: RLBase
     gn = GaussianNetwork(Dense(20,15), Dense(15,10), Dense(15,10, softplus)) |> gpu
     state = rand(Float32, 20,3) |> gpu #batch of 3 states
     @testset "Forward pass compatibility" begin
-        @test Flux.trainable(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
         m, L = gn(state)
         @test size(m) == size(L) == (10,3)
         a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
@@ -271,7 +270,7 @@ import ReinforcementLearningBase: RLBase
     μ = Dense(15,10) |> gpu
     Σ = Dense(15,10*11÷2) |> gpu
     gn = CovGaussianNetwork(pre, μ, Σ)
-    @test Flux.trainable(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
     state = rand(Float32, 20,3)|> gpu #batch of 3 states
     m, L = gn(Flux.unsqueeze(state,dims = 2))
     @test size(m) == (10,1,3)

From b0e7e85558edb93f5747190caec502bdd7ef9cc4 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:08:54 +0100
Subject: [PATCH 03/13] Simplify FluxApproximator's optimise! method by using a single-line function definition

---
 .../src/policies/learners/flux_approximator.jl | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl b/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
index 6657e6bc2..a6f0cb5b9 100644
--- a/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
+++ b/src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl
@@ -43,8 +43,5 @@ Flux.@layer FluxApproximator trainable=(model,)
 forward(A::FluxApproximator, args...; kwargs...) = A.model(args...; kwargs...)
 forward(A::FluxApproximator, env::E, player::AbstractPlayer=current_player(env)) where {E <: AbstractEnv} = env |> (x -> state(x, player)) |> (x -> forward(A, x))
 
-function RLBase.optimise!(A::FluxApproximator, grad::NamedTuple)
-
-    Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
-end
+RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) = Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)

From eb146cddd9ff16358a7ccb9f06b37203e37369ce Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:09:30 +0100
Subject: [PATCH 04/13] Bump version to 0.15.4 in Project.toml

---
 src/ReinforcementLearningCore/Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ReinforcementLearningCore/Project.toml b/src/ReinforcementLearningCore/Project.toml
index 5209c0562..70fc380da 100644
--- a/src/ReinforcementLearningCore/Project.toml
+++ b/src/ReinforcementLearningCore/Project.toml
@@ -1,6 +1,6 @@
 name = "ReinforcementLearningCore"
 uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
-version = "0.15.3"
+version = "0.15.4"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"

From b0c3b9a8c3beb0022f9475ee549b2a5678b6e527 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:10:08 +0100
Subject: [PATCH 05/13] Update NEWS.md for v0.15.4: Upgrade Flux.jl to v0.16 and resolve deprecation warnings

---
 src/ReinforcementLearningCore/NEWS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ReinforcementLearningCore/NEWS.md b/src/ReinforcementLearningCore/NEWS.md
index 472e6e2d9..a44325896 100644
--- a/src/ReinforcementLearningCore/NEWS.md
+++ b/src/ReinforcementLearningCore/NEWS.md
@@ -1,5 +1,9 @@
 # ReinforcementLearningCore.jl Release Notes
 
+#### v0.15.4
+
+- Update `Flux.jl` to `v0.16` and fix deprecation warnings and method errors
+
 #### v0.15.3
 
 - Make `FluxApproximator` work with `QBasedPolicy`

From a8d7516bb8de68c301fac6e06cd629a9b0b94770 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:44:20 +0100
Subject: [PATCH 06/13] Add Conda dependency and update test environment setup

---
 src/ReinforcementLearningEnvironments/test/runtests.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/ReinforcementLearningEnvironments/test/runtests.jl b/src/ReinforcementLearningEnvironments/test/runtests.jl
index 80bb8fe8a..e9104ec46 100644
--- a/src/ReinforcementLearningEnvironments/test/runtests.jl
+++ b/src/ReinforcementLearningEnvironments/test/runtests.jl
@@ -14,6 +14,9 @@ using TimerOutputs
 using Conda
 using JLD2
 
+ENV["CONDA_JL_USE_MINIFORGE"] = "1"
+
+Conda.add("python")
 Conda.add("gym")
 Conda.add("numpy")

From ad386976236507816e70aa2372e47c4d85587f8f Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 20:27:13 +0100
Subject: [PATCH 07/13] Update test environment setup to use pip for gym installation

---
 src/ReinforcementLearningEnvironments/test/runtests.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ReinforcementLearningEnvironments/test/runtests.jl b/src/ReinforcementLearningEnvironments/test/runtests.jl
index e9104ec46..32ea608b7 100644
--- a/src/ReinforcementLearningEnvironments/test/runtests.jl
+++ b/src/ReinforcementLearningEnvironments/test/runtests.jl
@@ -16,8 +16,8 @@ using JLD2
 
 ENV["CONDA_JL_USE_MINIFORGE"] = "1"
 
-Conda.add("python")
-Conda.add("gym")
+Conda.pip_interop(true, env)
+Conda.pip("install", "gym")
 Conda.add("numpy")
 
 @testset "ReinforcementLearningEnvironments" begin

From f30a493f8e1d701bd06c94ef059d273604650601 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 23:14:06 +0100
Subject: [PATCH 08/13] Fix RLEnv tests

---
 src/ReinforcementLearningEnvironments/test/runtests.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/ReinforcementLearningEnvironments/test/runtests.jl b/src/ReinforcementLearningEnvironments/test/runtests.jl
index 32ea608b7..6c1d1b945 100644
--- a/src/ReinforcementLearningEnvironments/test/runtests.jl
+++ b/src/ReinforcementLearningEnvironments/test/runtests.jl
@@ -16,9 +16,11 @@ using JLD2
 
 ENV["CONDA_JL_USE_MINIFORGE"] = "1"
 
-Conda.pip_interop(true, env)
-Conda.pip("install", "gym")
-Conda.add("numpy")
+Conda.add("python", Conda.ROOTENV)
+Conda.add("numpy", Conda.ROOTENV)
+Conda.pip_interop(true, Conda.ROOTENV)
+Conda.pip("install", "gym", Conda.ROOTENV)
+
 
 @testset "ReinforcementLearningEnvironments" begin
     include("environments/environments.jl")

From 8b141f1b17060726e9d5484acb6b4f6ae02c6a12 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 23:18:32 +0100
Subject: [PATCH 09/13] Fix optimizer reference in stock trading environment example

---
 .../test/environments/examples/stock_trading_env.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
index 9b0866b69..52c53a8f8 100644
--- a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
+++ b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
@@ -26,7 +26,7 @@ end
                 Dense(ns, 64, relu),
                 Dense(64, na, relu),
             ),
-            Flux.Optimise.Optimiser(ClipNorm(0.5), ADAM(1e-5)),
+            Flux.Optimise.Optimiser(ClipNorm(0.5), Flux.Adam(1e-5)),
         ),
         explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
     ),

From 9b3d48c37a08499483dde9e2e0bc78f8b061a320 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Tue, 17 Dec 2024 23:34:59 +0100
Subject: [PATCH 10/13] Fix optimizer reference in stock trading environment example

---
 .../test/environments/examples/stock_trading_env.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
index 52c53a8f8..f567fd66e 100644
--- a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
+++ b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
@@ -26,7 +26,7 @@ end
                 Dense(ns, 64, relu),
                 Dense(64, na, relu),
             ),
-            Flux.Optimise.Optimiser(ClipNorm(0.5), Flux.Adam(1e-5)),
+            Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-5)),
         ),
         explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
     ),

From 3ec596e5dee4c6c8fc9c043471bd7648db98c2e7 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Wed, 18 Dec 2024 00:11:57 +0100
Subject: [PATCH 11/13] Refactor optimizer implementation in DDPGPolicy to use OptimiserChain

---
 docs/homepage/blog/ospp_report_210370190/index.md | 4 ++--
 .../test/environments/examples/stock_trading_env.jl | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/homepage/blog/ospp_report_210370190/index.md b/docs/homepage/blog/ospp_report_210370190/index.md
index 688e9428a..3a7fac3e3 100644
--- a/docs/homepage/blog/ospp_report_210370190/index.md
+++ b/docs/homepage/blog/ospp_report_210370190/index.md
@@ -491,11 +491,11 @@ create_critic(critic_dim) = Chain(
 create_policy(player) = DDPGPolicy(
     behavior_actor = NeuralNetworkApproximator(
         model = create_actor(player),
-        optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
+        optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
     ),
     behavior_critic = NeuralNetworkApproximator(
         model = create_critic(critic_dim),
-        optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
+        optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
     ),
     target_actor = NeuralNetworkApproximator(
         model = create_actor(player),
diff --git a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
index f567fd66e..2c99a2f74 100644
--- a/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
+++ b/src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl
@@ -26,7 +26,7 @@ end
                 Dense(ns, 64, relu),
                 Dense(64, na, relu),
             ),
-            Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-5)),
+            OptimiserChain(ClipNorm(0.5), Adam(1e-5)),
         ),
         explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
     ),

From 573fb1cef8c783668b9aaa5ad12180de607730b4 Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:30:47 +0100
Subject: [PATCH 12/13] Refactor UnicodePlots integration into extension

---
 src/ReinforcementLearningCore/Project.toml     |  9 ++++++---
 .../ext/UnicodePlotsExt.jl                     | 18 ++++++++++++++++++
 .../src/core/hooks.jl                          | 15 ---------------
 src/ReinforcementLearningCore/test/runtests.jl |  1 +
 4 files changed, 25 insertions(+), 18 deletions(-)
 create mode 100644 src/ReinforcementLearningCore/ext/UnicodePlotsExt.jl

diff --git a/src/ReinforcementLearningCore/Project.toml b/src/ReinforcementLearningCore/Project.toml
index 70fc380da..e68d17158 100644
--- a/src/ReinforcementLearningCore/Project.toml
+++ b/src/ReinforcementLearningCore/Project.toml
@@ -4,7 +4,6 @@ version = "0.15.4"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
-Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 CircularArrayBuffers = "9de3a189-e0c0-4e15-ba3b-b14b9fb0aec1"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
@@ -21,11 +20,15 @@ ReinforcementLearningTrajectories = "6486599b-a3cd-4e92-a99a-2cea90cc8c3c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+
+[weakdeps]
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
 
+[extensions]
+UnicodePlotsExt = "UnicodePlots"
+
 [compat]
 AbstractTrees = "0.3, 0.4"
-Adapt = "3, 4"
 ChainRulesCore = "1"
 CircularArrayBuffers = "0.1.12"
 Crayons = "4"
@@ -57,4 +60,4 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "Test", "UUIDs"]
+test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "Test", "UnicodePlots", "UUIDs"]
diff --git a/src/ReinforcementLearningCore/ext/UnicodePlotsExt.jl b/src/ReinforcementLearningCore/ext/UnicodePlotsExt.jl
new file mode 100644
index 000000000..a2511e2df
--- /dev/null
+++ b/src/ReinforcementLearningCore/ext/UnicodePlotsExt.jl
@@ -0,0 +1,18 @@
+module UnicodePlotsExt
+    using ReinforcementLearningCore
+    using UnicodePlots: lineplot, lineplot!
+
+    function Base.show(io::IO, hook::TotalRewardPerEpisode{true, F}) where {F<:Number}
+        if length(hook.rewards) > 0
+            println(io, lineplot(
+                hook.rewards,
+                title="Total reward per episode",
+                xlabel="Episode",
+                ylabel="Score",
+            ))
+        else
+            println(io, typeof(hook))
+        end
+        return
+    end
+end
diff --git a/src/ReinforcementLearningCore/src/core/hooks.jl b/src/ReinforcementLearningCore/src/core/hooks.jl
index 91500b7b9..a8ecae986 100644
--- a/src/ReinforcementLearningCore/src/core/hooks.jl
+++ b/src/ReinforcementLearningCore/src/core/hooks.jl
@@ -10,7 +10,6 @@ export AbstractHook,
     DoEveryNSteps,
     DoOnExit
 
-using UnicodePlots: lineplot, lineplot!
 using Statistics: mean, std
 using CircularArrayBuffers: CircularVectorBuffer
 import ReinforcementLearningBase: RLBase
@@ -172,20 +171,6 @@ function Base.push!(hook::TotalRewardPerEpisode,
     return
 end
 
-function Base.show(io::IO, hook::TotalRewardPerEpisode{true, F}) where {F<:Number}
-    if length(hook.rewards) > 0
-        println(io, lineplot(
-            hook.rewards,
-            title="Total reward per episode",
-            xlabel="Episode",
-            ylabel="Score",
-        ))
-    else
-        println(io, typeof(hook))
-    end
-    return
-end
-
 function Base.push!(hook::TotalRewardPerEpisode{true, F},
     ::PostExperimentStage,
     agent::AbstractPolicy,
diff --git a/src/ReinforcementLearningCore/test/runtests.jl b/src/ReinforcementLearningCore/test/runtests.jl
index f3420f230..ca71c8686 100644
--- a/src/ReinforcementLearningCore/test/runtests.jl
+++ b/src/ReinforcementLearningCore/test/runtests.jl
@@ -1,4 +1,5 @@
 using Test
+using UnicodePlots
 using UUIDs
 using Preferences

From 90f5a74801d67cd905d6e1b794136c973cd35c2c Mon Sep 17 00:00:00 2001
From: Jeremiah Lewis <4462211+jeremiahpslewis@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:32:35 +0100
Subject: [PATCH 13/13] Move UnicodePlots to package extension in release notes

---
 src/ReinforcementLearningCore/NEWS.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ReinforcementLearningCore/NEWS.md b/src/ReinforcementLearningCore/NEWS.md
index a44325896..9265d749c 100644
--- a/src/ReinforcementLearningCore/NEWS.md
+++ b/src/ReinforcementLearningCore/NEWS.md
@@ -1,5 +1,9 @@
 # ReinforcementLearningCore.jl Release Notes
 
+#### v0.15.5
+
+- Move `UnicodePlots` to package extension
+
 #### v0.15.4
 
 - Update `Flux.jl` to `v0.16` and fix deprecation warnings and method errors