From d660085c7f43663f24f3d7492a44d71a80bcd8d4 Mon Sep 17 00:00:00 2001 From: Mytolo Date: Mon, 17 Jul 2023 18:23:45 +0200 Subject: [PATCH] added experiments to test adjusted tests and merged main --- .../Project.toml | 6 +- .../experiments/MARL/DQN_mpe_simple.jl | 4 +- .../experiments/MARL/IDQN_TicTacToe.jl | 72 +++++++++++++++++-- .../src/ReinforcementLearningExperiments.jl | 5 ++ .../test/runtests.jl | 6 ++ 5 files changed, 85 insertions(+), 8 deletions(-) rename src/ReinforcementLearningExperiments/{src => deps/experiments}/experiments/MARL/DQN_mpe_simple.jl (96%) rename src/ReinforcementLearningExperiments/{src => deps/experiments}/experiments/MARL/IDQN_TicTacToe.jl (56%) diff --git a/src/ReinforcementLearningExperiments/Project.toml b/src/ReinforcementLearningExperiments/Project.toml index 5f41135fc..50039e4b0 100644 --- a/src/ReinforcementLearningExperiments/Project.toml +++ b/src/ReinforcementLearningExperiments/Project.toml @@ -6,12 +6,14 @@ version = "0.3.1" [deps] Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44" ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6" ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921" ReinforcementLearningZoo = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" Weave = "44d3d7a6-8a23-5bf8-98c5-b353f8df5ec9" @@ -29,7 +31,9 @@ julia = "1.9" [extras] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["CUDA", "Test"] +test = ["CUDA", "PyCall", "Test"] diff --git a/src/ReinforcementLearningExperiments/src/experiments/MARL/DQN_mpe_simple.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/DQN_mpe_simple.jl similarity index 96% rename from src/ReinforcementLearningExperiments/src/experiments/MARL/DQN_mpe_simple.jl rename to src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/DQN_mpe_simple.jl index 2c3363df9..3bef9d9a2 100644 --- a/src/ReinforcementLearningExperiments/src/experiments/MARL/DQN_mpe_simple.jl +++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/DQN_mpe_simple.jl @@ -17,14 +17,14 @@ using Flux.Losses: huber_loss function RLCore.Experiment( ::Val{:JuliaRL}, ::Val{:DQN}, - ::Val{:MPESimple}; + ::Val{:MPESimple}, seed=123, n=1, γ=0.99f0, is_enable_double_DQN=true ) rng = StableRNG(seed) - env = discrete2standard_discrete(PettingzooEnv("mpe.simple_v2"; seed=seed)) + env = discrete2standard_discrete(PettingZooEnv("mpe.simple_v2"; seed=seed)) ns, na = length(state(env)), length(action_space(env)) agent = Agent( diff --git a/src/ReinforcementLearningExperiments/src/experiments/MARL/IDQN_TicTacToe.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/IDQN_TicTacToe.jl similarity index 56% rename from src/ReinforcementLearningExperiments/src/experiments/MARL/IDQN_TicTacToe.jl rename to src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/IDQN_TicTacToe.jl index 027108256..30353b2f9 100644 --- a/src/ReinforcementLearningExperiments/src/experiments/MARL/IDQN_TicTacToe.jl +++ 
b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/IDQN_TicTacToe.jl @@ -7,7 +7,6 @@ # --- using StableRNGs -using ReinforcementLearning using ReinforcementLearningBase using ReinforcementLearningZoo using ReinforcementLearningCore @@ -16,7 +15,6 @@ using Flux using Flux.Losses: huber_loss using Flux: glorot_uniform -using ProgressMeter rng = StableRNG(1234) @@ -25,6 +23,71 @@ cap = 100 RLCore.forward(L::DQNLearner, state::A) where {A <: Real} = RLCore.forward(L, [state]) + +episodes_per_step = 25 + +function RLCore.Experiment( + ::Val{:JuliaRL}, + ::Val{:IDQN}, + ::Val{:TicTacToe}, + seed=123, + n=1, + γ=0.99f0, + is_enable_double_DQN=true +) + rng = StableRNG(seed) + create_policy() = QBasedPolicy( + learner=DQNLearner( + approximator=Approximator( + model=TwinNetwork( + Chain( + Dense(1, 512, relu; init=glorot_uniform(rng)), + Dense(512, 256, relu; init=glorot_uniform(rng)), + Dense(256, 9; init=glorot_uniform(rng)), + ); + sync_freq=100 + ), + optimiser=Adam(), + ), + n=n, + γ=γ, + is_enable_double_DQN=is_enable_double_DQN, + loss_func=huber_loss, + rng=rng, + ), + explorer=EpsilonGreedyExplorer( + kind=:exp, + ϵ_stable=0.01, + decay_steps=500, + rng=rng, + ), + ) + + e = TicTacToeEnv(); + m = MultiAgentPolicy(NamedTuple((player => + Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng), + Trajectory( + container=CircularArraySARTTraces( + capacity=cap, + state=Integer => (1,), + ), + sampler=NStepBatchSampler{SS′ART}( + n=n, + γ=γ, + batch_size=1, + rng=rng + ), + controller=InsertSampleRatioController( + threshold=1, + n_inserted=0 + )) + ) + for player in players(e))) + ); + hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e)))) + Experiment(m, e, StopAfterEpisode(episodes_per_step), hooks) +end + create_policy() = QBasedPolicy( learner=DQNLearner( approximator=Approximator( @@ -36,7 +99,7 @@ create_policy() = QBasedPolicy( ); sync_freq=100 ), - optimiser=ADAM(), + optimiser=Adam(), ), n=32, γ=0.99f0, @@ -75,9 +138,8 @@ m = MultiAgentPolicy(NamedTuple((player => ); hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e)))) -episodes_per_step = 25 win_rates = (Cross=Float64[], Nought=Float64[]) -@showprogress for i ∈ 1:2 +for i ∈ 1:2 run(m, e, StopAfterEpisode(episodes_per_step; is_show_progress=false), hooks) wr_cross = sum(hooks[:Cross].rewards)/(i*episodes_per_step) wr_nought = sum(hooks[:Nought].rewards)/(i*episodes_per_step) diff --git a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl index ca27a1031..7efab0bac 100644 --- a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl +++ b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl @@ -1,6 +1,7 @@ module ReinforcementLearningExperiments using Reexport +using Requires @reexport using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo @@ -19,6 +20,10 @@ include(joinpath(EXPERIMENTS_DIR, "JuliaRL_Rainbow_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_VPG_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_TRPO_CartPole.jl")) include(joinpath(EXPERIMENTS_DIR, "JuliaRL_MPO_CartPole.jl")) +include(joinpath(EXPERIMENTS_DIR, "IDQN_TicTacToe.jl")) +@require PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" include( + joinpath(EXPERIMENTS_DIR, "DQN_mpe_simple.jl") +) # dynamic loading environments function __init__() end diff --git 
a/src/ReinforcementLearningExperiments/test/runtests.jl b/src/ReinforcementLearningExperiments/test/runtests.jl index 1086313e0..eae611c97 100644 --- a/src/ReinforcementLearningExperiments/test/runtests.jl +++ b/src/ReinforcementLearningExperiments/test/runtests.jl @@ -1,6 +1,10 @@ using ReinforcementLearningExperiments using CUDA +using Requires + + + CUDA.allowscalar(false) run(E`JuliaRL_NFQ_CartPole`) @@ -15,6 +19,8 @@ run(E`JuliaRL_VPG_CartPole`) run(E`JuliaRL_MPODiscrete_CartPole`) run(E`JuliaRL_MPOContinuous_CartPole`) run(E`JuliaRL_MPOCovariance_CartPole`) +@require PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" run(E`JuliaRL_DQN_MPESimple`) +run(E`JuliaRL_IDQN_TicTacToe`) # run(E`JuliaRL_BC_CartPole`) # run(E`JuliaRL_VMPO_CartPole`) # run(E`JuliaRL_BasicDQN_MountainCar`)
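
For reviewers who want to try the new experiments locally, below is a minimal usage sketch; it is not part of the patch above. It assumes the patch has been applied, and, for the MPE experiment, that PyCall plus the Python pettingzoo package are installed so that the @require-guarded include actually defines the experiment. The experiment names come from the E`...` string macro already used in runtests.jl.

# Minimal usage sketch (assumption: this patch is applied; not part of the diff).
using ReinforcementLearningExperiments

# The IDQN TicTacToe experiment is included unconditionally in
# ReinforcementLearningExperiments.jl, so it should be runnable directly:
run(E`JuliaRL_IDQN_TicTacToe`)

# The MPE experiment is only included behind the @require PyCall guard.
# Assumption: PyCall and the Python `pettingzoo` package are installed, and
# loading PyCall triggers the guarded include before the macro is used.
# import PyCall
# run(E`JuliaRL_DQN_MPESimple`)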