diff --git a/src/ReinforcementLearningExperiments/Project.toml b/src/ReinforcementLearningExperiments/Project.toml
index a19d3ab64..b8e760d37 100644
--- a/src/ReinforcementLearningExperiments/Project.toml
+++ b/src/ReinforcementLearningExperiments/Project.toml
@@ -8,12 +8,15 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
 IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
 ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
 ReinforcementLearningZoo = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
+Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Weave = "44d3d7a6-8a23-5bf8-98c5-b353f8df5ec9"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
@@ -35,7 +38,9 @@ julia = "1.9"
 
 [extras]
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
+PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
+Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["CUDA", "Test"]
+test = ["CUDA", "PyCall", "Test"]
diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/DQN_mpe_simple.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/DQN_mpe_simple.jl
new file mode 100644
index 000000000..7d832ac68
--- /dev/null
+++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/DQN_mpe_simple.jl
@@ -0,0 +1,15 @@
+# ---
+# title: JuliaRL\_DQN\_MPESimple
+# cover:
+# description: DQN applied to MPE simple
+# date: 2023-02-01
+# author: "[Panajiotis Keßler](mailto:panajiotis@christoforidis.net)"
+# ---
+
+
+using Plots
+ex = E`JuliaRL_DQN_MPESimple`
+run(ex)
+plot(ex.hook.rewards)
+savefig("JuliaRL_DQN_MPESimple.png")
+
diff --git a/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/IDQN_TicTacToe.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/IDQN_TicTacToe.jl
new file mode 100644
index 000000000..30353b2f9
--- /dev/null
+++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/IDQN_TicTacToe.jl
@@ -0,0 +1,158 @@
+# ---
+# title: JuliaRL\_IDQN\_TicTacToe
+# cover:
+# description: IDQN applied to TicTacToe competitive
+# date: 2023-07-03
+# author: "[Panajiotis Keßler](mailto:panajiotis.kessler@gmail.com)"
+# ---
+
+using StableRNGs
+using ReinforcementLearningBase
+using ReinforcementLearningZoo
+using ReinforcementLearningCore
+using Plots
+using Flux
+using Flux.Losses: huber_loss
+using Flux: glorot_uniform
+
+
+
+rng = StableRNG(1234)
+
+cap = 100
+
+RLCore.forward(L::DQNLearner, state::A) where {A <: Real} = RLCore.forward(L, [state])
+
+
+episodes_per_step = 25
+
+function RLCore.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:IDQN},
+    ::Val{:TicTacToe},
+    seed=123,
+    n=1,
+    γ=0.99f0,
+    is_enable_double_DQN=true
+)
+    rng = StableRNG(seed)
+    create_policy() = QBasedPolicy(
+        learner=DQNLearner(
+            approximator=Approximator(
+                model=TwinNetwork(
+                    Chain(
+                        Dense(1, 512, relu; init=glorot_uniform(rng)),
+                        Dense(512, 256, relu; init=glorot_uniform(rng)),
+                        Dense(256, 9; init=glorot_uniform(rng)),
+                    );
+                    sync_freq=100
+                ),
+                optimiser=Adam(),
+            ),
+            n=n,
+            γ=γ,
+            is_enable_double_DQN=is_enable_double_DQN,
+            loss_func=huber_loss,
+            rng=rng,
+        ),
+        explorer=EpsilonGreedyExplorer(
+            kind=:exp,
+            ϵ_stable=0.01,
+            decay_steps=500,
+            rng=rng,
+        ),
+    )
+
+    e = TicTacToeEnv();
+    m = MultiAgentPolicy(NamedTuple((player =>
+        Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng),
+            Trajectory(
+                container=CircularArraySARTTraces(
+                    capacity=cap,
+                    state=Integer => (1,),
+                ),
+                sampler=NStepBatchSampler{SS′ART}(
+                    n=n,
+                    γ=γ,
+                    batch_size=1,
+                    rng=rng
+                ),
+                controller=InsertSampleRatioController(
+                    threshold=1,
+                    n_inserted=0
+                ))
+        )
+        for player in players(e)))
+    );
+    hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e))))
+    Experiment(m, e, StopAfterEpisode(episodes_per_step), hooks)
+end
+
+create_policy() = QBasedPolicy(
+    learner=DQNLearner(
+        approximator=Approximator(
+            model=TwinNetwork(
+                Chain(
+                    Dense(1, 512, relu; init=glorot_uniform(rng)),
+                    Dense(512, 256, relu; init=glorot_uniform(rng)),
+                    Dense(256, 9; init=glorot_uniform(rng)),
+                );
+                sync_freq=100
+            ),
+            optimiser=Adam(),
+        ),
+        n=32,
+        γ=0.99f0,
+        is_enable_double_DQN=true,
+        loss_func=huber_loss,
+        rng=rng,
+    ),
+    explorer=EpsilonGreedyExplorer(
+        kind=:exp,
+        ϵ_stable=0.01,
+        decay_steps=500,
+        rng=rng,
+    ),
+)
+
+e = TicTacToeEnv();
+m = MultiAgentPolicy(NamedTuple((player =>
+    Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng),
+        Trajectory(
+            container=CircularArraySARTTraces(
+                capacity=cap,
+                state=Integer => (1,),
+            ),
+            sampler=NStepBatchSampler{SS′ART}(
+                n=1,
+                γ=0.99f0,
+                batch_size=1,
+                rng=rng
+            ),
+            controller=InsertSampleRatioController(
+                threshold=1,
+                n_inserted=0
+            ))
+    )
+    for player in players(e)))
+);
+hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e))))
+
+win_rates = (Cross=Float64[], Nought=Float64[])
+for i ∈ 1:2
+    run(m, e, StopAfterEpisode(episodes_per_step; is_show_progress=false), hooks)
+    wr_cross = sum(hooks[:Cross].rewards)/(i*episodes_per_step)
+    wr_nought = sum(hooks[:Nought].rewards)/(i*episodes_per_step)
+    push!(win_rates[:Cross], wr_cross)
+    push!(win_rates[:Nought], wr_nought)
+end
+p1 = plot([win_rates[:Cross] win_rates[:Nought]], labels=["Cross" "Nought"])
+xlabel!("Iteration steps of $episodes_per_step episodes")
+ylabel!("Win rate of the player")
+
+p2 = plot([hooks[:Cross].rewards hooks[:Nought].rewards], labels=["Cross" "Nought"])
+xlabel!("Overall episodes")
+ylabel!("Rewards of the players")
+
+p = plot(p1, p2, layout=(2,1), size=[1000,1000])
+savefig("TTT_CROSS_DQN_NOUGHT_RANDOM.png")
diff --git a/src/ReinforcementLearningExperiments/src/experiments/MARL/DQN_mpe_simple.jl b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/pettingzoo_ex.jl
similarity index 82%
rename from src/ReinforcementLearningExperiments/src/experiments/MARL/DQN_mpe_simple.jl
rename to src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/pettingzoo_ex.jl
index 2ce078a89..d69b975e4 100644
--- a/src/ReinforcementLearningExperiments/src/experiments/MARL/DQN_mpe_simple.jl
+++ b/src/ReinforcementLearningExperiments/deps/experiments/experiments/MARL/pettingzoo_ex.jl
@@ -1,13 +1,5 @@
-# ---
-# title: JuliaRL\_DQN\_MPESimple
-# cover:
-# description: DQN applied to MPE simple
-# date: 2023-02-01
-# author: "[Panajiotis Keßler](mailto:panajiotis@christoforidis.net)"
-# ---
-
 using PyCall
-using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo
+using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningEnvironments
 using Flux
 using Flux: glorot_uniform
 
@@ -17,14 +9,14 @@ using Flux.Losses: huber_loss
 function RLCore.Experiment(
     ::Val{:JuliaRL},
     ::Val{:DQN},
-    ::Val{:MPESimple};
+    ::Val{:MPESimple},
     seed=123,
     n=1,
     γ=0.99f0,
     is_enable_double_DQN=true
 )
     rng = StableRNG(seed)
-    env = discrete2standard_discrete(PettingzooEnv("mpe.simple_v2"; seed=seed))
+    env = discrete2standard_discrete(PettingZooEnv("mpe.simple_v2"; seed=seed))
     ns, na = length(state(env)), length(action_space(env))
 
     agent = Agent(
@@ -74,11 +66,4 @@ function RLCore.Experiment(
     stop_condition = StopAfterEpisode(150, is_show_progress=!haskey(ENV, "CI"))
     hook = TotalRewardPerEpisode()
     Experiment(agent, env, stop_condition, hook)
-end
-
-using Plots
-ex = E`JuliaRL_DQN_MPESimple`
-run(ex)
-plot(ex.hook.rewards)
-savefig("JuliaRL_DQN_MPESimple.png")
-
+end
\ No newline at end of file
diff --git a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl
index 4c903691b..38a14c658 100644
--- a/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl
+++ b/src/ReinforcementLearningExperiments/src/ReinforcementLearningExperiments.jl
@@ -1,11 +1,13 @@
 module ReinforcementLearningExperiments
 
 using Reexport
+using Pkg
+using Requires
 
-@reexport using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo
+@reexport using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningEnvironments
 
 const EXPERIMENTS_DIR = joinpath(@__DIR__, "experiments")
-
+# As long as there are non-working experiments in this directory, this automatic include does not work properly:
 # for f in readdir(EXPERIMENTS_DIR)
 #     include(joinpath(EXPERIMENTS_DIR, f))
 # end
@@ -21,10 +23,16 @@ include(joinpath(EXPERIMENTS_DIR, "JuliaRL_Rainbow_CartPole.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_VPG_CartPole.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_TRPO_CartPole.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_MPO_CartPole.jl"))
+include(joinpath(EXPERIMENTS_DIR, "IDQN_TicTacToe.jl"))
 include(joinpath(EXPERIMENTS_DIR, "DQN_CartPoleGPU.jl"))
 include(joinpath(EXPERIMENTS_DIR, "JuliaRL_SAC_Pendulum.jl"))
 
 # dynamic loading environments
-function __init__() end
+function __init__()
+    # PyCall environment experiments
+    @require PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" include(
+        joinpath(EXPERIMENTS_DIR, "pettingzoo_ex.jl")
+    )
+end
 
 end # module
diff --git a/src/ReinforcementLearningExperiments/test/runtests.jl b/src/ReinforcementLearningExperiments/test/runtests.jl
index 69471a995..a9f73bdc4 100644
--- a/src/ReinforcementLearningExperiments/test/runtests.jl
+++ b/src/ReinforcementLearningExperiments/test/runtests.jl
@@ -1,6 +1,12 @@
+using Pkg
+if Base.UUID("438e738f-606a-5dbb-bf0a-cddfbfd45ab0") in Pkg.dependencies().keys
+    using PyCall
+end
 using ReinforcementLearningExperiments
 using CUDA
 
+
+
 CUDA.allowscalar(false)
 
 run(E`JuliaRL_BasicDQN_CartPole`)
@@ -18,6 +24,17 @@ run(E`JuliaRL_SAC_Pendulum`)
 run(E`JuliaRL_MPODiscrete_CartPole`)
 run(E`JuliaRL_MPOContinuous_CartPole`)
 run(E`JuliaRL_MPOCovariance_CartPole`)
+run(E`JuliaRL_IDQN_TicTacToe`)
+
+# Test PyCall experiments.
+# NOTE: Do NOT use the E`...` macro here; it is expanded at parse time, so the experiment would be constructed even when the `if` condition below is false.
+if Base.UUID("438e738f-606a-5dbb-bf0a-cddfbfd45ab0") in Pkg.dependencies().keys
+    if PyCall.pyexists("pettingzoo.mpe")
+        x = RLCore.Experiment("JuliaRL_DQN_MPESimple")
+        run(x)
+    end
+end
+
 # run(E`JuliaRL_BC_CartPole`)
 # run(E`JuliaRL_VMPO_CartPole`)
 # run(E`JuliaRL_BasicDQN_MountainCar`)
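
Usage sketch (not part of the diff): a minimal example of how the experiments added above can be exercised once this change is applied, assuming PyCall and the Python package `pettingzoo` are installed; it only reuses calls that already appear in the diff (run(E`JuliaRL_IDQN_TicTacToe`), RLCore.Experiment("JuliaRL_DQN_MPESimple"), and the plot/savefig lines from DQN_mpe_simple.jl).

using ReinforcementLearningExperiments
using Plots

# The TicTacToe IDQN experiment is included unconditionally.
run(E`JuliaRL_IDQN_TicTacToe`)

# The PettingZoo MPE experiment is only loaded (via Requires) when PyCall is available,
# so construct it by name instead of with the E`...` macro, as done in runtests.jl above.
using PyCall
if PyCall.pyexists("pettingzoo.mpe")
    ex = RLCore.Experiment("JuliaRL_DQN_MPESimple")
    run(ex)
    plot(ex.hook.rewards)   # TotalRewardPerEpisode hook, as in DQN_mpe_simple.jl
    savefig("JuliaRL_DQN_MPESimple.png")
end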