added first Independent Q Learning experiment #922

Open · wants to merge 6 commits into base: main
7 changes: 6 additions & 1 deletion src/ReinforcementLearningExperiments/Project.toml
@@ -8,12 +8,15 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
ReinforcementLearningZoo = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Weave = "44d3d7a6-8a23-5bf8-98c5-b353f8df5ec9"
cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
@@ -35,7 +38,9 @@ julia = "1.9"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["CUDA", "Test"]
test = ["CUDA", "PyCall", "Test"]
@@ -0,0 +1,15 @@
# ---
# title: JuliaRL\_DQN\_MPESimple
# cover:
# description: DQN applied to MPE simple
# date: 2023-02-01
# author: "[Panajiotis Keßler](mailto:[email protected])"
# ---


using Plots
# Run the registered experiment and plot the per-episode rewards.
ex = E`JuliaRL_DQN_MPESimple`
run(ex)
plot(ex.hook.rewards)
savefig("JuliaRL_DQN_MPESimple.png")

@@ -0,0 +1,158 @@
# ---
# title: JuliaRL\_IDQN\_TicTacToe
# cover:
# description: IDQN applied to TicTacToe competitive
# date: 2023-07-03
# author: "[Panajiotis Keßler](mailto:[email protected])"
# ---

using StableRNGs
using ReinforcementLearningBase
using ReinforcementLearningZoo
using ReinforcementLearningCore
using Plots
using Flux
using Flux.Losses: huber_loss
using Flux: glorot_uniform



rng = StableRNG(1234)

cap = 100   # capacity of each agent's replay buffer

# Let the learner accept a scalar (integer-encoded) state by wrapping it in a vector.
RLCore.forward(L::DQNLearner, state::A) where {A <: Real} = RLCore.forward(L, [state])

episodes_per_step = 25   # episodes per training block

function RLCore.Experiment(
::Val{:JuliaRL},
::Val{:IDQN},
::Val{:TicTacToe},
seed=123,
n=1,
γ=0.99f0,
is_enable_double_DQN=true
)
rng = StableRNG(seed)
# Each learning player trains its own DQN policy (Independent Q-Learning).
create_policy() = QBasedPolicy(
learner=DQNLearner(
approximator=Approximator(
model=TwinNetwork(
Chain(
Dense(1, 512, relu; init=glorot_uniform(rng)),
Dense(512, 256, relu; init=glorot_uniform(rng)),
Dense(256, 9; init=glorot_uniform(rng)),
);
sync_freq=100
),
optimiser=Adam(),
),
n=n,
γ=γ,
is_enable_double_DQN=is_enable_double_DQN,
loss_func=huber_loss,
rng=rng,
),
explorer=EpsilonGreedyExplorer(
kind=:exp,
ϵ_stable=0.01,
decay_steps=500,
rng=rng,
),
)

e = TicTacToeEnv();
# The :Cross player follows a RandomPolicy; every other player gets its own DQN policy.
m = MultiAgentPolicy(NamedTuple((player =>
Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng),
Trajectory(
container=CircularArraySARTTraces(
capacity=cap,
state=Integer => (1,),
),
sampler=NStepBatchSampler{SS′ART}(
n=n,
γ=γ,
batch_size=1,
rng=rng
),
controller=InsertSampleRatioController(
threshold=1,
n_inserted=0
))
)
for player in players(e)))
);
hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e))))
Experiment(m, e, StopAfterEpisode(episodes_per_step), hooks)
end

# Standalone script section: rebuild the same setup outside the Experiment constructor.
create_policy() = QBasedPolicy(
learner=DQNLearner(
approximator=Approximator(
model=TwinNetwork(
Chain(
Dense(1, 512, relu; init=glorot_uniform(rng)),
Dense(512, 256, relu; init=glorot_uniform(rng)),
Dense(256, 9; init=glorot_uniform(rng)),
);
sync_freq=100
),
optimiser=Adam(),
),
n=32,
γ=0.99f0,
is_enable_double_DQN=true,
loss_func=huber_loss,
rng=rng,
),
explorer=EpsilonGreedyExplorer(
kind=:exp,
ϵ_stable=0.01,
decay_steps=500,
rng=rng,
),
)

e = TicTacToeEnv();
m = MultiAgentPolicy(NamedTuple((player =>
Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng),
Trajectory(
container=CircularArraySARTTraces(
capacity=cap,
state=Integer => (1,),
),
sampler=NStepBatchSampler{SS′ART}(
n=1,
γ=0.99f0,
batch_size=1,
rng=rng
),
controller=InsertSampleRatioController(
threshold=1,
n_inserted=0
))
)
for player in players(e)))
);
hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e))))

win_rates = (Cross=Float64[], Nought=Float64[])
for i ∈ 1:2   # two blocks of `episodes_per_step` episodes each
run(m, e, StopAfterEpisode(episodes_per_step; is_show_progress=false), hooks)
# Cumulative average reward per episode so far, used here as a win-rate proxy.
wr_cross = sum(hooks[:Cross].rewards)/(i*episodes_per_step)
wr_nought = sum(hooks[:Nought].rewards)/(i*episodes_per_step)
push!(win_rates[:Cross], wr_cross)
push!(win_rates[:Nought], wr_nought)
end
p1 = plot([win_rates[:Cross] win_rates[:Nought]], labels=["Cross" "Nought"])
xlabel!("Iteration steps of $episodes_per_step episodes")
ylabel!("Win rate of the player")

p2 = plot([hooks[:Cross].rewards hooks[:Nought].rewards], labels=["Cross" "Nought"])
xlabel!("Overall episodes")
ylabel!("Rewards of the players")

p = plot(p1, p2, layout=(2,1), size=[1000,1000])
savefig("TTT_CROSS_DQN_NOUGHT_RANDOM.png")
@@ -1,13 +1,5 @@
# ---
# title: JuliaRL\_DQN\_MPESimple
# cover:
# description: DQN applied to MPE simple
# date: 2023-02-01
# author: "[Panajiotis Keßler](mailto:[email protected])"
# ---

using PyCall
using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo
using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningEnvironments
using Flux
using Flux: glorot_uniform

@@ -17,14 +9,14 @@ using Flux.Losses: huber_loss
function RLCore.Experiment(
::Val{:JuliaRL},
::Val{:DQN},
::Val{:MPESimple};
::Val{:MPESimple},
seed=123,
n=1,
γ=0.99f0,
is_enable_double_DQN=true
)
rng = StableRNG(seed)
env = discrete2standard_discrete(PettingzooEnv("mpe.simple_v2"; seed=seed))
env = discrete2standard_discrete(PettingZooEnv("mpe.simple_v2"; seed=seed))
ns, na = length(state(env)), length(action_space(env))

agent = Agent(
@@ -74,11 +66,4 @@ function RLCore.Experiment(
stop_condition = StopAfterEpisode(150, is_show_progress=!haskey(ENV, "CI"))
hook = TotalRewardPerEpisode()
Experiment(agent, env, stop_condition, hook)
end

using Plots
ex = E`JuliaRL_DQN_MPESimple`
run(ex)
plot(ex.hook.rewards)
savefig("JuliaRL_DQN_MPESimple.png")

end
@@ -1,11 +1,13 @@
module ReinforcementLearningExperiments

using Reexport
using Pkg
using Requires

@reexport using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo
@reexport using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningEnvironments

const EXPERIMENTS_DIR = joinpath(@__DIR__, "experiments")

# As long as some experiments are not working, this automatic include does not work properly:
# for f in readdir(EXPERIMENTS_DIR)
# include(joinpath(EXPERIMENTS_DIR, f))
# end
@@ -21,10 +23,16 @@ include(joinpath(EXPERIMENTS_DIR, "JuliaRL_Rainbow_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_VPG_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_TRPO_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_MPO_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "IDQN_TicTacToe.jl"))
include(joinpath(EXPERIMENTS_DIR, "DQN_CartPoleGPU.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_SAC_Pendulum.jl"))

# Dynamically load experiments that depend on optional environment packages
function __init__() end
function __init__()
# Experiments that rely on PyCall-based environments
@require PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" include(
joinpath(EXPERIMENTS_DIR, "pettingzoo_ex.jl")
)
end

end # module
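
For context on the `@require` block above: the PettingZoo experiment file is only included when PyCall is loaded in the same Julia session, so the Python dependency stays optional. A rough sketch of how a user would trigger it (the loading order and an installed `pettingzoo` Python package are assumptions):

using PyCall                             # PyCall must be loaded for the @require hook to fire
using ReinforcementLearningExperiments   # __init__ then includes pettingzoo_ex.jl
run(RLCore.Experiment("JuliaRL_DQN_MPESimple"))   # needs the pettingzoo Python package installed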
17 changes: 17 additions & 0 deletions src/ReinforcementLearningExperiments/test/runtests.jl
@@ -1,6 +1,12 @@
using Pkg
# Load PyCall only if it is present in the test environment (checked via its UUID).
if Base.UUID("438e738f-606a-5dbb-bf0a-cddfbfd45ab0") in Pkg.dependencies().keys
using PyCall
end
using ReinforcementLearningExperiments
using CUDA



CUDA.allowscalar(false)

run(E`JuliaRL_BasicDQN_CartPole`)
@@ -18,6 +24,17 @@ run(E`JuliaRL_SAC_Pendulum`)
run(E`JuliaRL_MPODiscrete_CartPole`)
run(E`JuliaRL_MPOContinuous_CartPole`)
run(E`JuliaRL_MPOCovariance_CartPole`)
run(E`JuliaRL_IDQN_TicTacToe`)

# Test the PyCall-based experiments.
# NOTE: Do NOT use the E`...` macro here; it is evaluated even when the surrounding `if` is false.
if Base.UUID("438e738f-606a-5dbb-bf0a-cddfbfd45ab0") in Pkg.dependencies().keys
if PyCall.pyexists("pettingzoo.mpe")
x = RLCore.Experiment("JuliaRL_DQN_MPESimple")
run(x)
end
end

# run(E`JuliaRL_BC_CartPole`)
# run(E`JuliaRL_VMPO_CartPole`)
# run(E`JuliaRL_BasicDQN_MountainCar`)