added first Independent Q Learning experiment #922

Open
wants to merge 6 commits into base: main

Changes from 3 commits
6 changes: 5 additions & 1 deletion src/ReinforcementLearningExperiments/Project.toml
@@ -6,12 +6,14 @@ version = "0.3.1"
[deps]
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
ReinforcementLearningZoo = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Weave = "44d3d7a6-8a23-5bf8-98c5-b353f8df5ec9"

@@ -29,7 +31,9 @@ julia = "1.9"

[extras]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["CUDA", "Test"]
test = ["CUDA", "PyCall", "Test"]
@@ -7,7 +7,7 @@
# ---

using PyCall
using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo
using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo, ReinforcementLearningEnvironments
using Flux
using Flux: glorot_uniform

@@ -17,14 +17,14 @@ using Flux.Losses: huber_loss
function RLCore.Experiment(
::Val{:JuliaRL},
::Val{:DQN},
::Val{:MPESimple};
::Val{:MPESimple},
seed=123,
n=1,
γ=0.99f0,
is_enable_double_DQN=true
)
rng = StableRNG(seed)
env = discrete2standard_discrete(PettingzooEnv("mpe.simple_v2"; seed=seed))
env = discrete2standard_discrete(PettingZooEnv("mpe.simple_v2"; seed=seed))
ns, na = length(state(env)), length(action_space(env))

agent = Agent(
@@ -0,0 +1,158 @@
# ---
# title: JuliaRL\_IDQN\_TicTacToe
# cover:
# description: IDQN applied to the competitive TicTacToe environment
# date: 2023-07-03
# author: "[Panajiotis Keßler](mailto:[email protected])"
# ---

using StableRNGs
using ReinforcementLearningBase
using ReinforcementLearningEnvironments
using ReinforcementLearningZoo
using ReinforcementLearningCore
using Plots
using Flux
using Flux.Losses: huber_loss
using Flux: glorot_uniform



rng = StableRNG(1234)

cap = 100 # replay buffer capacity per player

# the TicTacToe state used here is a single integer, so wrap it in a one-element
# vector before passing it through the Dense(1, ...) network
RLCore.forward(L::DQNLearner, state::A) where {A <: Real} = RLCore.forward(L, [state])


episodes_per_step = 25

function RLCore.Experiment(
::Val{:JuliaRL},
::Val{:IDQN},
::Val{:TicTacToe},
seed=123,
n=1,
γ=0.99f0,
is_enable_double_DQN=true
)
rng = StableRNG(seed)
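    # each learning player gets its own freshly constructed DQN policy, which is the "independent" part of IDQN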
create_policy() = QBasedPolicy(
learner=DQNLearner(
approximator=Approximator(
model=TwinNetwork(
Chain(
Dense(1, 512, relu; init=glorot_uniform(rng)),
Dense(512, 256, relu; init=glorot_uniform(rng)),
Dense(256, 9; init=glorot_uniform(rng)),
);
sync_freq=100
),
optimiser=Adam(),
),
n=n,
γ=γ,
is_enable_double_DQN=is_enable_double_DQN,
loss_func=huber_loss,
rng=rng,
),
explorer=EpsilonGreedyExplorer(
kind=:exp,
ϵ_stable=0.01,
decay_steps=500,
rng=rng,
),
)

e = TicTacToeEnv();
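    # assign policies per player: players other than :Cross get a fresh DQN policy, :Cross uses RandomPolicy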
m = MultiAgentPolicy(NamedTuple((player =>
Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng),
Trajectory(
container=CircularArraySARTTraces(
capacity=cap,
state=Integer => (1,),
),
sampler=NStepBatchSampler{SS′ART}(
n=n,
γ=γ,
batch_size=1,
rng=rng
),
controller=InsertSampleRatioController(
threshold=1,
n_inserted=0
))
)
for player in players(e)))
);
hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e))))
Experiment(m, e, StopAfterEpisode(episodes_per_step), hooks)
end

create_policy() = QBasedPolicy(
learner=DQNLearner(
approximator=Approximator(
model=TwinNetwork(
Chain(
Dense(1, 512, relu; init=glorot_uniform(rng)),
Dense(512, 256, relu; init=glorot_uniform(rng)),
Dense(256, 9; init=glorot_uniform(rng)),
);
sync_freq=100
),
optimiser=Adam(),
),
n=32,
γ=0.99f0,
is_enable_double_DQN=true,
loss_func=huber_loss,
rng=rng,
),
explorer=EpsilonGreedyExplorer(
kind=:exp,
ϵ_stable=0.01,
decay_steps=500,
rng=rng,
),
)

e = TicTacToeEnv();
m = MultiAgentPolicy(NamedTuple((player =>
Agent(player != :Cross ? create_policy() : RandomPolicy(;rng=rng),
Trajectory(
container=CircularArraySARTTraces(
capacity=cap,
state=Integer => (1,),
),
sampler=NStepBatchSampler{SS′ART}(
n=1,
γ=0.99f0,
batch_size=1,
rng=rng
),
controller=InsertSampleRatioController(
threshold=1,
n_inserted=0
))
)
for player in players(e)))
);
hooks = MultiAgentHook(NamedTuple((p => TotalRewardPerEpisode() for p ∈ players(e))))

win_rates = (Cross=Float64[], Nought=Float64[])
for i ∈ 1:2
run(m, e, StopAfterEpisode(episodes_per_step; is_show_progress=false), hooks)
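    # TotalRewardPerEpisode keeps accumulating across runs, so this is the running average reward per episode (used as a win-rate proxy)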
wr_cross = sum(hooks[:Cross].rewards)/(i*episodes_per_step)
wr_nought = sum(hooks[:Nought].rewards)/(i*episodes_per_step)
push!(win_rates[:Cross], wr_cross)
push!(win_rates[:Nought], wr_nought)
end
p1 = plot([win_rates[:Cross] win_rates[:Nought]], labels=["Cross" "Nought"])
xlabel!("Iteration steps of $episodes_per_step episodes")
ylabel!("Win rate of the player")

p2 = plot([hooks[:Cross].rewards hooks[:Nought].rewards], labels=["Cross" "Nought"])
xlabel!("Overall episodes")
ylabel!("Rewards of the players")

p = plot(p1, p2, layout=(2,1), size=[1000,1000])
savefig("TTT_CROSS_DQN_NOUGHT_RANDOM.png")
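Once this file is included by ReinforcementLearningExperiments, the experiment can be run like the other JuliaRL experiments. A minimal sketch, using the package's E-string macro as in runtests.jl:

using ReinforcementLearningExperiments
run(E`JuliaRL_IDQN_TicTacToe`)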
@@ -1,6 +1,7 @@
module ReinforcementLearningExperiments

using Reexport
using Requires

@reexport using ReinforcementLearningCore, ReinforcementLearningBase, ReinforcementLearningZoo

@@ -19,8 +20,14 @@ include(joinpath(EXPERIMENTS_DIR, "JuliaRL_Rainbow_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_VPG_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_TRPO_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "JuliaRL_MPO_CartPole.jl"))
include(joinpath(EXPERIMENTS_DIR, "IDQN_TicTacToe.jl"))


# dynamic loading environments
function __init__() end
function __init__()
@require PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" include(
joinpath(EXPERIMENTS_DIR, "DQN_mpe_simple.jl")
)
end

end # module
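Because the PyCall-dependent experiment sits behind the @require block, it is only included once PyCall is loaded in the same session. A rough usage sketch:

using ReinforcementLearningExperiments
using PyCall   # triggers the @require hook and includes DQN_mpe_simple.jl
run(E`JuliaRL_DQN_MPESimple`)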
10 changes: 10 additions & 0 deletions src/ReinforcementLearningExperiments/test/runtests.jl
@@ -1,6 +1,14 @@
using ReinforcementLearningExperiments
using CUDA

using Requires

CUDA.allowscalar(false)

run(E`JuliaRL_NFQ_CartPole`)
@@ -15,6 +23,8 @@ run(E`JuliaRL_VPG_CartPole`)
run(E`JuliaRL_MPODiscrete_CartPole`)
run(E`JuliaRL_MPOContinuous_CartPole`)
run(E`JuliaRL_MPOCovariance_CartPole`)
run(E`JuliaRL_IDQN_TicTacToe`)
@require PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" run(E`JuliaRL_DQN_MPESimple`)
# run(E`JuliaRL_BC_CartPole`)
# run(E`JuliaRL_VMPO_CartPole`)
# run(E`JuliaRL_BasicDQN_MountainCar`)
Expand Down