From aa5575a38e5d3e2fe8722ed8d932d2d3a9d48c25 Mon Sep 17 00:00:00 2001 From: Zachary Sunberg Date: Sat, 16 Jan 2021 14:49:43 -0700 Subject: [PATCH 1/5] added multiplayer functions --- src/CommonRLInterface.jl | 33 ++++++++++++----- src/multiplayer.jl | 77 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 9 deletions(-) create mode 100644 src/multiplayer.jl diff --git a/src/CommonRLInterface.jl b/src/CommonRLInterface.jl index 2502641..4ab0a99 100644 --- a/src/CommonRLInterface.jl +++ b/src/CommonRLInterface.jl @@ -33,9 +33,9 @@ This function is a *static property* of the environment; the value it returns sh --- - actions(env::AbstractEnv, i::Integer) + actions(env::AbstractEnv, player_index) -Return a collection of all the actions available to player i. +Return a collection of all the actions available to a given player. This function is a *static property* of the environment; the value it returns should not change based on the state. """ @@ -53,18 +53,33 @@ function observe end """ r = act!(env::AbstractEnv, a) -Take action `a` for the current player, advance AbstractEnv `env` forward one step, and return rewards for all players. +Take action `a` and advance AbstractEnv `env` forward one step, and return rewards for all players. This is a *required function* that must be provided by every AbstractEnv. -""" -function act! end -""" - player(env::AbstractEnv) +If the environment has a single player, it is acceptable to return a scalar number. If there are multiple players, it should return a container with indexed with the items in the collection returned by `player_indices`. + +# Example + +## Single Player +```julia +function act!(env::MyMDPEnv, a) + env.state += a + randn() + return env.state^2 +end +``` + +## Two Player -Return the index of the player who should play next in the environment. 
+```julia +function act!(env::MyMDPEnv, a) + env.positions[player(env)] += a # In this game, each player has a position that is updated by his or her action + rewards = in_goal.(env.positions) # Rewards are +1 for being in a goal region, 0 otherwise + return rewards # returns a vector of rewards for each player +end +``` """ -function player end +function act! end """ terminated(env::AbstractEnv) diff --git a/src/multiplayer.jl b/src/multiplayer.jl new file mode 100644 index 0000000..bce1bd9 --- /dev/null +++ b/src/multiplayer.jl @@ -0,0 +1,77 @@ +""" + player_indices(env::AbstractEnv) + +Return an iterable collection of indices for all players. + +Typically, the indices will be integers, but the only requirement is that they be valid indices for the collection returned by `act!` + +This function is a *static property* of the environment; the value it returns should not change based on the state. + +# Example + +```julia +@provide player_indices(::MyEnv) = 1:2 +``` +""" +function player_indices end + +""" + player(env::AbstractEnv) + +Return the index of the player who should play next in the environment. + +The index should be one of the items in the collection returned by `player_indices`. +""" +function player end + +""" + all_act!(env::AbstractEnv, actions) + +Take `actions` for all players and advance AbstractEnv `env` forward, and return rewards for all players. + +Environments that support simultaneous actions by all players should implement this in addition to or instead of `act!`. + +The `actions` container should be indexed by the indices returned by `player_indices(env)`. +""" +function all_act! end + +""" + all_observe(env::AbstractEnv) + +Return observations from the environment for all players. + +Environments that support simultaneous actions by all players should implement this in addition to or instead of `observe`. 
+""" +function all_observe end + +""" + UtilityStyle(env) + +Trait that allows an environment to declare certain properties about the relative utility for the players. + +Possible returns are: +- `ZeroSum()` +- `ConstantSum()` +- `GeneralSum()` +- `IdenticalUtility()` + +See the docstrings for each for more details. +""" +abstract type UtilityStyle end + +""" +If `UtilityStyle(env) == ZeroSum()` then the sum of the rewards returned by `act!` is always zero. +""" +struct ZeroSum <: UtilityStyle end +""" +If `UtilityStyle(env) == ConstantSum()` then the sum of the rewards returned by `act!` will always be the same constant. +""" +struct ConstantSum <: UtilityStyle end +""" +If `UtilityStyle(env) == GeneralSum()`, the sum of rewards over a trajectory can take any form. +""" +struct GeneralSum <: UtilityStyle end +""" +If `UtilityStyle(env) == IdenticalUtility()`, all entries of the reward returned by `act!` will be identical for all players. +""" +struct IdenticalUtility <: UtilityStyle end From 94a6cabfaa5b4689b9693d0769bf11dfff20dd22 Mon Sep 17 00:00:00 2001 From: Zachary Sunberg Date: Sun, 17 Jan 2021 00:13:29 -0700 Subject: [PATCH 2/5] indices must be integers --- src/multiplayer.jl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/multiplayer.jl b/src/multiplayer.jl index bce1bd9..e6b27db 100644 --- a/src/multiplayer.jl +++ b/src/multiplayer.jl @@ -1,9 +1,7 @@ """ player_indices(env::AbstractEnv) -Return an iterable collection of indices for all players. - -Typically, the indices will be integers, but the only requirement is that they be valid indices for the collection returned by `act!` +Return an iterable collection of integer indices for all players. This function is a *static property* of the environment; the value it returns should not change based on the state. @@ -19,19 +17,15 @@ function player_indices end player(env::AbstractEnv) Return the index of the player who should play next in the environment. 
- -The index should be one of the items in the collection returned by `player_indices`. """ function player end """ - all_act!(env::AbstractEnv, actions) + all_act!(env::AbstractEnv, actions::AbstractVector) Take `actions` for all players and advance AbstractEnv `env` forward, and return rewards for all players. Environments that support simultaneous actions by all players should implement this in addition to or instead of `act!`. - -The `actions` container should be indexed by the indices returned by `player_indices(env)`. """ function all_act! end From bc73edcdc8c0642ee43b2ed97fd7482d8ad3895b Mon Sep 17 00:00:00 2001 From: Zachary Sunberg Date: Sun, 17 Jan 2021 01:16:41 -0700 Subject: [PATCH 3/5] started tic tac toe --- examples/tictactoe.jl | 56 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 examples/tictactoe.jl diff --git a/examples/tictactoe.jl b/examples/tictactoe.jl new file mode 100644 index 0000000..eeb2a59 --- /dev/null +++ b/examples/tictactoe.jl @@ -0,0 +1,56 @@ +#XXX not finished yet +using CommonRLInterface + +const RL = CommonRLInterface + +mutable struct TicTacToe <: AbstractEnv + board::Matrix{Int} # 0 = untaken, 1 = x, 2 = o +end + +TicTacToe() = TicTacToe(1, zeros(Int, 3, 3)) + +iswinner(b, p) = any(all(b[i,:].==p) for i in 1:3) || + any(all(b[:,i]) for i in 1:3) || + all(b[i,i]==p for i in 1:3) || + all(b[i,4-i] == p for i in 1:3) + +other(p) = mod1(p+1,2) + +RL.reset!(env::TicTacToe) = fill!(env.board, 0) +RL.actions(env::TicTacToe, player=0) = vec([(i, j) for i in 1:3, j in 1:3]) +RL.observe(env::TicTacToe) = env.board +RL.terminated(env::TicTacToe) = any(iswinner(env.board, p) for p in 1:2) || all(env.board .!= 0) + +function RL.act!(env::TicTacToe, a) + p = player(env) + r = [0, 0] + if env.board[a] == 0 + + else + # if you take an illegal action, you lose + rewards[p] = -1 + rewards[other(p)] = 1 + end + return rewards +end + +@provide RL.player_indices(env::TicTacToe) = 1:2 +@provide 
function RL.player(env::TicTacToe) + if sum(env.board == 3) + + else + + end +end + +RL.render(env::TicTacToe) = env + +function Base.show(::IO, ::MIME"text/plain", env::TicTacToe) + chars = [' ', 'x', 'o'] + for i in 1:3 + for j in 1:3 + print(io, '|'*chars[env.board[i,j]]) + end + println(io, '|') + end +end From f2be3800766ec5edc4f9a599dafc11dedc2f18c0 Mon Sep 17 00:00:00 2001 From: Zachary Sunberg Date: Sat, 23 Jan 2021 17:15:32 -0700 Subject: [PATCH 4/5] finished multiplayer docs --- docs/make.jl | 1 + docs/src/index.md | 4 ++-- docs/src/multiplayer.md | 28 ++++++++++++++++++++++++++++ docs/src/optional.md | 4 ++-- examples/tictactoe.jl | 10 +++++----- src/CommonRLInterface.jl | 15 +++++++++++++-- src/multiplayer.jl | 8 ++++---- 7 files changed, 55 insertions(+), 15 deletions(-) create mode 100644 docs/src/multiplayer.md diff --git a/docs/make.jl b/docs/make.jl index 97dddb9..1a33f03 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,6 +14,7 @@ makedocs(; pages=[ "Home" => "index.md", "required.md", + "multiplayer.md", "optional.md", "wrappers.md", "faqs.md" diff --git a/docs/src/index.md b/docs/src/index.md index dc80e3c..cddbb7d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,10 +6,10 @@ CurrentModule = CommonRLInterface A description of the purpose of the CommonRLInterface package can be found in [the README on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl). -An example environment can be found in the [examples directory on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl/tree/master/examples). +Example environment can be found in the [examples directory on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl/tree/master/examples). 
Detailed reference documentation can be found using the links below: ```@contents -Pages = ["required.md", "optional.md", "wrappers.md", "faqs.md"] +Pages = ["required.md", "multiplayer.md", "optional.md", "wrappers.md", "faqs.md"] ``` diff --git a/docs/src/multiplayer.md b/docs/src/multiplayer.md new file mode 100644 index 0000000..341bd87 --- /dev/null +++ b/docs/src/multiplayer.md @@ -0,0 +1,28 @@ +# Multiplayer Interface + +CommonRLInterface provides a basic interface for multiplayer games. + +## Sequential games + +Sequential games should implement the optional function [`players`](@ref) to return a range of player ids, and [`player`](@ref) to indicate which player's turn it is. There is no requirement that players play in the order returned by the `players` function. Only the action for the current player should be supplied to [`act!`](@ref), but rewards for all players should be returned. [`observe`](@ref) returns the observation for only the current player. + +## Simultaneous Games/Multi-agent (PO)MDPs + +Environments in which all players take actions at once should implement the [`all_act!`](@ref) and [`all_observe`](@ref) optional functions which take a collection of actions for all players and return observations for each player, respectively. + +## Indicating reward properties + +The [`UtilityStyle`](@ref) trait can be used to indicate that the rewards will satisfy certain properties, for example that rewards for all players are identical or that the game is zero-sum. + + +```@docs +players +player +all_act! 
+all_observe +UtilityStyle +ZeroSum +ConstantSum +GeneralSum +IdenticalUtility +``` diff --git a/docs/src/optional.md b/docs/src/optional.md index 84bea96..f2e63b3 100644 --- a/docs/src/optional.md +++ b/docs/src/optional.md @@ -27,7 +27,8 @@ The optional interface currently contains the following functions: - [`valid_actions`](@ref) - [`valid_action_mask`](@ref) - [`observations`](@ref) -- [`player`](@ref) + +Additional optional functions for multiplayer environments are contained in the [Multiplayer Interface](@ref). To propose adding a new function to the interface, please file an issue with the "candidate interface function" label. @@ -43,5 +44,4 @@ setstate! valid_actions valid_action_mask observations -player ``` diff --git a/examples/tictactoe.jl b/examples/tictactoe.jl index eeb2a59..53b4b9a 100644 --- a/examples/tictactoe.jl +++ b/examples/tictactoe.jl @@ -34,16 +34,16 @@ function RL.act!(env::TicTacToe, a) return rewards end -@provide RL.player_indices(env::TicTacToe) = 1:2 +@provide RL.players(env::TicTacToe) = 1:2 @provide function RL.player(env::TicTacToe) - if sum(env.board == 3) - + if sum(env.board%3==0) + return 1 else - + return 2 end end -RL.render(env::TicTacToe) = env +@provide RL.render(env::TicTacToe) = env function Base.show(::IO, ::MIME"text/plain", env::TicTacToe) chars = [' ', 'x', 'o'] diff --git a/src/CommonRLInterface.jl b/src/CommonRLInterface.jl index 4ab0a99..ec19249 100644 --- a/src/CommonRLInterface.jl +++ b/src/CommonRLInterface.jl @@ -8,7 +8,6 @@ export actions, observe, act!, - player, terminated abstract type AbstractEnv end @@ -57,7 +56,7 @@ Take action `a` and advance AbstractEnv `env` forward one step, and return rewar This is a *required function* that must be provided by every AbstractEnv. -If the environment has a single player, it is acceptable to return a scalar number. If there are multiple players, it should return a container with indexed with the items in the collection returned by `player_indices`. 
+If the environment has a single player, it is acceptable to return a scalar number. If there are multiple players, it should return a container indexed with the items in the collection returned by `players`. # Example @@ -186,6 +185,18 @@ export valid_action_mask include("spaces.jl") +export + players, + player, + all_act!, + all_observe, + UtilityStyle, + ZeroSum, + ConstantSum, + GeneralSum, + IdenticalUtility +include("multiplayer.jl") + export Wrappers include("wrappers.jl") diff --git a/src/multiplayer.jl b/src/multiplayer.jl index e6b27db..004bb89 100644 --- a/src/multiplayer.jl +++ b/src/multiplayer.jl @@ -1,17 +1,17 @@ """ - player_indices(env::AbstractEnv) + players(env::AbstractEnv) -Return an iterable collection of integer indices for all players. +Return an ordered iterable collection of integer indices for all players, starting with one. This function is a *static property* of the environment; the value it returns should not change based on the state. # Example ```julia -@provide player_indices(::MyEnv) = 1:2 +@provide players(::MyEnv) = 1:2 ``` """ -function player_indices end +function players end """ player(env::AbstractEnv) From 25a4650691796aca454578d9dbab776e9a1e2026 Mon Sep 17 00:00:00 2001 From: Zachary Sunberg Date: Sat, 23 Jan 2021 18:24:31 -0700 Subject: [PATCH 5/5] finished examples and such --- README.md | 6 +++- docs/src/index.md | 2 +- examples/rock_paper_scissors.jl | 45 ++++++++++++++++++++++++++++ examples/tictactoe.jl | 39 ++++++++++++++++-------- src/CommonRLInterface.jl | 6 ++-- test/examples/gridworld.jl | 1 - test/examples/rock_paper_scissors.jl | 11 +++++++ test/examples/tictactoe.jl | 12 ++++++++ test/runtests.jl | 6 ++-- 9 files changed, 106 insertions(+), 22 deletions(-) create mode 100644 examples/rock_paper_scissors.jl create mode 100644 test/examples/rock_paper_scissors.jl create mode 100644 test/examples/tictactoe.jl diff --git a/README.md b/README.md index d8d40e7..7bf4778 100644 --- a/README.md +++ b/README.md @@ -16,7 
+16,7 @@ By design, this package is only concerned with environments and *not* with polic ## Documentation -Detailed documentation can be found here: [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaReinforcementLearning.github.io/CommonRLInterface.jl/stable). A brief overview is given below: +A few simple examples can be found in the examples directory. Detailed documentation can be found here: [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaReinforcementLearning.github.io/CommonRLInterface.jl/stable). A brief overview is given below: ### Required Interface @@ -35,6 +35,10 @@ terminated(env) # returns true or false indicating whether the environment has f Additional behavior for an environment can be specified with the optional interface outlined in the documentation. The `provided` function can be used to check whether optional behavior is provided by the environment. +### Multiplayer Environments + +Optional functions allow implementation of both sequential and simultaneous games and multi-agent (PO)MDPs. + ### Wrappers A wrapper system described in the documentation allows for easy modification of environments. diff --git a/docs/src/index.md b/docs/src/index.md index cddbb7d..f2d714c 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -6,7 +6,7 @@ CurrentModule = CommonRLInterface A description of the purpose of the CommonRLInterface package can be found in [the README on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl). -Example environment can be found in the [examples directory on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl/tree/master/examples). +Example environments can be found in the [examples directory on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl/tree/master/examples). 
Detailed reference documentation can be found using the links below: diff --git a/examples/rock_paper_scissors.jl b/examples/rock_paper_scissors.jl new file mode 100644 index 0000000..4da0739 --- /dev/null +++ b/examples/rock_paper_scissors.jl @@ -0,0 +1,45 @@ +using CommonRLInterface + +const RL = CommonRLInterface + +mutable struct RockPaperScissors <: AbstractEnv + status::Symbol # either :start, the play of the first player, or :done +end + +RockPaperScissors() = RockPaperScissors(:start) + +beats(a, b) = (a==:rock && b==:scissors) || (a==:scissors && b==:paper) || (a==:paper && b==:rock) + +# Really all_act!, actions, terminated, players, and reset! are all that's needed to describe the game + +@provide function RL.all_act!(env::RockPaperScissors, as) + env.status = :done + if beats(as[1], as[2]) + return (1, -1) + elseif beats(as[2], as[1]) + return (-1, 1) + else + return (0, 0) + end +end + +RL.actions(env::RockPaperScissors, player=0) = (:rock, :paper, :scissors) +RL.terminated(env::RockPaperScissors) = env.status == :done +RL.reset!(env::RockPaperScissors) = env.status = :start +@provide RL.players(env::RockPaperScissors) = 1:2 + +# We may also wish to implement the rest of the required interface + +RL.observe(env::RockPaperScissors) = [0] + +function RL.act!(env::RockPaperScissors, a) + if env.status == :start + env.status = a + return (0, 0) + else + return all_act!(env, (env.status, a)) + end +end + +@provide RL.player(env::RockPaperScissors) = env.status == :start ? 
1 : 2 +@provide RL.UtilityStyle(env::RockPaperScissors) = ZeroSum() diff --git a/examples/tictactoe.jl b/examples/tictactoe.jl index 53b4b9a..01624a2 100644 --- a/examples/tictactoe.jl +++ b/examples/tictactoe.jl @@ -7,36 +7,47 @@ mutable struct TicTacToe <: AbstractEnv board::Matrix{Int} # 0 = untaken, 1 = x, 2 = o end -TicTacToe() = TicTacToe(1, zeros(Int, 3, 3)) +TicTacToe() = TicTacToe(zeros(Int, 3, 3)) iswinner(b, p) = any(all(b[i,:].==p) for i in 1:3) || - any(all(b[:,i]) for i in 1:3) || + any(all(b[:,i].==p) for i in 1:3) || all(b[i,i]==p for i in 1:3) || - all(b[i,4-i] == p for i in 1:3) + all(b[i,4-i]==p for i in 1:3) other(p) = mod1(p+1,2) RL.reset!(env::TicTacToe) = fill!(env.board, 0) RL.actions(env::TicTacToe, player=0) = vec([(i, j) for i in 1:3, j in 1:3]) -RL.observe(env::TicTacToe) = env.board +# symmetrical observations for both players: +1 for your square, -1 for other square +RL.observe(env::TicTacToe) = zeros(Int, 3, 3) + (env.board.==player(env)) - (env.board.==other(player(env))) RL.terminated(env::TicTacToe) = any(iswinner(env.board, p) for p in 1:2) || all(env.board .!= 0) function RL.act!(env::TicTacToe, a) p = player(env) - r = [0, 0] - if env.board[a] == 0 - + winner = 0 + if env.board[a...] == 0 + env.board[a...] 
= p + for pp in players(env) + if iswinner(env.board, pp) + winner = pp + end + end else # if you take an illegal action, you lose - rewards[p] = -1 - rewards[other(p)] = 1 + winner = other(p) + end + + if winner == 1 + return (1, -1) + elseif winner == 2 + return (-1, 1) end - return rewards + return (0, 0) end @provide RL.players(env::TicTacToe) = 1:2 @provide function RL.player(env::TicTacToe) - if sum(env.board%3==0) + if sum(env.board)%3 == 0 return 1 else return 2 @@ -45,12 +56,14 @@ end @provide RL.render(env::TicTacToe) = env -function Base.show(::IO, ::MIME"text/plain", env::TicTacToe) +function Base.show(io::IO, ::MIME"text/plain", env::TicTacToe) chars = [' ', 'x', 'o'] for i in 1:3 for j in 1:3 - print(io, '|'*chars[env.board[i,j]]) + print(io, '|'*chars[env.board[i,j]+1]) end println(io, '|') end end + +@provide RL.UtilityStyle(::TicTacToe) = ZeroSum() diff --git a/src/CommonRLInterface.jl b/src/CommonRLInterface.jl index ec19249..5667aaf 100644 --- a/src/CommonRLInterface.jl +++ b/src/CommonRLInterface.jl @@ -56,7 +56,7 @@ Take action `a` and advance AbstractEnv `env` forward one step, and return rewar This is a *required function* that must be provided by every AbstractEnv. -If the environment has a single player, it is acceptable to return a scalar number. If there are multiple players, it should return a container indexed with the items in the collection returned by `players`. +If the environment has a single player, it is acceptable to return a scalar number. If there are multiple players, it should return a container with all rewards indexed by player number. 
# Example @@ -72,9 +72,9 @@ end ```julia function act!(env::MyMDPEnv, a) - env.positions[player(env)] += a # In this game, each player has a position that is updated by his or her action + env.positions[player(env)] += a # In this game, each player has a position that is updated by his or her action rewards = in_goal.(env.positions) # Rewards are +1 for being in a goal region, 0 otherwise - return rewards # returns a vector of rewards for each player + return rewards # returns a vector of rewards for each player end ``` """ diff --git a/test/examples/gridworld.jl b/test/examples/gridworld.jl index 76be6f0..6a4efbf 100644 --- a/test/examples/gridworld.jl +++ b/test/examples/gridworld.jl @@ -3,7 +3,6 @@ module GW end @testset "gridworld" begin - env = GW.GridWorld() reset!(env) while !terminated(env) diff --git a/test/examples/rock_paper_scissors.jl b/test/examples/rock_paper_scissors.jl new file mode 100644 index 0000000..3f1b69d --- /dev/null +++ b/test/examples/rock_paper_scissors.jl @@ -0,0 +1,11 @@ +module RPS + include("../../examples/rock_paper_scissors.jl") +end + +@testset "rock paper scissors" begin + g = RPS.RockPaperScissors() + reset!(g) + while !terminated(g) + @test sum(act!(g, rand(actions(g)))) == 0 + end +end diff --git a/test/examples/tictactoe.jl b/test/examples/tictactoe.jl new file mode 100644 index 0000000..98e41bb --- /dev/null +++ b/test/examples/tictactoe.jl @@ -0,0 +1,12 @@ +module TTT + include("../../examples/tictactoe.jl") +end + +@testset "tictactoe" begin + g = TTT.TicTacToe() + reset!(g) + while !terminated(g) + @test sum(act!(g, rand(actions(g)))) == 0 + render(g) + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 3aa81f5..d11d421 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -162,8 +162,8 @@ end @test observations(env) == 1:10 end -if VERSION >= v"1.4" # not sure if this is the actual minimal version, but I know it will work - include("examples/gridworld.jl") -end +include("examples/gridworld.jl") 
+include("examples/tictactoe.jl") +include("examples/rock_paper_scissors.jl") include("wrappers.jl")