JuliaReinforcementLearning · zsunberg · Jan 24, 2021 · Jan 16, 2021 · Jan 17, 2021 · Jan 17, 2021
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ By design, this package is only concerned with environments and *not* with polic
 
 ## Documentation
 
-Detailed documentation can be found here: [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaReinforcementLearning.github.io/CommonRLInterface.jl/stable). A brief overview is given below:
+A few simple examples can be found in the examples directory. Detailed documentation can be found here: [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://JuliaReinforcementLearning.github.io/CommonRLInterface.jl/stable). A brief overview is given below:
 
 ### Required Interface
 
@@ -35,6 +35,10 @@ terminated(env) # returns true or false indicating whether the environment has f
 
 Additional behavior for an environment can be specified with the optional interface outlined in the documentation. The `provided` function can be used to check whether optional behavior is provided by the environment.
 
+### Multiplayer Environments
+
+Optional functions allow the implementation of both sequential and simultaneous games, as well as multi-agent (PO)MDPs.
+
 ### Wrappers
 
 A wrapper system described in the documentation allows for easy modification of environments.

diff --git a/docs/make.jl b/docs/make.jl
@@ -14,6 +14,7 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "required.md",
+        "multiplayer.md",
         "optional.md",
         "wrappers.md",
         "faqs.md"

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -6,10 +6,10 @@ CurrentModule = CommonRLInterface
 
 A description of the purpose of the CommonRLInterface package can be found in [the README on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl).
 
-An example environment can be found in the [examples directory on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl/tree/master/examples).
+Example environments can be found in the [examples directory on GitHub](https://github.com/JuliaReinforcementLearning/CommonRLInterface.jl/tree/master/examples).
 
 Detailed reference documentation can be found using the links below:
 
 ```@contents
-Pages = ["required.md", "optional.md", "wrappers.md", "faqs.md"]
+Pages = ["required.md", "multiplayer.md", "optional.md", "wrappers.md", "faqs.md"]
 ```
diff --git a/docs/src/multiplayer.md b/docs/src/multiplayer.md
@@ -0,0 +1,28 @@
+# Multiplayer Interface
+
+CommonRLInterface provides a basic interface for multiplayer games.
+
+## Sequential games
+
+Sequential games should implement the optional function [`players`](@ref) to return a range of player ids, and [`player`](@ref) to indicate which player's turn it is. There is no requirement that players play in the order returned by the `players` function. Only the action for the current player should be supplied to [`act!`](@ref), but rewards for all players should be returned. [`observe`](@ref) returns the observation for only the current player.
+
+## Simultaneous Games/Multi-agent (PO)MDPs
+
+Environments in which all players take actions at once should implement the optional functions [`all_act!`](@ref), which takes a collection of actions for all players, and [`all_observe`](@ref), which returns observations for each player.
+
+## Indicating reward properties
+
+The [`UtilityStyle`](@ref) trait can be used to indicate that the rewards satisfy certain properties, for example that the rewards for all players are identical or that the game is zero-sum.
+
+
+```@docs
+players
+player
+all_act!
+all_observe
+UtilityStyle
+ZeroSum
+ConstantSum
+GeneralSum
+IdenticalUtility
+```
diff --git a/docs/src/optional.md b/docs/src/optional.md
@@ -27,7 +27,8 @@ The optional interface currently contains the following functions:
 - [`valid_actions`](@ref)
 - [`valid_action_mask`](@ref)
 - [`observations`](@ref)
-- [`player`](@ref)
+
+Additional optional functions for multiplayer environments are described in the [Multiplayer Interface](@ref) section.
 
 To propose adding a new function to the interface, please file an issue with the "candidate interface function" label.
 
@@ -43,5 +44,4 @@ setstate!
 valid_actions
 valid_action_mask
 observations
-player
 ```
diff --git a/examples/rock_paper_scissors.jl b/examples/rock_paper_scissors.jl
@@ -0,0 +1,45 @@
+using CommonRLInterface
+
+const RL = CommonRLInterface
+
+# Two-player rock-paper-scissors environment.
+#
+# The whole game state is the single `status` symbol, which is
+#   * `:start` before any move has been made,
+#   * the first player's move once `act!` has stored it, and
+#   * `:done` once `all_act!` has resolved the round.
+mutable struct RockPaperScissors <: AbstractEnv
+    status::Symbol
+end
+
+# A new game begins in the `:start` state.
+function RockPaperScissors()
+    return RockPaperScissors(:start)
+end
+
+# Return `true` when move `a` defeats move `b`:
+# rock beats scissors, scissors beats paper, and paper beats rock.
+# Any other pairing (including a tie) yields `false`.
+function beats(a, b)
+    a == :rock && return b == :scissors
+    a == :scissors && return b == :paper
+    a == :paper && return b == :rock
+    return false
+end
+
+# Really all_act!, actions, terminated, players, and reset! are all that's needed to describe the game
+
+# Resolve a full round at once. `as[1]` is player 1's move and `as[2]` is player 2's move.
+# The environment is marked `:done` and the rewards are returned as a
+# (player 1, player 2) tuple: +1 for the winner, -1 for the loser, 0 each for a tie.
+@provide function RL.all_act!(env::RockPaperScissors, as)
+    env.status = :done
+    first_move, second_move = as[1], as[2]
+    beats(first_move, second_move) && return (1, -1)
+    beats(second_move, first_move) && return (-1, 1)
+    return (0, 0)
+end
+
+# The same three moves are returned regardless of `player`, which is accepted but ignored.
+function RL.actions(env::RockPaperScissors, player=0)
+    return (:rock, :paper, :scissors)
+end
+
+# The game is over once the status has been set to `:done` (see `all_act!`).
+function RL.terminated(env::RockPaperScissors)
+    return env.status == :done
+end
+
+# Put the environment back into its initial state; evaluates to `:start`.
+function RL.reset!(env::RockPaperScissors)
+    env.status = :start
+    return :start
+end
+
+# Players are numbered 1 and 2.
+@provide function RL.players(env::RockPaperScissors)
+    return 1:2
+end
+
+# We may also wish to implement the rest of the required interface
+
+# The observation is always the constant vector `[0]`, independent of the game state.
+function RL.observe(env::RockPaperScissors)
+    return [0]
+end
+
+# Sequential-play entry point.
+#
+# While the game is in the `:start` state, `a` is stored in `status` as the first move
+# and both players receive a reward of 0. On any later call the stored status and `a`
+# are handed to `all_act!`, which resolves the round and returns the rewards.
+function RL.act!(env::RockPaperScissors, a)
+    if env.status != :start
+        # `env.status` holds the earlier move here, so `a` is the second move.
+        return all_act!(env, (env.status, a))
+    end
+    env.status = a
+    return (0, 0)
+end
+
+# Player 1 acts while the game is in its `:start` state; in every other state the
+# answer is player 2.
+@provide function RL.player(env::RockPaperScissors)
+    return env.status == :start ? 1 : 2
+end
+
+# Declare the game as zero-sum.
+@provide function RL.UtilityStyle(env::RockPaperScissors)
+    return ZeroSum()
+end
diff --git a/examples/tictactoe.jl b/examples/tictactoe.jl
@@ -0,0 +1,69 @@
+#XXX not finished yet
+using CommonRLInterface
+
+const RL = CommonRLInterface
+
+# Tic-tac-toe environment.
+#
+# `board` stores one integer per square:
+#   0 = untaken, 1 = taken by x, 2 = taken by o.
+mutable struct TicTacToe <: AbstractEnv
+    board::Matrix{Int}
+end
+
+# A new game starts with an empty 3x3 board.
+function TicTacToe()
+    return TicTacToe(zeros(Int, 3, 3))
+end
+
+# Return `true` if player `p` owns a complete line on board `b`:
+# one of the first three rows, one of the first three columns, or either diagonal.
+function iswinner(b, p)
+    owned(cells) = all(x -> x == p, cells)
+    for i in 1:3
+        owned(view(b, i, :)) && return true   # row i
+        owned(view(b, :, i)) && return true   # column i
+    end
+    all(b[i, i] == p for i in 1:3) && return true   # main diagonal
+    return all(b[i, 4 - i] == p for i in 1:3)       # anti-diagonal
+end
+
+# Map a player index to the opposing player's index: 1 -> 2 and 2 -> 1.
+function other(p)
+    return mod1(p + 1, 2)
+end
+
+# Clear every square in place; `fill!` returns the board itself.
+function RL.reset!(env::TicTacToe)
+    return fill!(env.board, 0)
+end
+
+# All nine (row, column) squares as a vector; `player` is accepted but ignored.
+function RL.actions(env::TicTacToe, player=0)
+    return [(i, j) for j in 1:3 for i in 1:3]
+end
+
+# Observations are symmetrical for both players: from the point of view of the player
+# whose turn it is, +1 marks their own squares, -1 the opponent's squares, 0 empty ones.
+function RL.observe(env::TicTacToe)
+    me = player(env)
+    them = other(me)
+    return (env.board .== me) .- (env.board .== them)
+end
+
+# The game ends when either player has a complete line or no empty square remains.
+function RL.terminated(env::TicTacToe)
+    board = env.board
+    someone_won = iswinner(board, 1) || iswinner(board, 2)
+    return someone_won || !any(iszero, board)
+end
+
+# Place the current player's mark on square `a` (a `(row, column)` tuple) and return
+# the rewards as a (player 1, player 2) tuple: (1, -1) if player 1 has won,
+# (-1, 1) if player 2 has won, and (0, 0) otherwise.
+#
+# NOTE(review): an illegal move (square already taken) gives the mover a losing reward
+# but leaves the board untouched, so `terminated` is not affected by it — confirm that
+# this is the intended behavior.
+function RL.act!(env::TicTacToe, a)
+    mover = player(env)
+    if env.board[a...] != 0
+        # if you take an illegal action, you lose
+        winner = other(mover)
+    else
+        env.board[a...] = mover
+        winner = 0
+        for candidate in players(env)
+            iswinner(env.board, candidate) && (winner = candidate)
+        end
+    end
+
+    winner == 1 && return (1, -1)
+    winner == 2 && return (-1, 1)
+    return (0, 0)
+end
+
+# Players are numbered 1 (x) and 2 (o).
+@provide function RL.players(env::TicTacToe)
+    return 1:2
+end
+
+# Each x adds 1 and each o adds 2 to the board sum, so under alternating legal play the
+# sum is a multiple of 3 exactly when both players have moved equally often, i.e. when
+# it is player 1's turn.
+@provide RL.player(env::TicTacToe) = sum(env.board) % 3 == 0 ? 1 : 2
+
+# Rendering returns the environment itself; the `show` method below draws it as text.
+@provide function RL.render(env::TicTacToe)
+    return env
+end
+
+# Print the board as three lines of the form `|x|o| |`, using a blank for an
+# untaken square, `x` for player 1 and `o` for player 2.
+function Base.show(io::IO, ::MIME"text/plain", env::TicTacToe)
+    glyphs = (' ', 'x', 'o')
+    for row in 1:3
+        for col in 1:3
+            # Board values 0/1/2 map to glyph positions 1/2/3.
+            print(io, '|', glyphs[env.board[row, col] + 1])
+        end
+        println(io, '|')
+    end
+end
+
+# Declare the game as zero-sum.
+@provide function RL.UtilityStyle(::TicTacToe)
+    return ZeroSum()
+end
diff --git a/src/CommonRLInterface.jl b/src/CommonRLInterface.jl
@@ -8,7 +8,6 @@ export
     actions,
     observe,
     act!,
-    player,
     terminated
 
 abstract type AbstractEnv end
@@ -33,9 +32,9 @@ This function is a *static property* of the environment; the value it returns sh
 
 ---
 
-    actions(env::AbstractEnv, i::Integer)
+    actions(env::AbstractEnv, player_index)
 
-Return a collection of all the actions available to player i.
+Return a collection of all the actions available to a given player.
 
 This function is a *static property* of the environment; the value it returns should not change based on the state.
 """
@@ -53,18 +52,33 @@ function observe end
 """
     r = act!(env::AbstractEnv, a)
 
-Take action `a` for the current player, advance AbstractEnv `env` forward one step, and return rewards for all players.
+Take action `a` and advance AbstractEnv `env` forward one step, and return rewards for all players.
 
 This is a *required function* that must be provided by every AbstractEnv.
-"""
-function act! end
 
-"""
-    player(env::AbstractEnv) 
+If the environment has a single player, it is acceptable to return a scalar number. If there are multiple players, it should return a container with all rewards indexed by player number.
+
+# Example
+
+## Single Player
+```julia
+function act!(env::MyMDPEnv, a)
+    env.state += a + randn()
+    return env.state^2
+end
+```
+
+## Two Player
 
-Return the index of the player who should play next in the environment.
+```julia
+function act!(env::MyMDPEnv, a)
+    env.positions[player(env)] += a   # In this game, each player has a position that is updated by his or her action
+    rewards = in_goal.(env.positions) # Rewards are +1 for being in a goal region, 0 otherwise
+    return rewards                    # returns a vector of rewards for each player
+end
+```
 """
-function player end
+function act! end
 
 """
     terminated(env::AbstractEnv)
@@ -171,6 +185,18 @@ export
     valid_action_mask
 include("spaces.jl")
 
+export
+    players,
+    player,
+    all_act!,
+    all_observe,
+    UtilityStyle,
+    ZeroSum,
+    ConstantSum,
+    GeneralSum,
+    IdenticalUtility
+include("multiplayer.jl")
+
 export
     Wrappers
 include("wrappers.jl")

diff --git a/src/multiplayer.jl b/src/multiplayer.jl
@@ -0,0 +1,71 @@
+"""
+    players(env::AbstractEnv)
+
+Return an ordered iterable collection of integer indices for all players, starting with one.
+
+This function is a *static property* of the environment; the value it returns should not change based on the state.
+
+In sequential games, [`player`](@ref) indicates which of these players should act next.
+
+# Example
+
+```julia
+@provide players(::MyEnv) = 1:2
+```
+"""
+function players end
+
+"""
+    player(env::AbstractEnv)
+
+Return the index of the player who should play next in the environment.
+
+Players are not required to take turns in the order returned by [`players`](@ref).
+"""
+function player end
+
+# NOTE(review): the signature below documents `actions::AbstractVector`, but
+# examples/rock_paper_scissors.jl passes a `Tuple` of actions — confirm the intended type.
+"""
+    all_act!(env::AbstractEnv, actions::AbstractVector)
+
+Take `actions` for all players and advance AbstractEnv `env` forward, and return rewards for all players.
+
+`actions` is a collection containing the actions of all players.
+
+Environments that support simultaneous actions by all players should implement this in addition to or instead of `act!`.
+"""
+function all_act! end
+
+"""
+    all_observe(env::AbstractEnv)
+
+Return observations from the environment for all players.
+
+This is the multi-player counterpart of [`observe`](@ref), which in a sequential game returns the observation for the current player only.
+
+Environments that support simultaneous actions by all players should implement this in addition to or instead of `observe`.
+"""
+function all_observe end
+
+"""
+    UtilityStyle(env)
+
+Trait that allows an environment to declare certain properties about the relative utility for the players.
+
+Possible returns are:
+- `ZeroSum()`
+- `ConstantSum()`
+- `GeneralSum()`
+- `IdenticalUtility()`
+
+See the docstrings for each for more details.
+"""
+abstract type UtilityStyle end
+
+"""
+    ZeroSum()
+
+If `UtilityStyle(env) == ZeroSum()` then the sum of the rewards returned by `act!` is always zero.
+"""
+struct ZeroSum <: UtilityStyle end
+"""
+    ConstantSum()
+
+If `UtilityStyle(env) == ConstantSum()` then the sum of the rewards returned by `act!` will always be the same constant.
+"""
+struct ConstantSum <: UtilityStyle end
+"""
+    GeneralSum()
+
+If `UtilityStyle(env) == GeneralSum()`, the sum of rewards over a trajectory can take any form.
+"""
+struct GeneralSum <: UtilityStyle end
+"""
+    IdenticalUtility()
+
+If `UtilityStyle(env) == IdenticalUtility()`, all entries of the reward returned by `act!` will be identical for all players.
+"""
+struct IdenticalUtility <: UtilityStyle end
diff --git a/test/examples/gridworld.jl b/test/examples/gridworld.jl
@@ -3,7 +3,6 @@ module GW
 end
 
 @testset "gridworld" begin
-
     env = GW.GridWorld()
     reset!(env)
     while !terminated(env)

diff --git a/test/examples/rock_paper_scissors.jl b/test/examples/rock_paper_scissors.jl
@@ -0,0 +1,11 @@
+# Load the example into its own module, `RPS`.
+module RPS
+    include("../../examples/rock_paper_scissors.jl")
+end
+
+# Play random moves until the game terminates; the rewards returned by every step
+# must sum to zero.
+@testset "rock paper scissors" begin
+    game = RPS.RockPaperScissors()
+    reset!(game)
+    while !terminated(game)
+        move = rand(actions(game))
+        rewards = act!(game, move)
+        @test sum(rewards) == 0
+    end
+end
diff --git a/test/examples/tictactoe.jl b/test/examples/tictactoe.jl
@@ -0,0 +1,12 @@
+# Load the example into its own module, `TTT`.
+module TTT
+    include("../../examples/tictactoe.jl")
+end
+
+# Play random moves until the game terminates; the rewards returned by every step
+# must sum to zero, and `render` is called after each move.
+@testset "tictactoe" begin
+    game = TTT.TicTacToe()
+    reset!(game)
+    while !terminated(game)
+        move = rand(actions(game))
+        rewards = act!(game, move)
+        @test sum(rewards) == 0
+        render(game)
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -162,8 +162,8 @@ end
     @test observations(env) == 1:10
 end
 
-if VERSION >= v"1.4" # not sure if this is the actual minimal version, but I know it will work
-    include("examples/gridworld.jl")
-end
+include("examples/gridworld.jl")
+include("examples/tictactoe.jl")
+include("examples/rock_paper_scissors.jl")
 
 include("wrappers.jl")