Update Docs for v0.11 release (#1056)
* update run function

* update docs

* fix naming, update docs

* fix random walk example

* Add wrapper test

* update docs

* update docs

* fix / update docs

* bump trajectories

* fix tests

* add player type

* syntax

* migrate tictactoe

* fix import

* Add RLCore as dependency to RLEnvs

* update player

* Fix tests

* Fix player state in abstract_learner.jl

* type annotations

* Add PlayerNamedTuple

* Fix files

* Simplify Player syntax

* symbol -> player

* Fix tests

* fix

* Move player struct

* Fix tests

* Fix typo

* Fix

* Fix player

* Fix test

* Fix Poker

* Fix wrapper

* Fix tests

* Fix naming

* Fix env tests

* Fix KuhnPoker

* Fix env

* Fix type ambiguity

* Fix pigenv

* Fix tic tac toe

* Fix errors

---------

Co-authored-by: Jeremiah Lewis
jeremiahpslewis authored Mar 26, 2024
1 parent 18b1e1f commit de5893f
Showing 46 changed files with 566 additions and 416 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -12,9 +12,9 @@ ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"

[compat]
Reexport = "0.2, 1"
ReinforcementLearningBase = "0.12"
ReinforcementLearningBase = "0.13"
ReinforcementLearningCore = "0.15"
ReinforcementLearningEnvironments = "0.8"
ReinforcementLearningEnvironments = "0.9"
julia = "1.6"

[extras]
2 changes: 1 addition & 1 deletion docs/Project.toml
@@ -1,6 +1,6 @@
[deps]
ArcadeLearningEnvironment = "b7f77d8d-088d-5e02-8ac0-89aab2acc977"
BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
DemoCards = "311a05b2-6137-4a5a-b473-18580a3d38b5"
2 changes: 1 addition & 1 deletion docs/homepage/guide/index.md
@@ -85,7 +85,7 @@ Usually a closure or a functional object will be used to store some intermediate
In most cases, you don't need to write a customized hook. Some generic hooks are provided so that you can inject logic at the appropriate time:

- [`DoEveryNSteps`](https://juliareinforcementlearning.org/ReinforcementLearning.jl/latest/rl_core/#ReinforcementLearningCore.DoEveryNSteps)
- [`DoEveryNEpisode`](https://juliareinforcementlearning.org/ReinforcementLearning.jl/latest/rl_core/#ReinforcementLearningCore.DoEveryNEpisode)
- [`DoEveryNEpisodes`](https://juliareinforcementlearning.org/ReinforcementLearning.jl/latest/rl_core/#ReinforcementLearningCore.DoEveryNEpisodes)

However, if you do need to write a customized hook, the following methods must be provided:

33 changes: 16 additions & 17 deletions docs/src/How_to_implement_a_new_algorithm.md
@@ -10,43 +10,42 @@ function _run(policy::AbstractPolicy,
stop_condition::AbstractStopCondition,
hook::AbstractHook,
reset_condition::AbstractResetCondition)

push!(policy, PreExperimentStage(), env)
is_stop = false
while !is_stop
reset!(env)
push!(policy, PreEpisodeStage(), env)
optimise!(policy, PreEpisodeStage())

while !reset_condition(policy, env) # one episode
while !check!(reset_condition, policy, env) # one episode
push!(policy, PreActStage(), env)
optimise!(policy, PreActStage())

RLBase.plan!(policy, env)
action = RLBase.plan!(policy, env)
act!(env, action)

push!(policy, PostActStage(), env, action)
optimise!(policy, PostActStage())

if check_stop(stop_condition, policy, env)
if check!(stop_condition, policy, env)
is_stop = true
break
end
end # end of an episode

push!(policy, PostEpisodeStage(), env)
optimise!(policy, PostEpisodeStage())

end
push!(policy, PostExperimentStage(), env)
hook
end

```

Implementing a new algorithm mainly consists of creating your own `AbstractPolicy` (or `AbstractLearner`, see [this section](#using-resources-from-rlcore)) subtype, its action sampling method (by overloading `RLBase.plan!(policy::YourPolicyType, env)`) and implementing its behavior at each stage. However, ReinforcementLearning.jl provides plenty of pre-implemented utilities that you should use to 1) have less code to write, 2) lower the chances of bugs, and 3) make your code more understandable and maintainable (if you intend to contribute your algorithm).
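
For illustration, a minimal custom policy could look like the following sketch. The `MyRandomPolicy` name and its single `rng` field are assumptions for this example, not part of the package:

```julia
using ReinforcementLearning
using Random

# A minimal sketch of a custom policy; the name and field are illustrative.
struct MyRandomPolicy{R<:AbstractRNG} <: AbstractPolicy
    rng::R
end

# Action sampling: pick a random action from the action space.
RLBase.plan!(p::MyRandomPolicy, env::AbstractEnv) = rand(p.rng, action_space(env))

# Stage behavior: overload only the stages you need, e.g. reset internal state
# at the beginning of every episode (a no-op here, shown for structure).
Base.push!(p::MyRandomPolicy, ::PreEpisodeStage, ::AbstractEnv) = nothing
```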

## Using Agents
The recommended way is to use the policy wrapper `Agent`. An agent is itself an `AbstractPolicy` that wraps a policy and a trajectory (also called Experience Replay Buffer in RL literature). Agent comes with default implementations of `push!(agent, stage, env)` and `plan!(agent, env)` that will probably fit what you need at most stages so that you don't have to write them again. Looking at the [source code](https://github.com/JuliaReinforcementLearning/ReinforcementLearning.jl/blob/main/src/ReinforcementLearningCore/src/policies/agent.jl/), we can see that the default Agent calls are
The recommended way is to use the policy wrapper `Agent`. An agent is itself an `AbstractPolicy` that wraps a policy and a trajectory (also called Experience Replay Buffer in reinforcement learning literature). Agent comes with default implementations of `push!(agent, stage, env)` and `plan!(agent, env)` that will probably fit what you need at most stages so that you don't have to write them again. Looking at the [source code](https://github.com/JuliaReinforcementLearning/ReinforcementLearning.jl/blob/main/src/ReinforcementLearningCore/src/policies/agent.jl/), we can see that the default Agent calls are

```julia
function Base.push!(agent::Agent, ::PreEpisodeStage, env::AbstractEnv)
@@ -61,21 +60,21 @@ end

The function `RLBase.plan!(agent::Agent, env::AbstractEnv)` is called at the `action = RLBase.plan!(policy, env)` line. It simply gets an action from the policy of the agent by calling the `RLBase.plan!(your_new_policy, env)` function. At the `PreEpisodeStage()`, the agent pushes the initial state to the trajectory. At the `PostActStage()`, the agent pushes the transition to the trajectory.

If you need a different behavior at some stages, then you can overload the `Base.push!(Agent{<:YourPolicyType}, [stage,] env)` or `Base.push!(Agent{<:Any, <: YourTrajectoryType}, [stage,] env)`, or `Base.plan!`, depending on whether you have a custom policy or just a custom trajectory. For example, many algorithms (such as PPO) need to store an additional trace of the logpdf of the sampled actions and thus overload the function at the `PreActStage()`.
If you need a different behavior at some stages, then you can overload the `Base.push!(Agent{<:YourPolicyType}, [stage,] env)` or `Base.push!(Agent{<:Any, <: YourTrajectoryType}, [stage,] env)`, or `Base.plan!`, depending on whether you have a custom policy or just a custom trajectory. For example, many algorithms (such as PPO) need to store an additional trace of the `logpdf` of the sampled actions and thus overload the function at the `PreActStage()`.
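
As a sketch of that pattern (the `MyPPOPolicy` type is hypothetical and defined here only for illustration):

```julia
# Hypothetical policy type used only for this example.
struct MyPPOPolicy <: AbstractPolicy end

# Custom PreActStage behavior for agents wrapping this policy: push the current
# state into the trajectory, plus whatever extra trace (e.g. a logpdf) your
# algorithm needs.
function Base.push!(agent::Agent{<:MyPPOPolicy}, ::PreActStage, env::AbstractEnv)
    push!(agent.trajectory, (state = state(env),))
end
```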

## Updating the policy

Finally, you need to implement the learning function by implementing `RLBase.optimise!(::YourPolicyType, ::Stage, ::Trajectory)`. By default, this does nothing at any stage. Overload it for the stage at which you wish to optimise (most often, `PostActStage()` or `PostEpisodeStage()`). This function should loop over the trajectory to sample batches. Inside the loop, put whatever is required. For example:

```julia
function RLBase.optimise!(p::YourPolicyType, ::PostEpisodeStage, traj::Trajectory)
for batch in traj
optimise!(p, batch)
function RLBase.optimise!(policy::YourPolicyType, ::PostEpisodeStage, trajectory::Trajectory)
for batch in trajectory
optimise!(policy, batch)
end
end

```
where `optimise!(p, batch)` is a function that will typically compute the gradient and update a neural network, or update a tabular policy. What is inside the loop is free to be whatever you need but it's a good idea to implement a `optimise!(p::YourPolicyType, batch::NamedTuple)` function for clarity instead of coding everything in the loop. This is further discussed in the next section on `Trajectory`s. An example of where this could be different is when you want to update priorities, see [the PER learner](https://github.com/JuliaReinforcementLearning/ReinforcementLearning.jl/blob/main/src/ReinforcementLearningZoo/src/algorithms/dqns/prioritized_dqn.jl) for an example.
where `optimise!(policy, batch)` is a function that will typically compute the gradient and update a neural network, or update a tabular policy. What is inside the loop is free to be whatever you need, but it's a good idea to implement an `optimise!(policy::YourPolicyType, batch::NamedTuple)` function for clarity instead of coding everything in the loop. This is further discussed in the next section on `Trajectory`s.
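
A sketch of such a per-batch method, assuming (purely for illustration) that the policy stores a Flux model and an optimiser state in fields named `model` and `optimiser_state`, and that `loss` is your algorithm's loss function:

```julia
using Flux

function RLBase.optimise!(policy::YourPolicyType, batch::NamedTuple)
    # Unpack the traces sampled by the trajectory's sampler.
    s, a, r = batch.state, batch.action, batch.reward
    grads = Flux.gradient(policy.model) do model
        loss(model, s, a, r)  # replace with your algorithm's loss
    end
    Flux.update!(policy.optimiser_state, policy.model, grads[1])
end
```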

## ReinforcementLearningTrajectories

@@ -112,13 +111,13 @@ ReinforcementLearningTrajectories' design aims to eventually support distributed

The sampler is the object that fetches data from your trajectory to create the `batch` used in the `optimise!` for loop. The simplest one is `BatchSampler{names}(batchsize, rng)`. `batchsize` is the number of elements to sample and `rng` is an optional argument that you may set to a custom rng for reproducibility. `names` is the set of traces the sampler must query. For example, a `BatchSampler{(:state, :action, :next_state)}(32)` will sample a named tuple `(state = [32 states], action = [32 actions], next_state = [32 states offset by one with respect to those in state])`.
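
A minimal usage sketch, following the constructor described above (the trace names and seed are arbitrary):

```julia
using ReinforcementLearningTrajectories
using Random

# Sample batches of 32 transitions, reading only the three named traces;
# the optional second argument is a custom rng for reproducibility.
sampler = BatchSampler{(:state, :action, :next_state)}(32, MersenneTwister(17))
```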

## Using resources from RLCore
## Using resources from ReinforcementLearningCore

RL algorithms typically only differ partially but broadly use the same mechanisms. The subpackage RLCore contains some modules that you can reuse to implement your algorithm.
These will take care of many aspects of training for you. See the [RLCore manual](./rlcore.md)
RL algorithms typically only differ partially but broadly use the same mechanisms. The subpackage ReinforcementLearningCore contains some modules that you can reuse to implement your algorithm.
These will take care of many aspects of training for you. See the [ReinforcementLearningCore manual](./rlcore.md).

### Utils
In utils/distributions.jl you will find implementations of gaussian log probabilities functions that are both GPU compatible and differentiable and that do not require the overhead of using Distributions.jl structs.
In `utils/distributions.jl` you will find implementations of Gaussian log-probability functions that are both GPU-compatible and differentiable, and that do not require the overhead of using `Distributions.jl` structs.
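
For intuition, such a function can be written with plain broadcasting, along these lines (a sketch, not the exact code of `utils/distributions.jl`):

```julia
# Log-density of x under a diagonal Gaussian with mean μ and standard deviation σ.
# Only broadcasting is used, so it is differentiable and runs on GPU arrays as-is.
function gaussian_logpdf(μ, σ, x; ϵ = 1f-8)
    z = (x .- μ) ./ (σ .+ ϵ)
    return -(z .^ 2) ./ 2 .- log.(σ .+ ϵ) .- 0.5f0 * log(2f0 * π)
end
```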

## Conventions
Finally, there are a few "conventions" and good practices that you should follow, especially if you intend to contribute to this package (don't worry we'll be happy to help if needed).
@@ -127,9 +126,9 @@ Finally, there are a few "conventions" and good practices that you should follow
ReinforcementLearning.jl aims to provide a framework for reproducible experiments. To do so, make sure that your policy type has a `rng` field and that all random operations (e.g. action sampling) use `rand(your_policy.rng, args...)`. For trajectory sampling, you can set the sampler's rng to that of the policy when creating an agent, or simply give the sampler its own rng.
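
For example (a sketch reusing the illustrative `MyRandomPolicy` from above; the trace names are assumptions):

```julia
using Random

rng = MersenneTwister(2024)        # one seed controls the whole experiment
policy = MyRandomPolicy(rng)       # the illustrative policy sketched earlier
# Reuse the same rng for trajectory sampling so the run stays reproducible:
sampler = BatchSampler{(:state, :action, :next_state)}(32, rng)
```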

### GPU compatibility
Deep RL algorithms are often much faster when the neural nets are updated on a GPU. For now, we only support CUDA.jl as a backend. This means that you will have to think about the transfer of data between the CPU (where the trajectory is) and the GPU memory (where the neural nets are). To do so you will find in utils/device.jl some functions that do most of the work for you. The ones that you need to know are `send_to_device(device, data)` that sends data to the specified device, `send_to_host(data)` which sends data to the CPU memory (it fallbacks to `send_to_device(Val{:cpu}, data)`) and `device(x)` that returns the device on which `x` is.
Deep RL algorithms are often much faster when the neural nets are updated on a GPU. This means that you will have to think about the transfer of data between the CPU (where the trajectory is) and the GPU memory (where the neural nets are). `Flux.jl` offers `gpu` and `cpu` functions to make it easier to send data back and forth.
Normally, you should be able to write a single implementation of your algorithm that works on CPU and GPUs thanks to the multiple dispatch offered by Julia.

GPU friendlyness will also require that your code does not use _scalar indexing_ (see the CUDA.jl documentation for more information), make sure to test your algorithm on the GPU after disallowing scalar indexing by using `CUDA.allowscalar(false)`.
GPU friendliness will also require that your code does not use _scalar indexing_ (see the `CUDA.jl` or `Metal.jl` documentation for more information); when using `CUDA.jl` make sure to test your algorithm on the GPU after disallowing scalar indexing by using `CUDA.allowscalar(false)`.

Finally, it is a good idea to implement the `Flux.gpu(yourpolicy)` and `cpu(yourpolicy)` functions for user convenience. Be careful: sampling on the GPU requires a specific type of rng; you can generate one with `CUDA.default_rng()`.
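
A sketch of these convenience methods for an illustrative policy type (the `MyDeepPolicy` name and its fields are assumptions):

```julia
using ReinforcementLearning
using Flux, Random, CUDA

# Illustrative deep policy holding a Flux model and an rng.
struct MyDeepPolicy{M,R} <: AbstractPolicy
    model::M
    rng::R
end

# Move the network to the GPU and switch to a CUDA rng for on-device sampling;
# `cpu` does the reverse.
Flux.gpu(p::MyDeepPolicy) = MyDeepPolicy(gpu(p.model), CUDA.default_rng())
Flux.cpu(p::MyDeepPolicy) = MyDeepPolicy(cpu(p.model), Random.default_rng())
```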
113 changes: 40 additions & 73 deletions docs/src/How_to_use_hooks.md
@@ -8,10 +8,12 @@ programming. We write the code in a loop and execute them step by step.

```julia
while true
env |> policy |> env
action = plan!(policy, env)
act!(env, action)

# write your own logic here
# like saving parameters, recording loss function, evaluating policy, etc.
stop_condition(env, policy) && break
check!(stop_condition, env, policy) && break
is_terminated(env) && reset!(env)
end
```
@@ -30,18 +32,19 @@ execution pipeline. However, we believe this is not necessary in Julia. With the
declarative programming approach, we gain much more flexibility.

Now the question is how to design the hook. A natural choice is to wrap the
comments part in the above pseudocode into a function:
comments part in the above pseudo-code into a function:

```julia
while true
env |> policy |> env
hook(policy, env)
stop_condition(env, policy) && break
action = plan!(policy, env)
act!(env, action)
push!(hook, policy, env)
check!(stop_condition, env, policy) && break
is_terminated(env) && reset!(env)
end
```

But sometimes, we'd like to have a more fingrained control. So we split the calling
But sometimes, we'd like to have a more fine-grained control. So we split the calling
of hooks into several different stages:

- [`PreExperimentStage`](@ref)
@@ -54,20 +57,22 @@ of hooks into several different stages:
## How to define a customized hook?

By default, an instance of [`AbstractHook`](@ref) will do nothing when called
with `(hook::AbstractHook)(::AbstractStage, policy, env)`. So when writing a
with `push!(hook::AbstractHook, ::AbstractStage, policy, env)`. So when writing a
customized hook, you only need to implement the necessary runtime logic.

For example, assume we want to record the wall time of each episode.

```@repl how_to_use_hooks
using ReinforcementLearning
import Base.push!
Base.@kwdef mutable struct TimeCostPerEpisode <: AbstractHook
t::UInt64 = time_ns()
time_costs::Vector{UInt64} = []
end
(h::TimeCostPerEpisode)(::PreEpisodeStage, policy, env) = h.t = time_ns()
(h::TimeCostPerEpisode)(::PostEpisodeStage, policy, env) = push!(h.time_costs, time_ns()-h.t)
Base.push!(h::TimeCostPerEpisode, ::PreEpisodeStage, policy, env) = h.t = time_ns()
Base.push!(h::TimeCostPerEpisode, ::PostEpisodeStage, policy, env) = push!(h.time_costs, time_ns()-h.t)
h = TimeCostPerEpisode()
run(RandomPolicy(), CartPoleEnv(), StopAfterNEpisodes(10), h)
h.time_costs
```
@@ -77,14 +82,13 @@ h.time_costs
- [`StepsPerEpisode`](@ref)
- [`RewardsPerEpisode`](@ref)
- [`TotalRewardPerEpisode`](@ref)
- [`TotalBatchRewardPerEpisode`](@ref)

## Periodic jobs

Sometimes, we'd like to periodically run some functions. Two handy hooks are
provided for this kind of task:

- [`DoEveryNEpisode`](@ref)
- [`DoEveryNEpisodes`](@ref)
- [`DoEveryNSteps`](@ref)

Following are some typical usages.
@@ -98,7 +102,7 @@ run(
policy,
CartPoleEnv(),
StopAfterNEpisodes(100),
DoEveryNEpisode(;n=10) do t, policy, env
DoEveryNEpisodes(;n=10) do t, policy, env
# In real world cases, the policy is usually wrapped in an Agent,
# we need to extract the inner policy to run it in the *actor* mode.
# Here for illustration only, we simply use the original policy.
@@ -117,40 +121,33 @@

### Save parameters

[BSON.jl](https://github.com/JuliaIO/BSON.jl) is recommended to save the parameters of a policy.
[JLD2.jl](https://github.com/JuliaIO/JLD2.jl) is recommended to save the parameters of a policy.

```@repl how_to_use_hooks
using Flux
using Flux.Losses: huber_loss
using BSON
using ReinforcementLearning
using JLD2
env = CartPoleEnv(; T = Float32)
ns, na = length(state(env)), length(action_space(env))
env = RandomWalk1D()
ns, na = length(state_space(env)), length(action_space(env))
policy = Agent(
policy = QBasedPolicy(
learner = BasicDQNLearner(
approximator = NeuralNetworkApproximator(
model = Chain(
Dense(ns, 128, relu; init = glorot_uniform),
Dense(128, 128, relu; init = glorot_uniform),
Dense(128, na; init = glorot_uniform),
) |> cpu,
optimizer = Adam(),
),
batchsize = 32,
min_replay_history = 100,
loss_func = huber_loss,
),
explorer = EpsilonGreedyExplorer(
kind = :exp,
ϵ_stable = 0.01,
decay_steps = 500,
QBasedPolicy(;
learner = TDLearner(
TabularQApproximator(n_state = ns, n_action = na),
:SARS;
),
explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
),
trajectory = CircularArraySARTTrajectory(
capacity = 1000,
state = Vector{Float32} => (ns,),
Trajectory(
CircularArraySARTSTraces(;
capacity = 1,
state = Int64 => (),
action = Int64 => (),
reward = Float64 => (),
terminal = Bool => (),
),
DummySampler(),
InsertSampleRatioController(),
),
)
@@ -161,40 +158,10 @@
env,
StopAfterNSteps(10_000),
DoEveryNSteps(n=1_000) do t, p, e
ps = params(p)
f = joinpath(parameters_dir, "parameters_at_step_$t.bson")
BSON.@save f ps
ps = policy.policy.learner.approximator
f = joinpath(parameters_dir, "parameters_at_step_$t.jld2")
JLD2.@save f ps
println("parameters at step $t saved to $f")
end
)
```
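
To restore a saved approximator later, something along these lines should work (a sketch; the step number is arbitrary):

```julia
using JLD2

f = joinpath(parameters_dir, "parameters_at_step_1000.jld2")
ps = JLD2.load(f, "ps")  # returns the object that was saved under the name `ps`
```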

### Logging data

Below we demonstrate how to use
[TensorBoardLogger.jl](https://github.com/PhilipVinc/TensorBoardLogger.jl) to
log runtime metrics. But users could also use other tools like
[wandb](https://wandb.ai/site) through
[PyCall.jl](https://github.com/JuliaPy/PyCall.jl).


```@repl how_to_use_hooks
using TensorBoardLogger
using Logging
tf_log_dir = "logs"
lg = TBLogger(tf_log_dir, min_level = Logging.Info)
total_reward_per_episode = TotalRewardPerEpisode()
hook = ComposedHook(
total_reward_per_episode,
DoEveryNEpisode() do t, agent, env
with_logger(lg) do
@info "training" reward = total_reward_per_episode.rewards[end]
end
end
)
run(RandomPolicy(), CartPoleEnv(), StopAfterNEpisodes(50), hook)
readdir(tf_log_dir)
```

Then run `tensorboard --logdir logs` and open the link on the screen in your
browser. (Obviously you need to install tensorboard first.)