diff --git a/src/policies/q_based_policies/explorers/UCB_explorer.jl b/src/policies/q_based_policies/explorers/UCB_explorer.jl
index 4cb31d0..149f316 100644
--- a/src/policies/q_based_policies/explorers/UCB_explorer.jl
+++ b/src/policies/q_based_policies/explorers/UCB_explorer.jl
@@ -24,20 +24,7 @@ end
 UCBExplorer(na; c = 2.0, ϵ = 1e-10, step = 1, rng = Random.GLOBAL_RNG, is_training = true) =
     UCBExplorer(c, fill(ϵ, na), 1, rng, is_training)
 
-@doc raw"""
-    (ucb::UCBExplorer)(values::AbstractArray)
-Unlike [`EpsilonGreedyExplorer`](@ref), uncertaintyies are considered in UCB.
-
-!!! note
-    If multiple values with the same maximum value are found.
-    Then a random one will be returned!
-
-```math
-A_t = \underset{a}{\arg \max} \left[ Q_t(a) + c \sqrt{\frac{\ln t}{N_t(a)}} \right]
-```
-
-See more details at Section (2.7) on Page 35 of the book *Sutton, Richard S., and Andrew G. Barto. Reinforcement learning: An introduction. MIT press, 2018.*
-""" function (p::UCBExplorer)(values::AbstractArray)
+function (p::UCBExplorer)(values::AbstractArray)
     v, inds = find_all_max(@. values + p.c * sqrt(log(p.step + 1) / p.actioncounts))
     action = sample(p.rng, inds)
     if p.is_training
diff --git a/src/policies/tabular_random_policy.jl b/src/policies/tabular_random_policy.jl
index a802226..1a797c1 100644
--- a/src/policies/tabular_random_policy.jl
+++ b/src/policies/tabular_random_policy.jl
@@ -64,7 +64,7 @@ end
 (p::TabularRandomPolicy)(env::AbstractEnv) =
     sample(p.rng, action_space(env), Weights(prob(p, env), 1.0))
 
-"!!! Assumeing table is already initialized"
+# !!! Assuming table is already initialized
 (p::TabularRandomPolicy{S})(state::S) where S =
     sample(p.rng, Weights(p.table[state], 1.0))
 """
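
For reference, the body of `(p::UCBExplorer)(values)` in the first hunk implements the UCB rule that the removed docstring documented: `A_t = argmax_a [Q_t(a) + c * sqrt(ln t / N_t(a))]`, with ties broken at random. The snippet below is a minimal standalone sketch of that rule, not code from the package; the names `ucb_action`, `values`, `counts`, and `t` are hypothetical and chosen only to illustrate the exploration bonus and the tie-breaking behaviour.

```julia
using Random

# Sketch of UCB action selection: pick argmax of Q(a) + c * sqrt(log(t + 1) / N(a)),
# breaking ties among equally scored actions uniformly at random.
# `values`, `counts`, `t`, and `ucb_action` are illustrative names, not package API.
function ucb_action(values::AbstractVector, counts::AbstractVector, t::Integer;
                    c = 2.0, rng = Random.GLOBAL_RNG)
    # exploration bonus shrinks as an action's visit count grows
    scores = values .+ c .* sqrt.(log(t + 1) ./ counts)
    best = maximum(scores)
    # random tie-breaking, mirroring the `find_all_max` + `sample` pair in the diff
    rand(rng, findall(==(best), scores))
end

# Example: the least-visited action receives the largest bonus
ucb_action([0.5, 0.6, 0.55], [10, 10, 1], 21)
```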