
Commit e960e3a

Robust (#16)
* New interface for Q transformations
* Implementation of CVaR
* Add costs
1 parent 2c6de02 commit e960e3a

6 files changed: +197 −77 lines changed


ss2r/algorithms/sac/losses.py

Lines changed: 27 additions & 37 deletions
```diff
@@ -25,6 +25,8 @@
 from brax.training.agents.sac import networks as sac_networks
 from brax.training.types import Params, PRNGKey
 
+from ss2r.algorithms.sac.robustness import QTransformation, SACBase
+
 Transition: TypeAlias = types.Transition
 
 
@@ -70,10 +72,12 @@ def critic_loss(
     alpha: jnp.ndarray,
     transitions: Transition,
     key: PRNGKey,
-    exploration_bonus: bool = True,
     safe: bool = False,
+    target_q_fn: QTransformation = SACBase(),
 ) -> jnp.ndarray:
-    domain_params = transitions.extras.get("domain_parameters", None)
+    domain_params = transitions.extras["state_extras"].get(
+        "domain_parameters", None
+    )
     if domain_params is not None:
         action = jnp.concatenate([transitions.action, domain_params], axis=-1)
     else:
@@ -83,46 +87,30 @@ def critic_loss(
     q_old_action = q_network.apply(
         normalizer_params, q_params, transitions.observation, action
     )
-    next_dist_params = policy_network.apply(
-        normalizer_params, policy_params, transitions.next_observation
-    )
-    next_action = parametric_action_distribution.sample_no_postprocessing(
-        next_dist_params, key
-    )
-    next_log_prob = parametric_action_distribution.log_prob(
-        next_dist_params, next_action
-    )
-    next_action = parametric_action_distribution.postprocess(next_action)
-    if domain_params is not None:
-        next_action = jnp.concatenate([next_action, domain_params], axis=-1)
-    next_q = q_network.apply(
-        normalizer_params,
-        target_q_params,
-        transitions.next_observation,
-        next_action,
-    )
-    if safe:
-        next_v = jnp.mean(next_q, axis=-1)
-    else:
-        next_v = jnp.min(next_q, axis=-1)
-    if exploration_bonus:
-        next_v -= alpha * next_log_prob
-    reward = transitions.reward
-    if safe:
-        assert "imagined_cost" in transitions.extras or "cost" in transitions.extras
-        reward = transitions.extras.get(
-            "imagined_cost",
-            transitions.extras.get("cost", jnp.zeros_like(transitions.reward)),
+
+    def policy(obs: jax.Array) -> tuple[jax.Array, jax.Array]:
+        next_dist_params = policy_network.apply(
+            normalizer_params, policy_params, obs
+        )
+        next_action = parametric_action_distribution.sample_no_postprocessing(
+            next_dist_params, key
         )
-    target_q = jax.lax.stop_gradient(
-        reward * reward_scaling + transitions.discount * gamma * next_v
+        next_log_prob = parametric_action_distribution.log_prob(
+            next_dist_params, next_action
+        )
+        next_action = parametric_action_distribution.postprocess(next_action)
+        return next_action, next_log_prob
+
+    q_fn = lambda obs, action: q_network.apply(
+        normalizer_params, target_q_params, obs, action
+    )
+    target_q = target_q_fn(
+        transitions, q_fn, policy, gamma, domain_params, alpha, reward_scaling
     )
     q_error = q_old_action - jnp.expand_dims(target_q, -1)
-
     # Better bootstrapping for truncated episodes.
     truncation = transitions.extras["state_extras"]["truncation"]
     q_error *= jnp.expand_dims(1 - truncation, -1)
-
     q_loss = 0.5 * jnp.mean(jnp.square(q_error))
     return q_loss
 
@@ -145,7 +133,9 @@ def actor_loss(
     )
     log_prob = parametric_action_distribution.log_prob(dist_params, action)
     action = parametric_action_distribution.postprocess(action)
-    domain_params = transitions.extras.get("domain_parameters", None)
+    domain_params = transitions.extras["state_extras"].get(
+        "domain_parameters", None
+    )
     if domain_params is not None:
         action = jnp.concatenate([action, domain_params], axis=-1)
     qr_action = qr_network.apply(
```
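With this change the critic loss no longer hard-codes the SAC target: any callable matching the `QTransformation` protocol (defined in `ss2r/algorithms/sac/robustness.py` below) can be passed as `target_q_fn`. A minimal sketch, assuming the call signature shown in the diff; `MeanTarget` is a hypothetical example, not part of this commit:

```python
import jax
import jax.numpy as jnp


# Hypothetical transform: bootstrap from the Q-ensemble mean and skip the
# entropy bonus. It could be plugged in via critic_loss(..., target_q_fn=MeanTarget())
# in place of the default SACBase().
class MeanTarget:
    def __call__(
        self,
        transitions,
        q_fn,
        policy,
        gamma,
        domain_params=None,
        alpha=None,
        reward_scaling=1.0,
    ):
        next_action, _ = policy(transitions.next_observation)
        if domain_params is not None:
            next_action = jnp.concatenate([next_action, domain_params], axis=-1)
        # Average over the Q ensemble instead of taking the pessimistic min.
        next_v = q_fn(transitions.next_observation, next_action).mean(axis=-1)
        return jax.lax.stop_gradient(
            transitions.reward * reward_scaling
            + transitions.discount * gamma * next_v
        )
```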

ss2r/algorithms/sac/robustness.py

Lines changed: 130 additions & 0 deletions
New file, full contents (`@@ -0,0 +1,130 @@`):

```python
from typing import Callable, Protocol

import jax
import jax.numpy as jnp
from brax.training.types import Params, Transition


class QTransformation(Protocol):
    def __call__(
        self,
        transitions: Transition,
        q_fn: Callable[[Params, jax.Array], jax.Array],
        policy: Callable[[jax.Array], tuple[jax.Array, jax.Array]],
        gamma: float,
        domain_params: jax.Array | None = None,
        alpha: jax.Array | None = None,
        reward_scaling: float = 1.0,
    ):
        ...


class LCB(QTransformation):
    def __init__(self, lambda_: float) -> None:
        self.lambda_ = lambda_

    def __call__(
        self,
        transitions: Transition,
        q_fn: Callable[[Params, jax.Array], jax.Array],
        policy: Callable[[jax.Array], tuple[jax.Array, jax.Array]],
        gamma: float,
        domain_params: jax.Array | None = None,
        alpha: jax.Array | None = None,
        reward_scaling: float = 1.0,
    ):
        next_obs = transitions.extras["state_extras"]["state_propagation"]["next_obs"]
        next_action, _ = policy(next_obs)
        if domain_params is not None:
            domain_params = jnp.tile(
                domain_params[:, None], (1, next_action.shape[1], 1)
            )
            next_action = jnp.concatenate([next_action, domain_params], axis=-1)
        next_q = q_fn(next_obs, next_action)
        next_v = next_q.mean(axis=-1)
        std = jnp.std(next_v, axis=-1)
        cost = transitions.extras["state_extras"]["cost"]
        cost += self.lambda_ * std
        target_q = jax.lax.stop_gradient(
            cost * reward_scaling + transitions.discount * gamma * next_v
        )
        return target_q


class CVaR(QTransformation):
    def __init__(self, confidence: float) -> None:
        self.confidence = confidence

    def __call__(
        self,
        transitions: Transition,
        q_fn: Callable[[Params, jax.Array], jax.Array],
        policy: Callable[[jax.Array], tuple[jax.Array, jax.Array]],
        gamma: float,
        domain_params: jax.Array | None = None,
        alpha: jax.Array | None = None,
        reward_scaling: float = 1.0,
    ):
        next_obs = transitions.extras["state_extras"]["state_propagation"]["next_obs"]
        next_action, _ = policy(next_obs)
        if domain_params is not None:
            domain_params = jnp.tile(
                domain_params[:, None], (1, next_action.shape[1], 1)
            )
            next_action = jnp.concatenate([next_action, domain_params], axis=-1)
        next_q = q_fn(next_obs, next_action)
        next_v = next_q.mean(axis=-1)
        sort_next_v = jnp.sort(next_v, axis=-1)
        cvar_index = int((1 - self.confidence) * next_v.shape[1])
        next_v = jnp.mean(sort_next_v[:, :cvar_index], axis=-1)
        cost = transitions.extras["state_extras"]["cost"]
        target_q = jax.lax.stop_gradient(
            cost * reward_scaling + transitions.discount * gamma * next_v
        )
        return target_q


class SACBase(QTransformation):
    def __call__(
        self,
        transitions: Transition,
        q_fn: Callable[[Params, jax.Array], jax.Array],
        policy: Callable[[jax.Array], tuple[jax.Array, jax.Array]],
        gamma: float,
        domain_params: jax.Array | None = None,
        alpha: jax.Array | None = None,
        reward_scaling: float = 1.0,
    ):
        next_action, next_log_prob = policy(transitions.next_observation)
        if domain_params is not None:
            next_action = jnp.concatenate([next_action, domain_params], axis=-1)
        next_q = q_fn(transitions.next_observation, next_action)
        next_v = next_q.min(axis=-1)
        next_v -= alpha * next_log_prob
        target_q = jax.lax.stop_gradient(
            transitions.reward * reward_scaling + transitions.discount * gamma * next_v
        )
        return target_q


class SACCost(QTransformation):
    def __call__(
        self,
        transitions: Transition,
        q_fn: Callable[[Params, jax.Array], jax.Array],
        policy: Callable[[jax.Array], tuple[jax.Array, jax.Array]],
        gamma: float,
        domain_params: jax.Array | None = None,
        alpha: jax.Array | None = None,
        reward_scaling: float = 1.0,
    ):
        next_action, _ = policy(transitions.next_observation)
        if domain_params is not None:
            next_action = jnp.concatenate([next_action, domain_params], axis=-1)
        next_q = q_fn(transitions.next_observation, next_action)
        next_v = next_q.mean(axis=-1)
        cost = transitions.extras["state_extras"]["cost"]
        target_q = jax.lax.stop_gradient(
            cost * reward_scaling + transitions.discount * gamma * next_v
        )
        return target_q
```

ss2r/algorithms/sac/train.py

Lines changed: 21 additions & 24 deletions
```diff
@@ -35,11 +35,8 @@
 
 import ss2r.algorithms.sac.losses as sac_losses
 import ss2r.algorithms.sac.networks as sac_networks
-from ss2r.algorithms.sac.wrappers import (
-    DomainRandomizationParams,
-    StatePropagation,
-    std_bonus,
-)
+from ss2r.algorithms.sac.robustness import SACCost
+from ss2r.algorithms.sac.wrappers import DomainRandomizationParams, StatePropagation
 from ss2r.rl.evaluation import ConstraintsEvaluator
 
 Metrics: TypeAlias = types.Metrics
@@ -172,6 +169,8 @@ def train(
     lagrange_multiplier: float = 1e-9,
     penalty_multiplier: float = 1.0,
     penalty_multiplier_factor: float = 1.0,
+    cost_q_transform: str | None = None,
+    cvar_confidence: float = 0.95,
 ):
     """SAC training."""
     process_id = jax.process_index()
@@ -247,12 +246,7 @@
     else:
         domain_parameters = None
     if propagation is not None:
-        cost_penalty_fn = (
-            functools.partial(std_bonus, lambda_=cost_penalty)
-            if cost_penalty is not None
-            else None
-        )
-        env = StatePropagation(env, cost_penalty_fn=cost_penalty_fn)
+        env = StatePropagation(env)
 
     obs_size = env.observation_size
     action_size = env.action_size
@@ -287,12 +281,14 @@
         "policy_extras": {},
     }
     if domain_parameters is not None:
-        extras["domain_parameters"] = domain_parameters[0]
+        extras["state_extras"]["domain_parameters"] = domain_parameters[0]  # type: ignore
     if safe:
-        if propagation is not None and cost_penalty is not None:
-            extras["imagined_cost"] = 0.0
-        else:
-            extras["cost"] = 0.0
+        if propagation is not None:
+            extras["state_extras"]["state_propagation"] = {  # type: ignore
+                "next_obs": jnp.tile(dummy_obs, (num_envs,) + (1,) * dummy_obs.ndim),
+                "rng": rng,
+            }
+        extras["state_extras"]["cost"] = 0.0  # type: ignore
 
     dummy_transition = Transition(  # pytype: disable=wrong-arg-types  # jax-ndarray
         observation=dummy_obs,
@@ -372,8 +368,8 @@ def sgd_step(
             alpha,
             transitions,
             key_critic,
-            False,
             True,
+            SACCost(),
             optimizer_state=training_state.qc_optimizer_state,
         )
         cost_metrics = {
@@ -457,9 +453,13 @@ def get_experience(
         ReplayBufferState,
     ]:
         policy = make_policy((normalizer_params, policy_params))
-        extra_fields = ("truncation",) + tuple(
-            key for key in extras.keys() if key not in ["state_extras", "policy_extras"]
-        )
+        extra_fields = ("truncation",)
+        if domain_parameters is not None:
+            extra_fields += ("domain_parameters",)  # type: ignore
+        if propagation is not None:
+            extra_fields += ("state_propagation",)  # type: ignore
+        if safe:
+            extra_fields += ("cost",)  # type: ignore
         step = lambda state: acting.actor_step(
            env, state, policy, key, extra_fields=extra_fields
        )
@@ -469,10 +469,7 @@ def get_experience(
            normalizer_params, transitions.observation, pmap_axis_name=_PMAP_AXIS_NAME
        )
        if transitions.observation.ndim == 3:
-            transitions = jax.tree_util.tree_map(
-                lambda x: x[0],
-                transitions,
-            )
+            transitions = jax.tree_util.tree_map(lambda x: x[0], transitions)
        buffer_state = replay_buffer.insert(buffer_state, transitions)
        return normalizer_params, env_state, buffer_state
 
```
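After these changes, everything the actor step records lives under `state_extras` in the dummy `extras` pytree used to size the replay buffer. A rough sketch of its layout inside `train()` for a safe run with both domain randomization and state propagation; field names are taken from the diff, while the `truncation` entry and exact shapes are assumptions carried over from the upstream brax SAC trainer:

```python
# Sketch only; dtypes and shapes depend on the environment and num_envs.
extras = {
    "state_extras": {
        "truncation": 0.0,  # assumed, as in the upstream brax SAC trainer
        "domain_parameters": domain_parameters[0],
        "state_propagation": {
            "next_obs": jnp.tile(dummy_obs, (num_envs,) + (1,) * dummy_obs.ndim),
            "rng": rng,
        },
        "cost": 0.0,
    },
    "policy_extras": {},
}
```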

ss2r/algorithms/sac/wrappers.py

Lines changed: 15 additions & 14 deletions
```diff
@@ -28,39 +28,40 @@ class StatePropagation(Wrapper):
     This wrapper assumes that the environment is wrapped before with a VmapWrapper or DomainRandomizationVmapWrapper
     """
 
-    def __init__(self, env, propagation_fn=ts1, cost_penalty_fn=None):
+    def __init__(self, env, propagation_fn=ts1):
         super().__init__(env)
-        self.cost_penalty_fn = cost_penalty_fn
         self.propagation_fn = propagation_fn
         self.num_envs = None
 
     def reset(self, rng: jax.Array) -> State:
         if self.num_envs is None:
             self.num_envs = rng.shape[0]
         state = self.env.reset(rng)
-        if "propagation_rng" in state.info:
-            propagation_rng = state.info["propagation_rng"]
-        else:
-            propagation_rng = jax.random.split(rng[0])[1]
+        propagation_rng = jax.random.split(rng[0])[1]
         n_key, key = jax.random.split(propagation_rng)
-        state.info["propagation_rng"] = jax.random.split(n_key, self.num_envs)
-        state.info["imagined_cost"] = jnp.zeros(self.num_envs)
-        return self.propagation_fn(state, key)
+        state.info["state_propagation"] = {}
+        state.info["state_propagation"]["rng"] = jax.random.split(n_key, self.num_envs)
+        orig_next_obs = state.obs
+        state = self.propagation_fn(state, key)
+        state.info["state_propagation"]["next_obs"] = orig_next_obs
+        return state
 
     def step(self, state: State, action: jax.Array) -> State:
         # The order here matters, the tree_map changes the dimensions of
         # the propgattion_rng
-        propagation_rng = state.info["propagation_rng"]
+        propagation_rng = state.info["state_propagation"]["rng"]
         tile = lambda tree: jax.tree_map(
             lambda x: jnp.tile(x, (self.num_envs,) + (1,) * x.ndim), tree
         )
         state, action = tile(state), tile(action)
         nstate = self.env.step(state, action)
         n_key, key = jax.random.split(propagation_rng)
-        nstate.info["propagation_rng"] = jax.random.split(n_key, self.num_envs)
-        if self.cost_penalty_fn is not None:
-            nstate.info["imagined_cost"] += self.cost_penalty_fn(nstate)
-        return self.propagation_fn(nstate, key)
+        orig_next_obs = nstate.obs
+        nstate.info["state_propagation"]["rng"] = jax.random.split(n_key, self.num_envs)
+        nstate.info["state_propagation"]["next_obs"] = nstate.obs
+        nstate = self.propagation_fn(nstate, key)
+        nstate.info["state_propagation"]["next_obs"] = orig_next_obs
+        return nstate
 
 
 def get_randomized_values(sys_v, in_axes):
```
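With the cost penalty removed from the wrapper, `StatePropagation` now only tracks the propagation RNG and the pre-propagation observations under `state.info["state_propagation"]`. A minimal usage sketch, assuming an env already wrapped as the docstring requires; `vmapped_env` is a placeholder, not a name from this repository:

```python
import jax
from ss2r.algorithms.sac.wrappers import StatePropagation

num_envs = 8
# vmapped_env stands in for an env wrapped with a VmapWrapper or
# DomainRandomizationVmapWrapper, as the class docstring requires.
env = StatePropagation(vmapped_env)
rng = jax.random.split(jax.random.PRNGKey(0), num_envs)
state = env.reset(rng)
# Per-randomization observations recorded before propagation_fn collapses them;
# this is the particle dimension later consumed by the LCB/CVaR transforms.
next_obs = state.info["state_propagation"]["next_obs"]
```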

ss2r/configs/agent/sac.yaml

Lines changed: 3 additions & 1 deletion
```diff
@@ -19,4 +19,6 @@ cost_penalty: null
 propagation: standard
 lagrange_multiplier: 0.0001
 penalty_multiplier: 5e-8
-penalty_multiplier_factor: 8e-6
+penalty_multiplier_factor: 8e-6
+cost_q_transform: cvar
+cvar_confidence: 0.95
```
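The new config keys mirror the `cost_q_transform` and `cvar_confidence` arguments added to `train()`. Note that the displayed train.py hunk still passes `SACCost()` directly to the cost-critic update, so how the string is routed to `CVaR` or `LCB` is not visible in this diff; the following is a purely illustrative sketch of such a selection, with a hypothetical helper name and the assumption that `cost_penalty` would serve as the LCB weight:

```python
# Hypothetical wiring, not shown in this commit's hunks.
from ss2r.algorithms.sac.robustness import CVaR, LCB, SACCost


def make_cost_q_transform(cost_q_transform, cvar_confidence, cost_penalty=None):
    if cost_q_transform == "cvar":
        return CVaR(confidence=cvar_confidence)
    if cost_q_transform == "lcb":
        return LCB(lambda_=cost_penalty)
    return SACCost()
```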
