diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml
index 1b1f604114c..22f9e83daaa 100644
--- a/verl/trainer/config/_generated_ppo_trainer.yaml
+++ b/verl/trainer/config/_generated_ppo_trainer.yaml
@@ -606,6 +606,9 @@ algorithm:
   lam: 1.0
   adv_estimator: gae
   norm_adv_by_std_in_grpo: true
+  f_grpo_gamma: 1.0
+  f_grpo_reward_correct: 1.0
+  f_grpo_reward_wrong: 0.0
   use_kl_in_reward: false
   kl_penalty: kl
   kl_ctrl:
diff --git a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml
index 4528e0d667d..4d33cefcdee 100644
--- a/verl/trainer/config/_generated_ppo_veomni_trainer.yaml
+++ b/verl/trainer/config/_generated_ppo_veomni_trainer.yaml
@@ -576,6 +576,9 @@ algorithm:
   lam: 1.0
   adv_estimator: gae
   norm_adv_by_std_in_grpo: true
+  f_grpo_gamma: 1.0
+  f_grpo_reward_correct: 1.0
+  f_grpo_reward_wrong: 0.0
   use_kl_in_reward: false
   kl_penalty: kl
   kl_ctrl:
diff --git a/verl/trainer/config/algorithm.py b/verl/trainer/config/algorithm.py
index 5aa650d7bf9..b71517b6377 100644
--- a/verl/trainer/config/algorithm.py
+++ b/verl/trainer/config/algorithm.py
@@ -575,6 +575,9 @@ class AlgoConfig(BaseConfig):
         lam (float): Trade-off between bias and variance in the GAE estimator.
         adv_estimator (str): Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
         norm_adv_by_std_in_grpo (bool): Whether to normalize advantages by std (specific to GRPO).
+        f_grpo_gamma (Optional[float]): F-GRPO focal exponent γ (>= 0). Required when adv_estimator="f_grpo".
+        f_grpo_reward_correct (Optional[float]): Reward value for correct rollouts (R_c) used in μ̂_pos(x).
+        f_grpo_reward_wrong (Optional[float]): Reward value for wrong rollouts (R_w) used in μ̂_pos(x).
         use_kl_in_reward (bool): Whether to enable in-reward KL penalty.
         kl_penalty (str): How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full".
         kl_ctrl (KLControlConfig): KL control configuration.
@@ -603,6 +606,9 @@ class AlgoConfig(BaseConfig):
     lam: float = 1.0
     adv_estimator: str = "gae"
     norm_adv_by_std_in_grpo: bool = True
+    f_grpo_gamma: Optional[float] = None
+    f_grpo_reward_correct: Optional[float] = None
+    f_grpo_reward_wrong: Optional[float] = None
     use_kl_in_reward: bool = False
     kl_penalty: str = "kl"
     kl_ctrl: KLControlConfig = field(default_factory=KLControlConfig)
diff --git a/verl/trainer/config/ppo_trainer.yaml b/verl/trainer/config/ppo_trainer.yaml
index fd9b59862ae..6e21df50941 100644
--- a/verl/trainer/config/ppo_trainer.yaml
+++ b/verl/trainer/config/ppo_trainer.yaml
@@ -69,12 +69,21 @@ algorithm:

   # Trade-off between bias and variance in the GAE estimator
   lam: 1.0

-  # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
+  # Advantage estimator type: "gae", "grpo", "f_grpo", "reinforce_plus_plus", etc.
   adv_estimator: gae

   # Whether to normalize advantages by std (specific to GRPO)
   norm_adv_by_std_in_grpo: True

+  # F-GRPO focal exponent γ (>= 0), only used when adv_estimator: f_grpo
+  f_grpo_gamma: 1.0
+
+  # F-GRPO reward value for correct rollouts (R_c), used in μ̂_pos(x)
+  f_grpo_reward_correct: 1.0
+
+  # F-GRPO reward value for wrong rollouts (R_w), used in μ̂_pos(x)
+  f_grpo_reward_wrong: 0.0
+
   # Whether to enable in-reward KL penalty
   use_kl_in_reward: False
diff --git a/verl/trainer/ppo/core_algos.py b/verl/trainer/ppo/core_algos.py
index 2039fe56f62..eebf1bd637d 100644
--- a/verl/trainer/ppo/core_algos.py
+++ b/verl/trainer/ppo/core_algos.py
@@ -96,6 +96,7 @@ class AdvantageEstimator(str, Enum):

     GAE = "gae"
     GRPO = "grpo"
+    F_GRPO = "f_grpo"
     REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
     REINFORCE_PLUS_PLUS_BASELINE = "reinforce_plus_plus_baseline"
     REMAX = "remax"
@@ -357,6 +358,90 @@ def compute_grpo_vectorized_outcome_advantage(
     return advantages, advantages


+@register_adv_est(AdvantageEstimator.F_GRPO)
+def compute_f_grpo_outcome_advantage(
+    token_level_rewards: torch.Tensor,
+    response_mask: torch.Tensor,
+    index: np.ndarray,
+    epsilon: float = 1e-6,
+    norm_adv_by_std_in_grpo: bool = True,
+    config: Optional[AlgoConfig] = None,
+    *,
+    focal_gamma: Optional[float] = None,
+    reward_correct: Optional[float] = None,
+    reward_wrong: Optional[float] = None,
+    **kwargs,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    r"""
+    F-GRPO (outcome-only): difficulty-aware (focal) scaling of group-relative advantages.
+
+    We scale the GRPO group-relative advantage with a per-prompt weight g(x):
+
+        \hat{A}_i^{F-GRPO} = g(x) \cdot \hat{A}_i^{GRPO}
+
+    where
+
+        g(x) = (1 - \hat{\mu}_{pos}(x))^{\gamma}, \gamma \ge 0
+        \hat{\mu}_{pos}(x) = (\bar{R}(x) - R_w) / (R_c - R_w)
+
+    Config keys (when called via the trainer with `config`):
+        - algorithm.f_grpo_gamma (float): focal exponent \gamma controlling the down-weighting strength.
+        - algorithm.f_grpo_reward_correct (float): reward value for correct rollouts (R_c).
+        - algorithm.f_grpo_reward_wrong (float): reward value for wrong rollouts (R_w).
+
+    Notes:
+        - Implementation of the F-GRPO paper: https://arxiv.org/abs/2602.06717
+    """
+    if config is not None:
+        norm_adv_by_std_in_grpo = config.get("norm_adv_by_std_in_grpo", norm_adv_by_std_in_grpo)
+        if focal_gamma is None:
+            focal_gamma = config.get("f_grpo_gamma", None)
+        if reward_correct is None:
+            reward_correct = config.get("f_grpo_reward_correct", None)
+        if reward_wrong is None:
+            reward_wrong = config.get("f_grpo_reward_wrong", None)
+
+    if focal_gamma is None or reward_correct is None or reward_wrong is None:
+        raise ValueError(
+            "F-GRPO requires `focal_gamma`, `reward_correct`, and `reward_wrong` to be provided. "
+            "Set them in config or pass them directly to "
+            "`compute_f_grpo_outcome_advantage(..., focal_gamma=..., reward_correct=..., reward_wrong=...)`."
+        )
+
+    focal_gamma = float(focal_gamma)
+    reward_correct = float(reward_correct)
+    reward_wrong = float(reward_wrong)
+
+    if focal_gamma < 0:
+        raise ValueError(f"focal_gamma must be >= 0. Got {focal_gamma}.")
+    denom = reward_correct - reward_wrong
+    if denom <= 0:
+        raise ValueError(
+            f"reward_correct must be > reward_wrong for F-GRPO. Got reward_correct={reward_correct}, "
+            f"reward_wrong={reward_wrong}."
+        )
+
+    with torch.no_grad():
+        scores = token_level_rewards.sum(dim=-1)  # (bs,)
+
+        gidx = as_torch_index(index, device=scores.device)
+        mean_g, std_g, _ = group_mean_std(scores, gidx, eps=epsilon, device=scores.device)
+
+        if norm_adv_by_std_in_grpo:
+            scalars = (scores - mean_g[gidx]) / (std_g[gidx] + epsilon)
+        else:
+            scalars = scores - mean_g[gidx]
+
+        mu_hat = (mean_g - reward_wrong) / denom
+        mu_hat = torch.clamp(mu_hat, 0.0, 1.0)
+        weights = torch.pow(1.0 - mu_hat, focal_gamma)  # (G,)
+
+        scalars = scalars * weights[gidx]
+
+        advantages = scalars.unsqueeze(-1) * response_mask
+    return advantages, advantages
+
+
 @register_adv_est(AdvantageEstimator.GRPO_PASSK)  # or simply: @register_adv_est("grpo_passk")
 def compute_grpo_passk_outcome_advantage(
     token_level_rewards: torch.Tensor,
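For reference (not part of the patch): a minimal, self-contained sketch of the focal weight g(x) described in the docstring above, assuming binary 0/1 outcome rewards (R_c = 1, R_w = 0). The helper name f_grpo_weights and the sample numbers are illustrative only, not verl APIs.

    import torch

    def f_grpo_weights(
        group_mean_reward: torch.Tensor,
        gamma: float,
        reward_correct: float = 1.0,
        reward_wrong: float = 0.0,
    ) -> torch.Tensor:
        # g(x) = (1 - mu_hat_pos(x)) ** gamma, where
        # mu_hat_pos(x) = (R_bar(x) - R_w) / (R_c - R_w), clamped to [0, 1].
        mu_hat = (group_mean_reward - reward_wrong) / (reward_correct - reward_wrong)
        mu_hat = torch.clamp(mu_hat, 0.0, 1.0)
        return torch.pow(1.0 - mu_hat, gamma)

    # Groups that are 90%, 50%, and 10% correct: easier prompts are down-weighted,
    # harder prompts keep most of their group-relative (GRPO) advantage.
    print(f_grpo_weights(torch.tensor([0.9, 0.5, 0.1]), gamma=2.0))
    # tensor([0.0100, 0.2500, 0.8100])

When running through the trainer, the same behavior is enabled by setting algorithm.adv_estimator=f_grpo together with the new keys algorithm.f_grpo_gamma, algorithm.f_grpo_reward_correct, and algorithm.f_grpo_reward_wrong introduced in ppo_trainer.yaml above.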