|
| 1 | +# Copyright 2025 Individual Contributor: linxxx3 ([email protected]) |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import asyncio |
| 16 | +import logging |
| 17 | +import os |
| 18 | +import random |
| 19 | +from typing import Any, cast |
| 20 | +from uuid import uuid4 |
| 21 | + |
| 22 | +import hydra |
| 23 | +import ray |
| 24 | +from omegaconf import DictConfig, OmegaConf |
| 25 | + |
| 26 | +from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput |
| 27 | + |
| 28 | +from .trajectory import Trajectory |
| 29 | + |
| 30 | +logger = logging.getLogger(__file__) |
| 31 | +logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN")) |
| 32 | + |
| 33 | + |
| 34 | +class LightningAgentLoop(AgentLoopBase): |
| 35 | + @classmethod |
| 36 | + def init_class(cls, config, tokenizer, processor, **kwargs): |
| 37 | + if cls._class_initialized: |
| 38 | + return |
| 39 | + cls._validate_config(config) |
| 40 | + cls.agent_client_config = OmegaConf.load(config.lightning_trainer.agent_client_config_path) |
| 41 | + logger.info(f"LightningAgentLoop using agent_server_addr: {config.lightning_trainer.agent_server_addr}") |
| 42 | + cls.max_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns |
| 43 | + cls._class_initialized = True |
| 44 | + |
| 45 | + @classmethod |
| 46 | + def _validate_config(cls, config: DictConfig): |
| 47 | + assert config.get("lightning_trainer") is not None, "config.lightning_trainer is required" |
| 48 | + assert config.lightning_trainer.model_name |
| 49 | + assert config.lightning_trainer.agent_server_addr |
| 50 | + assert config.lightning_trainer.agent_client_config_path |
| 51 | + |
| 52 | + async def run(self, sampling_params: dict, **kwargs) -> AgentLoopOutput: |
| 53 | + model_name = self.config.lightning_trainer.model_name |
| 54 | + client = hydra.utils.instantiate( |
| 55 | + self.agent_client_config, |
| 56 | + server_address=self.config.lightning_trainer.agent_server_addr, |
| 57 | + ) |
| 58 | + |
| 59 | + async def _wait_random(min_seconds: int = 0, max_seconds: int = 3): |
| 60 | + wait_time = random.uniform(min_seconds, max_seconds) |
| 61 | + await asyncio.sleep(wait_time) |
| 62 | + |
| 63 | + trace_id = str(uuid4()) |
| 64 | + resp = None |
| 65 | + try: |
| 66 | + await _wait_random() # avoid large amount of simultaneous requests |
| 67 | + logger.debug(f"AgentClient sending request {trace_id=}, {sampling_params=}") |
| 68 | + resp = await client.chat( |
| 69 | + trace_id=trace_id, sampling_params=sampling_params, max_turns=self.max_turns, **kwargs |
| 70 | + ) |
| 71 | + logger.debug(f"AgentClient final response {trace_id=}: {resp}") |
| 72 | + except Exception as e: |
| 73 | + import traceback |
| 74 | + |
| 75 | + # client.chat should not raise exception |
| 76 | + logger.error(f"Error in client.chat, should not happen: {e}") |
| 77 | + traceback.print_exc() |
| 78 | + |
| 79 | + llm_router = ray.get_actor("LLMRouter") # get LLMRouter handler by name |
| 80 | + assert llm_router is not None, "LLMRouter actor not found" |
| 81 | + trajactory = await llm_router.retrieve_trajectory.remote(model_name=model_name, trace_id=trace_id) |
| 82 | + logger.debug(f"Retrieved trajectory for {trace_id=}: {trajactory}") |
| 83 | + |
| 84 | + output = None |
| 85 | + if trajactory is None: |
| 86 | + logger.error(f"Trajectory not found for model: {model_name}, trace_id: {trace_id}") |
| 87 | + try: |
| 88 | + trajactory = cast(Trajectory, trajactory) |
| 89 | + output = _trajectory_to_agent_loop_output(trajactory, resp) |
| 90 | + except Exception as e: |
| 91 | + logger.error(f"Invalid trajectory for model: {model_name}, trace_id: {trace_id}, error: {e}") |
| 92 | + if output is None: |
| 93 | + output = _create_empty_agent_loop_output( |
| 94 | + trace_id=trace_id, |
| 95 | + model_name=model_name, |
| 96 | + prompt_length=self.config.actor_rollout_ref.rollout.prompt_length, |
| 97 | + response_length=self.config.actor_rollout_ref.rollout.response_length, |
| 98 | + pad_token_id=self.tokenizer.pad_token_id, |
| 99 | + final_response=resp, |
| 100 | + ) |
| 101 | + |
| 102 | + ## maybe compute score here |
| 103 | + ## fill in output.reward_score and output.extra_fields["reward_extra_info"] |
| 104 | + return self._postprocess(output) |
| 105 | + |
| 106 | + def _postprocess(self, output: AgentLoopOutput) -> AgentLoopOutput: |
| 107 | + max_response_length = self.config.actor_rollout_ref.rollout.response_length |
| 108 | + |
| 109 | + output.response_ids = output.response_ids[:max_response_length] |
| 110 | + output.response_mask = output.response_mask[:max_response_length] |
| 111 | + assert len(output.response_ids) == len(output.response_mask) |
| 112 | + |
| 113 | + if output.response_logprobs: |
| 114 | + output.response_logprobs = output.response_logprobs[:max_response_length] |
| 115 | + assert len(output.response_ids) == len(output.response_logprobs) |
| 116 | + |
| 117 | + return output |
| 118 | + |
| 119 | + |
| 120 | +def _trajectory_to_agent_loop_output(trajectory: Trajectory, final_response: Any) -> AgentLoopOutput: |
| 121 | + last_item = trajectory.get_last_item() |
| 122 | + if last_item is None: |
| 123 | + raise ValueError(f"Trajectory is empty, model: {trajectory.model_name}, trace_id: {trajectory.trace_id}") |
| 124 | + |
| 125 | + ## TODO: metrics |
| 126 | + output = AgentLoopOutput( |
| 127 | + prompt_ids=last_item.prompt_ids, |
| 128 | + response_ids=last_item.response_ids, |
| 129 | + response_mask=last_item.response_mask, |
| 130 | + response_logprobs=None, |
| 131 | + reward_score=None, |
| 132 | + num_turns=len(trajectory.items), |
| 133 | + metrics={}, |
| 134 | + extra_fields={ |
| 135 | + "model_name": trajectory.model_name, |
| 136 | + "trace_id": trajectory.trace_id, |
| 137 | + "final_response": final_response, |
| 138 | + }, |
| 139 | + ) |
| 140 | + return output |
| 141 | + |
| 142 | + |
| 143 | +def _create_empty_agent_loop_output( |
| 144 | + trace_id: str, model_name: str, prompt_length: int, response_length: int, pad_token_id: int, final_response: Any |
| 145 | +) -> AgentLoopOutput: |
| 146 | + """Create an empty AgentLoopOutput, with padding response_ids and response_mask.""" |
| 147 | + return AgentLoopOutput( |
| 148 | + prompt_ids=[pad_token_id] * prompt_length, |
| 149 | + response_ids=[pad_token_id] * response_length, |
| 150 | + response_mask=[0] * response_length, |
| 151 | + response_logprobs=None, |
| 152 | + reward_score=None, |
| 153 | + num_turns=0, |
| 154 | + metrics={}, |
| 155 | + extra_fields={ |
| 156 | + "model_name": model_name, |
| 157 | + "trace_id": trace_id, |
| 158 | + "final_response": final_response, |
| 159 | + }, |
| 160 | + ) |
0 commit comments