
Commit cc214e2

Author: lilinchuan (committed)
[RFC] Recipe for an agent-lightning-like RL training pipeline #3434
1 parent 634bd93 commit cc214e2

29 files changed: +2297 -0 lines changed
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Copyright 2025 Individual Contributor: linxxx3 ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
# Copyright 2025 Individual Contributor: linxxx3 ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Any


class AgentClientBase(ABC):
    """Agent client base class."""

    def __init__(self, server_address: str, **kwargs):
        if server_address.startswith("http"):
            self.server_address_full = server_address
        else:
            self.server_address_full = f"http://{server_address}"

    @abstractmethod
    async def chat(self, trace_id: str, sampling_params: dict[str, Any], **kwargs) -> Any:
        """Custom chat function.

        Note: use an async HTTP client such as aiohttp in this function, to avoid blocking the event loop.

        Args:
            trace_id: trace id for collecting the trajectory
            sampling_params: sampling parameters, e.g., temperature, top_p, max_tokens, etc.
            **kwargs: non-tensor fields of a data sample from RLHFDataset
        """
        raise NotImplementedError
Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
# Copyright 2025 Individual Contributor: linxxx3 ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import logging
import os
import random
from typing import Any, cast
from uuid import uuid4

import hydra
import ray
from omegaconf import DictConfig, OmegaConf

from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput

from .trajectory import Trajectory

logger = logging.getLogger(__file__)
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))


class LightningAgentLoop(AgentLoopBase):
    @classmethod
    def init_class(cls, config, tokenizer, processor, **kwargs):
        if cls._class_initialized:
            return
        cls._validate_config(config)
        cls.agent_client_config = OmegaConf.load(config.lightning_trainer.agent_client_config_path)
        logger.info(f"LightningAgentLoop using agent_server_addr: {config.lightning_trainer.agent_server_addr}")
        cls.max_turns = config.actor_rollout_ref.rollout.multi_turn.max_assistant_turns
        cls._class_initialized = True

    @classmethod
    def _validate_config(cls, config: DictConfig):
        assert config.get("lightning_trainer") is not None, "config.lightning_trainer is required"
        assert config.lightning_trainer.model_name
        assert config.lightning_trainer.agent_server_addr
        assert config.lightning_trainer.agent_client_config_path

    async def run(self, sampling_params: dict, **kwargs) -> AgentLoopOutput:
        model_name = self.config.lightning_trainer.model_name
        client = hydra.utils.instantiate(
            self.agent_client_config,
            server_address=self.config.lightning_trainer.agent_server_addr,
        )

        async def _wait_random(min_seconds: int = 0, max_seconds: int = 3):
            wait_time = random.uniform(min_seconds, max_seconds)
            await asyncio.sleep(wait_time)

        trace_id = str(uuid4())
        resp = None
        try:
            await _wait_random()  # avoid a large number of simultaneous requests
            logger.debug(f"AgentClient sending request {trace_id=}, {sampling_params=}")
            resp = await client.chat(
                trace_id=trace_id, sampling_params=sampling_params, max_turns=self.max_turns, **kwargs
            )
            logger.debug(f"AgentClient final response {trace_id=}: {resp}")
        except Exception as e:
            import traceback

            # client.chat should not raise exceptions
            logger.error(f"Error in client.chat, should not happen: {e}")
            traceback.print_exc()

        llm_router = ray.get_actor("LLMRouter")  # get the LLMRouter handle by name
        assert llm_router is not None, "LLMRouter actor not found"
        trajectory = await llm_router.retrieve_trajectory.remote(model_name=model_name, trace_id=trace_id)
        logger.debug(f"Retrieved trajectory for {trace_id=}: {trajectory}")

        output = None
        if trajectory is None:
            logger.error(f"Trajectory not found for model: {model_name}, trace_id: {trace_id}")
        try:
            trajectory = cast(Trajectory, trajectory)
            output = _trajectory_to_agent_loop_output(trajectory, resp)
        except Exception as e:
            logger.error(f"Invalid trajectory for model: {model_name}, trace_id: {trace_id}, error: {e}")
        if output is None:
            output = _create_empty_agent_loop_output(
                trace_id=trace_id,
                model_name=model_name,
                prompt_length=self.config.actor_rollout_ref.rollout.prompt_length,
                response_length=self.config.actor_rollout_ref.rollout.response_length,
                pad_token_id=self.tokenizer.pad_token_id,
                final_response=resp,
            )

        ## maybe compute score here
        ## fill in output.reward_score and output.extra_fields["reward_extra_info"]
        return self._postprocess(output)

    def _postprocess(self, output: AgentLoopOutput) -> AgentLoopOutput:
        max_response_length = self.config.actor_rollout_ref.rollout.response_length

        output.response_ids = output.response_ids[:max_response_length]
        output.response_mask = output.response_mask[:max_response_length]
        assert len(output.response_ids) == len(output.response_mask)

        if output.response_logprobs:
            output.response_logprobs = output.response_logprobs[:max_response_length]
            assert len(output.response_ids) == len(output.response_logprobs)

        return output


def _trajectory_to_agent_loop_output(trajectory: Trajectory, final_response: Any) -> AgentLoopOutput:
    last_item = trajectory.get_last_item()
    if last_item is None:
        raise ValueError(f"Trajectory is empty, model: {trajectory.model_name}, trace_id: {trajectory.trace_id}")

    ## TODO: metrics
    output = AgentLoopOutput(
        prompt_ids=last_item.prompt_ids,
        response_ids=last_item.response_ids,
        response_mask=last_item.response_mask,
        response_logprobs=None,
        reward_score=None,
        num_turns=len(trajectory.items),
        metrics={},
        extra_fields={
            "model_name": trajectory.model_name,
            "trace_id": trajectory.trace_id,
            "final_response": final_response,
        },
    )
    return output


def _create_empty_agent_loop_output(
    trace_id: str, model_name: str, prompt_length: int, response_length: int, pad_token_id: int, final_response: Any
) -> AgentLoopOutput:
    """Create an empty AgentLoopOutput, with padded response_ids and response_mask."""
    return AgentLoopOutput(
        prompt_ids=[pad_token_id] * prompt_length,
        response_ids=[pad_token_id] * response_length,
        response_mask=[0] * response_length,
        response_logprobs=None,
        reward_score=None,
        num_turns=0,
        metrics={},
        extra_fields={
            "model_name": model_name,
            "trace_id": trace_id,
            "final_response": final_response,
        },
    )
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
hydra:
  searchpath:
    - file://verl/trainer/config

defaults:
  - ppo_trainer
  - _self_

# config for the rollout
rollout:
  agent:
    # custom agent loop class, should be a subclass of AgentLoopBase
    agent_loop_config_path: null

# custom config for the agent-lightning-like trainer
lightning_trainer:

  # model name used in the agent server
  model_name: Default

  # custom agent client class, should be a subclass of AgentClientBase
  agent_client_config_path: null

  # standalone custom agent server address, in the format "ip:port" or "https://ip:port"
  agent_server_addr: null

  # health-check URL path of the agent server
  health_check_url: /health

  health_check_timeout: 60

  # request header name for the trace_id, used by llm_router to manage trajectories.
  # the header should be included in the request chain from agent client to agent server to llm_router
  request_header_trace_id: trace_id

  # tool call parser, used by llm_router to extract tool calls from model responses;
  # inherits from rollout.multi_turn.format by default
  tool_call_parser: ${oc.select:actor_rollout_ref.rollout.multi_turn.format,hermes}

  # reasoning parser, used by llm_router to extract reasoning_content from model responses
  reasoning_parser: null
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# Agent-Lightning-like RL training Example

Agent-Lightning-like is an RL training recipe inspired by Agent Lightning (https://arxiv.org/abs/2508.03680). You can train almost **ANY** agent by writing a few lines of code. More importantly, the agent can run as a service in an independent Python environment or even on a separate machine. That makes training simpler, especially when you have a complex agent system.

Here is a tiny example that demonstrates how to use this recipe. The example uses the OpenAI Agents SDK, but the recipe does not restrict which framework you use to write the agent.

## Prepare agent server

Wrap the agent as an HTTP service if you don't have one. As an example, `agent_server.py` demonstrates how to set up a `/chat` API endpoint with an integrated `calc_gsm8k_reward` tool.

We need to inject two things into the agent: the LLM service URL and additional request headers.

The LLM service URL is provided after veRL training starts. The agent obtains it by calling `get_llm_server_address`, defined in `recipe/agent_lightning_like/notify.py`:

```python
# model_provider.py
DEFAULT_MODEL_NAME = "Default"


class CustomModelProvider(ModelProvider):
    def get_model(self, model_name: str | None) -> Model:
        model_configs = get_model_configs()
        model_name = model_name or DEFAULT_MODEL_NAME
        if model_name not in model_configs:
            raise ValueError(f"Model {model_name} not found in model configs: {model_configs.keys()}")
        config = model_configs[model_name]
        base_url = config["base_url"]
        api_key = config.get("api_key", "")
        client = AsyncOpenAI(base_url=base_url, api_key=api_key)
        return OpenAIChatCompletionsModel(model=model_name, openai_client=client)


def get_model_configs():
    """Demo: get model configurations from LLM_SERVER_NOTIFY_FILE."""
    server_address = get_llm_server_address()
    base_url = f"http://{server_address}/v1"
    model_configs = {
        DEFAULT_MODEL_NAME: {
            "base_url": base_url,
            "api_key": "",
        },
    }
    return model_configs
```

The `LLM_SERVER_NOTIFY_FILE` environment variable sets the file that passes the LLM endpoint from the trainer to the agent server.
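The notify helper itself is not shown in this snippet; below is a minimal sketch of what `get_llm_server_address` could look like, assuming the trainer writes the address to the notify file as plain text (the actual `recipe/agent_lightning_like/notify.py` may differ):

```python
# Hypothetical sketch of notify.py; the real implementation may differ.
import os


def get_llm_server_address() -> str:
    """Read the LLM server address (e.g. "10.0.0.1:8000") published by the trainer."""
    notify_file = os.environ["LLM_SERVER_NOTIFY_FILE"]  # file written by the trainer
    with open(notify_file) as f:
        return f.read().strip()
```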
An additional request header named "trace_id" is included in the request context, and we need to pass it on to the LLM server. We do this by setting `extra_headers` in `model_settings`.

```python
# agent_server.py

@app.post("/chat")
async def chat(request: Annotated[ChatRequest, fastapi.Body()]):
    """A demo chat function."""
    context = request.context
    model_provider = CustomModelProvider()
    extra_headers = request.extra_headers or {}
    extra_headers.update({HEADER_TRACE_ID: context.trace_id})
    model_settings = ModelSettings(
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens,
        extra_headers=extra_headers,  # inject trace_id here
        extra_body=request.extra_body or {},
    )
    agent = Agent[UserContext](
        name="Assistant",
        instructions=request.system_prompt or "You are a helpful assistant.",
        tools=[calc_gsm8k_reward],
    )
    # ......
```

## Write agent client

The trainer uses a client to send prompts to the agent server, which starts the rollout. The client must implement an async `chat` method, like the demo `agent_client.py`. The `chat` method is expected not to raise exceptions.

```python
# agent_client.py

class AgentClient(AgentClientBase):

    async def chat(self, trace_id: str, sampling_params: dict[str, Any], **kwargs) -> Any:
        # kwargs include "max_turns" and non-tensor fields of a data sample from RLHFDataset
        # ...
        # asynchronously send the request to the agent server
```
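For reference, here is a minimal sketch of what a complete `chat` implementation might look like, using `aiohttp` as suggested by `AgentClientBase`. The `/chat` path, payload fields, timeout, and import path are assumptions for illustration, not the demo's exact code; errors are swallowed so the training loop never sees an exception:

```python
# Hypothetical agent client sketch; endpoint path and payload fields are assumptions.
import logging
from typing import Any

import aiohttp

from recipe.agent_lightning_like.agent_client import AgentClientBase  # assumed import path

logger = logging.getLogger(__name__)


class MyAgentClient(AgentClientBase):
    async def chat(self, trace_id: str, sampling_params: dict[str, Any], **kwargs) -> Any:
        payload = {
            "trace_id": trace_id,
            "temperature": sampling_params.get("temperature"),
            "top_p": sampling_params.get("top_p"),
            "max_tokens": sampling_params.get("max_tokens"),
            **kwargs,  # e.g. max_turns and non-tensor dataset fields such as the raw question
        }
        try:
            async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=600)) as session:
                async with session.post(f"{self.server_address_full}/chat", json=payload) as resp:
                    resp.raise_for_status()
                    return await resp.json()
        except Exception as e:
            # never propagate: the agent loop expects chat() not to raise
            logger.error(f"Agent server request failed for {trace_id=}: {e}")
            return None
```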
## Prepare dataset

Let's prepare two small datasets for training and evaluation:

```bash
python examples/data_preprocess/gsm8k_tool_agent_loop.py
```

We use a simple `CustomDataset` class defined in `dataset.py` to align the "agent_name" field in the generated dataset with the name we define in `agent_loop.yaml`.

```python
# dataset.py
from verl.utils.dataset import RLHFDataset


class CustomDataset(RLHFDataset):
    """A custom dataset for the agent-lightning-like example."""

    def __getitem__(self, item):
        row_dict = super().__getitem__(item)
        row_dict["agent_name"] = "lightning_demo"  # must match the name in agent_loop.yaml
        row_dict.pop("tools_kwargs", None)  # drop tools_kwargs if present; tools are defined on the agent server side
        return row_dict
```

## Training

If you train your own agent, prepare these YAML config files: `agent_loop.yaml`, `agent_client.yaml`, and `recipe/agent_lightning_like/config/lightning_ppo_trainer.yaml`, then write a start script.
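As a rough sketch of the shapes these files take (the class paths and the `lightning_demo` name below are illustrative assumptions, not verbatim copies of the recipe's files): `agent_loop.yaml` registers your agent loop class under the name referenced by the dataset's "agent_name" field, and `agent_client.yaml` is loaded with OmegaConf and passed to `hydra.utils.instantiate`, so its `_target_` should point at your `AgentClientBase` subclass.

```yaml
# agent_loop.yaml: register the custom agent loop under the name used by the dataset
- name: lightning_demo
  _target_: recipe.agent_lightning_like.lightning_agent_loop.LightningAgentLoop  # assumed module path
---
# agent_client.yaml: instantiated by the agent loop with server_address injected
_target_: recipe.agent_lightning_like.example.agent_client.AgentClient  # assumed module path
```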
Run the demo example:

```bash
bash recipe/agent_lightning_like/example/run_qwen2.5_7b.sh 2>&1 | tee run.log
```

You will probably need an 8-GPU node for this example, or choose a smaller model.

The validation score is expected to reach about 93.6/100 after training for one epoch.

## Testing

There are some CI tests in `recipe/agent_lightning_like/test`.

Run a test:

```bash
PYTHONPATH=$(pwd) pytest -s recipe/agent_lightning_like/test/test_xxx.py
```
