From 3114ad99d62ce3e6a48c77c2c255f1d6970e44b5 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Tue, 27 Jan 2026 15:23:00 -0800 Subject: [PATCH 01/13] responses api Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 136 +++++++++++ .../configs/vllm_model_native_responses.yaml | 10 + ...m_model_native_responses_for_training.yaml | 10 + .../vllm_model/tests/test_app.py | 217 ++++++++++++++++++ 4 files changed, 373 insertions(+) create mode 100644 responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml create mode 100644 responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 46319303d..cbf707e60 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -20,6 +20,7 @@ from aiohttp.client_exceptions import ClientResponseError from fastapi import Request +from fastapi.responses import JSONResponse from pydantic import BaseModel, Field from nemo_gym.base_responses_api_model import ( @@ -67,6 +68,8 @@ class VLLMModelConfig(BaseResponsesAPIModelConfig): uses_reasoning_parser: bool replace_developer_role_with_system: bool = False + use_native_responses_api: bool = False + chat_template_kwargs: Optional[Dict[str, Any]] = None # Corresponds to the extra_body of OpenAI Client. @@ -101,6 +104,139 @@ def model_post_init(self, context): async def responses( self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming = Body() ) -> NeMoGymResponse: + session_id = request.session[SESSION_ID_KEY] + if session_id not in self._session_id_to_client: + client_idx = len(self._session_id_to_client) % len(self._clients) + client = self._clients[client_idx] + self._session_id_to_client[session_id] = client + client = self._session_id_to_client[session_id] + + if self.config.use_native_responses_api: + body_dict = body.model_dump(exclude_unset=True) + body_dict["model"] = self.config.model + + if self.config.return_token_id_information: + body_dict["top_logprobs"] = 1 + if "include" not in body_dict: + body_dict["include"] = [] + if "message.output_text.logprobs" not in body_dict["include"]: + body_dict["include"].append("message.output_text.logprobs") + + if self.config.extra_body: + body_dict = {**self.config.extra_body, **body_dict} + + try: + vllm_response_dict = await client.create_response(**body_dict) + except ClientResponseError as e: + """ + """ + result_content_str = e.response_content.decode() + + is_out_of_context_length = e.status == 400 and ( + "context length" in result_content_str or "max_tokens" in result_content_str + ) + if is_out_of_context_length: + return NeMoGymResponse( + id=f"resp_{uuid4().hex}", + created_at=int(time()), + model=self.config.model, + object="response", + parallel_tool_calls=True, + tool_choice="auto", + tools=[], + output=[ + NeMoGymResponseOutputMessage( + id=f"msg_{uuid4().hex}", + role="assistant", + content=[NeMoGymResponseOutputText(type="output_text", text="", annotations=[])], + status="completed", + type="message", + ) + ], + ) + else: + raise e + + if self.config.uses_reasoning_parser: + output = vllm_response_dict.get("output", []) + for output_item in output: + if output_item.get("type") == "message" and output_item.get("role") == "assistant": + content = output_item.get("content", []) + for content_item in content: + if content_item.get("type") == "output_text": + text = content_item.get("text", "") + reasoning_matches, cleaned_text = 
self._converter._extract_reasoning_from_content(text) + + if reasoning_matches: + content_item["text"] = cleaned_text + reasoning_item = { + "id": f"rs_{uuid4().hex}", + "type": "reasoning", + "summary": [ + {"text": reasoning_text, "type": "summary_text"} + for reasoning_text in reasoning_matches + ], + "status": "completed", + } + + output_idx = output.index(output_item) + output.insert(output_idx, reasoning_item) + + if self.config.return_token_id_information: + output = vllm_response_dict.get("output", []) + for output_item in output: + if output_item.get("type") == "message" and output_item.get("role") == "assistant": + content = output_item.get("content", []) + new_content = [] + for content_item in content: + if content_item.get("type") == "output_text": + logprobs = content_item.get("logprobs", []) + if logprobs: + generation_token_ids = [] + generation_log_probs = [] + for logprob_item in logprobs: + token = logprob_item.get("token", "") + if token.startswith("token_id:"): + token_id = token.removeprefix("token_id:") + else: + token_id = str(logprob_item.get("token_id", token)) + generation_token_ids.append(token_id) + generation_log_probs.append(logprob_item.get("logprob", 0.0)) + + tokenize_body_dict = {"model": body_dict["model"]} + if "input" in body_dict: + tokenize_body_dict["messages"] = body_dict["input"] + if "tools" in body_dict: + tokenize_body_dict["tools"] = body_dict["tools"] + + tokenize_response = await client.create_tokenize(**tokenize_body_dict) + prompt_token_ids = tokenize_response.get("tokens", []) + + output_item["prompt_token_ids"] = prompt_token_ids + output_item["generation_token_ids"] = generation_token_ids + output_item["generation_log_probs"] = generation_log_probs + + # Rebuild content item without logprobs + new_content_item = { + "type": content_item["type"], + "text": content_item["text"], + "annotations": content_item.get("annotations", []), + } + new_content.append(new_content_item) + else: + new_content.append(content_item) + else: + new_content.append(content_item) + + if new_content: + output_item["content"] = new_content + + validated_response = NeMoGymResponse.model_validate(vllm_response_dict) + return JSONResponse( + content=validated_response.model_dump(mode="json", exclude_none=True), + status_code=200 + ) + # Response Create Params -> Chat Completion Create Params chat_completion_create_params = self._converter.responses_to_chat_completion_create_params(body) body.model = self.config.model diff --git a/responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml b/responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml new file mode 100644 index 000000000..9eef48f6f --- /dev/null +++ b/responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml @@ -0,0 +1,10 @@ +policy_model: + responses_api_models: + vllm_model: + entrypoint: app.py + base_url: ${policy_base_url} + api_key: ${policy_api_key} + model: ${policy_model_name} + return_token_id_information: false + uses_reasoning_parser: true + use_native_responses_api: true diff --git a/responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml b/responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml new file mode 100644 index 000000000..64e53c228 --- /dev/null +++ b/responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml @@ -0,0 +1,10 @@ +policy_model: + responses_api_models: + vllm_model: + entrypoint: app.py + base_url: ${policy_base_url} + api_key: 
${policy_api_key} + model: ${policy_model_name} + return_token_id_information: true + uses_reasoning_parser: true + use_native_responses_api: true diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py index 27f7ade2c..20e90dd3c 100644 --- a/responses_api_models/vllm_model/tests/test_app.py +++ b/responses_api_models/vllm_model/tests/test_app.py @@ -16,6 +16,7 @@ from typing import Any, Union from unittest.mock import AsyncMock, MagicMock +from aiohttp.client_exceptions import ClientResponseError from fastapi.testclient import TestClient from pytest import MonkeyPatch, mark @@ -680,6 +681,28 @@ def _setup_server(self, monkeypatch: MonkeyPatch): return VLLMModel(config=config, server_client=MagicMock(spec=ServerClient)) + def _setup_server_native_api( + self, monkeypatch: MonkeyPatch, return_token_id_information: bool = False, uses_reasoning_parser: bool = False + ): + config = VLLMModelConfig( + host="0.0.0.0", + port=8081, + base_url="http://api.openai.com/v1", + api_key="dummy_key", + model="dummy_model", + entrypoint="", + name="", + return_token_id_information=return_token_id_information, + uses_reasoning_parser=uses_reasoning_parser, + use_native_responses_api=True, + ) + + get_global_config_dict_mock = MagicMock() + get_global_config_dict_mock.return_value = dict() + monkeypatch.setattr(nemo_gym.server_utils, "get_global_config_dict", get_global_config_dict_mock) + + return VLLMModel(config=config, server_client=MagicMock(spec=ServerClient)) + async def test_sanity(self, monkeypatch: MonkeyPatch) -> None: self._setup_server(monkeypatch) @@ -2039,6 +2062,200 @@ def test_responses_reasoning_parser(self, monkeypatch: MonkeyPatch): actual_messages = mock_method.call_args.kwargs["messages"] assert expected_messages == actual_messages + def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = server.setup_webserver() + client = TestClient(app) + + mock_vllm_response = { + "id": "resp_native_123", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "msg_456", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "Hello! How can I help you?", + "annotations": [], + } + ], + "status": "completed", + } + ], + } + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?") + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + assert data["output"][0]["content"][0]["text"] == "Hello! How can I help you?" 
+ assert mock_create_response.called + assert mock_create_response.call_args.kwargs["model"] == "dummy_model" + + def test_native_responses_api_with_reasoning_parser(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch, uses_reasoning_parser=True) + app = server.setup_webserver() + client = TestClient(app) + + monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID()) + + mock_vllm_response = { + "id": "resp_native_123", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "msg_456", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "I should check the weather for the userLet me help you with that!", + "annotations": [], + } + ], + "status": "completed", + } + ], + } + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?") + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + + assert len(data["output"]) == 2 + assert data["output"][0]["type"] == "reasoning" + assert data["output"][0]["summary"][0]["text"] == "I should check the weather for the user" + assert data["output"][1]["type"] == "message" + assert data["output"][1]["content"][0]["text"] == "Let me help you with that!" + + def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch, return_token_id_information=True) + app = server.setup_webserver() + client = TestClient(app) + + mock_vllm_response = { + "id": "resp_native_123", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "msg_456", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "Hello!", + "annotations": [], + "logprobs": [ + {"token": "token_id:100", "token_id": 100, "logprob": -0.5}, + {"token": "token_id:200", "token_id": 200, "logprob": -1.2}, + ], + } + ], + "status": "completed", + } + ], + } + + mock_tokenize_response = {"tokens": [1, 2, 3, 4, 5]} + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + mock_create_tokenize = AsyncMock(return_value=mock_tokenize_response) + + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_tokenize", mock_create_tokenize) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?") + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + output_item = data["output"][0] + + assert "prompt_token_ids" in output_item + assert output_item["prompt_token_ids"] == [1, 2, 3, 4, 5] + assert "generation_token_ids" in output_item + assert output_item["generation_token_ids"] == [100, 200] + assert "generation_log_probs" in output_item + assert output_item["generation_log_probs"] == [-0.5, -1.2] + assert "logprobs" not in output_item["content"][0] + assert mock_create_response.called + assert mock_create_tokenize.called + + def 
test_native_responses_api_context_length_error(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = server.setup_webserver() + client = TestClient(app) + + monkeypatch.setattr("responses_api_models.vllm_model.app.time", lambda: FIXED_TIME) + monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID()) + + error_content = b'{"error": "This model\'s maximum context length is 4096 tokens"}' + mock_error = ClientResponseError( + request_info=MagicMock(), + history=(), + status=400, + message="Bad Request", + ) + mock_error.response_content = error_content + + mock_create_response = AsyncMock(side_effect=mock_error) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?" * 1000) + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + + assert data["model"] == "dummy_model" + assert data["output"][0]["type"] == "message" + assert data["output"][0]["content"][0]["text"] == "" + class TestVLLMConverter: def setup_method(self, _): From 56254526668ecc4c7277eb77fd9d050241845aa7 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Tue, 27 Jan 2026 18:12:06 -0800 Subject: [PATCH 02/13] clean Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index cbf707e60..dd9b41b19 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -111,6 +111,7 @@ async def responses( self._session_id_to_client[session_id] = client client = self._session_id_to_client[session_id] + # Native Responses API path (vLLM 0.10.2+) if self.config.use_native_responses_api: body_dict = body.model_dump(exclude_unset=True) body_dict["model"] = self.config.model @@ -128,10 +129,7 @@ async def responses( try: vllm_response_dict = await client.create_response(**body_dict) except ClientResponseError as e: - """ - """ result_content_str = e.response_content.decode() - is_out_of_context_length = e.status == 400 and ( "context length" in result_content_str or "max_tokens" in result_content_str ) @@ -166,7 +164,6 @@ async def responses( if content_item.get("type") == "output_text": text = content_item.get("text", "") reasoning_matches, cleaned_text = self._converter._extract_reasoning_from_content(text) - if reasoning_matches: content_item["text"] = cleaned_text reasoning_item = { @@ -178,7 +175,6 @@ async def responses( ], "status": "completed", } - output_idx = output.index(output_item) output.insert(output_idx, reasoning_item) @@ -216,7 +212,6 @@ async def responses( output_item["generation_token_ids"] = generation_token_ids output_item["generation_log_probs"] = generation_log_probs - # Rebuild content item without logprobs new_content_item = { "type": content_item["type"], "text": content_item["text"], From fa95aed0686bc7902b10329f86bcfcf5362fe9b3 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Tue, 27 Jan 2026 18:38:28 -0800 Subject: [PATCH 03/13] clean Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 1 - 1 file changed, 1 deletion(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index dd9b41b19..22f70f94b 100644 --- 
a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -111,7 +111,6 @@ async def responses( self._session_id_to_client[session_id] = client client = self._session_id_to_client[session_id] - # Native Responses API path (vLLM 0.10.2+) if self.config.use_native_responses_api: body_dict = body.model_dump(exclude_unset=True) body_dict["model"] = self.config.model From 91f10ccce8b18b57cc7c873eb57ac7bbbbd76cab Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 15:58:04 -0800 Subject: [PATCH 04/13] updates for prompt tok ids and review Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 79 ++++++------------- .../vllm_model/tests/test_app.py | 27 ++++--- 2 files changed, 37 insertions(+), 69 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 22f70f94b..2aa0eefd2 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -116,6 +116,7 @@ async def responses( body_dict["model"] = self.config.model if self.config.return_token_id_information: + body_dict["enable_response_messages"] = True body_dict["top_logprobs"] = 1 if "include" not in body_dict: body_dict["include"] = [] @@ -150,80 +151,44 @@ async def responses( type="message", ) ], + incomplete_details={"reason": "max_output_tokens"}, ) else: raise e - if self.config.uses_reasoning_parser: - output = vllm_response_dict.get("output", []) - for output_item in output: - if output_item.get("type") == "message" and output_item.get("role") == "assistant": - content = output_item.get("content", []) - for content_item in content: - if content_item.get("type") == "output_text": - text = content_item.get("text", "") - reasoning_matches, cleaned_text = self._converter._extract_reasoning_from_content(text) - if reasoning_matches: - content_item["text"] = cleaned_text - reasoning_item = { - "id": f"rs_{uuid4().hex}", - "type": "reasoning", - "summary": [ - {"text": reasoning_text, "type": "summary_text"} - for reasoning_text in reasoning_matches - ], - "status": "completed", - } - output_idx = output.index(output_item) - output.insert(output_idx, reasoning_item) - if self.config.return_token_id_information: + prompt_token_ids = vllm_response_dict["input_messages"][0]["tokens"] + generation_token_ids = vllm_response_dict["output_messages"][0]["tokens"] + output = vllm_response_dict.get("output", []) for output_item in output: if output_item.get("type") == "message" and output_item.get("role") == "assistant": + output_item["prompt_token_ids"] = prompt_token_ids + output_item["generation_token_ids"] = generation_token_ids + + generation_log_probs = [] content = output_item.get("content", []) new_content = [] for content_item in content: if content_item.get("type") == "output_text": logprobs = content_item.get("logprobs", []) - if logprobs: - generation_token_ids = [] - generation_log_probs = [] - for logprob_item in logprobs: - token = logprob_item.get("token", "") - if token.startswith("token_id:"): - token_id = token.removeprefix("token_id:") - else: - token_id = str(logprob_item.get("token_id", token)) - generation_token_ids.append(token_id) - generation_log_probs.append(logprob_item.get("logprob", 0.0)) - - tokenize_body_dict = {"model": body_dict["model"]} - if "input" in body_dict: - tokenize_body_dict["messages"] = body_dict["input"] - if "tools" in body_dict: - tokenize_body_dict["tools"] = body_dict["tools"] - - tokenize_response = await 
client.create_tokenize(**tokenize_body_dict) - prompt_token_ids = tokenize_response.get("tokens", []) - - output_item["prompt_token_ids"] = prompt_token_ids - output_item["generation_token_ids"] = generation_token_ids - output_item["generation_log_probs"] = generation_log_probs - - new_content_item = { - "type": content_item["type"], - "text": content_item["text"], - "annotations": content_item.get("annotations", []), - } - new_content.append(new_content_item) - else: - new_content.append(content_item) + for logprob_item in logprobs: + generation_log_probs.append(logprob_item.get("logprob", 0.0)) + new_content_item = { + "type": content_item["type"], + "text": content_item["text"], + "annotations": content_item.get("annotations", []), + } + new_content.append(new_content_item) else: new_content.append(content_item) - if new_content: output_item["content"] = new_content + if generation_log_probs: + output_item["generation_log_probs"] = generation_log_probs + + vllm_response_dict.pop("input_messages", None) + vllm_response_dict.pop("output_messages", None) validated_response = NeMoGymResponse.model_validate(vllm_response_dict) return JSONResponse( diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py index 20e90dd3c..17a8a719d 100644 --- a/responses_api_models/vllm_model/tests/test_app.py +++ b/responses_api_models/vllm_model/tests/test_app.py @@ -2108,13 +2108,12 @@ def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch): assert mock_create_response.called assert mock_create_response.call_args.kwargs["model"] == "dummy_model" - def test_native_responses_api_with_reasoning_parser(self, monkeypatch: MonkeyPatch): - server = self._setup_server_native_api(monkeypatch, uses_reasoning_parser=True) + def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch): + """Test that vLLM returns reasoning items natively (no client-side parsing needed).""" + server = self._setup_server_native_api(monkeypatch) app = server.setup_webserver() client = TestClient(app) - monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID()) - mock_vllm_response = { "id": "resp_native_123", "created_at": FIXED_TIME, @@ -2124,6 +2123,11 @@ def test_native_responses_api_with_reasoning_parser(self, monkeypatch: MonkeyPat "tool_choice": "auto", "tools": [], "output": [ + { + "id": "rs_123", + "type": "reasoning", + "summary": [{"type": "summary_text", "text": "I should check the weather for the user"}], + }, { "id": "msg_456", "type": "message", @@ -2131,7 +2135,7 @@ def test_native_responses_api_with_reasoning_parser(self, monkeypatch: MonkeyPat "content": [ { "type": "output_text", - "text": "I should check the weather for the userLet me help you with that!", + "text": "Let me help you with that!", "annotations": [], } ], @@ -2183,23 +2187,21 @@ def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch): "text": "Hello!", "annotations": [], "logprobs": [ - {"token": "token_id:100", "token_id": 100, "logprob": -0.5}, - {"token": "token_id:200", "token_id": 200, "logprob": -1.2}, + {"token": "Hello", "logprob": -0.5}, + {"token": "!", "logprob": -1.2}, ], } ], "status": "completed", } ], + "input_messages": [{"tokens": [1, 2, 3, 4, 5], "type": "raw_message_tokens"}], + "output_messages": [{"tokens": [100, 200], "type": "raw_message_tokens"}], } - mock_tokenize_response = {"tokens": [1, 2, 3, 4, 5]} - mock_create_response = AsyncMock(return_value=mock_vllm_response) - mock_create_tokenize = 
AsyncMock(return_value=mock_tokenize_response) monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) - monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_tokenize", mock_create_tokenize) request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?") @@ -2219,8 +2221,9 @@ def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch): assert "generation_log_probs" in output_item assert output_item["generation_log_probs"] == [-0.5, -1.2] assert "logprobs" not in output_item["content"][0] + assert "input_messages" not in data + assert "output_messages" not in data assert mock_create_response.called - assert mock_create_tokenize.called def test_native_responses_api_context_length_error(self, monkeypatch: MonkeyPatch): server = self._setup_server_native_api(monkeypatch) From d2441527b0be135e3b64ebc5fb0ac6ac8a7e7fd0 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 16:22:00 -0800 Subject: [PATCH 05/13] helper fns Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 177 +++++++++++++------------ 1 file changed, 92 insertions(+), 85 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 2aa0eefd2..20c9b03f1 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -112,95 +112,103 @@ async def responses( client = self._session_id_to_client[session_id] if self.config.use_native_responses_api: - body_dict = body.model_dump(exclude_unset=True) - body_dict["model"] = self.config.model - - if self.config.return_token_id_information: - body_dict["enable_response_messages"] = True - body_dict["top_logprobs"] = 1 - if "include" not in body_dict: - body_dict["include"] = [] - if "message.output_text.logprobs" not in body_dict["include"]: - body_dict["include"].append("message.output_text.logprobs") - - if self.config.extra_body: - body_dict = {**self.config.extra_body, **body_dict} - - try: - vllm_response_dict = await client.create_response(**body_dict) - except ClientResponseError as e: - result_content_str = e.response_content.decode() - is_out_of_context_length = e.status == 400 and ( - "context length" in result_content_str or "max_tokens" in result_content_str - ) - if is_out_of_context_length: - return NeMoGymResponse( - id=f"resp_{uuid4().hex}", - created_at=int(time()), - model=self.config.model, - object="response", - parallel_tool_calls=True, - tool_choice="auto", - tools=[], - output=[ - NeMoGymResponseOutputMessage( - id=f"msg_{uuid4().hex}", - role="assistant", - content=[NeMoGymResponseOutputText(type="output_text", text="", annotations=[])], - status="completed", - type="message", - ) - ], - incomplete_details={"reason": "max_output_tokens"}, - ) - else: - raise e - - if self.config.return_token_id_information: - prompt_token_ids = vllm_response_dict["input_messages"][0]["tokens"] - generation_token_ids = vllm_response_dict["output_messages"][0]["tokens"] - - output = vllm_response_dict.get("output", []) - for output_item in output: - if output_item.get("type") == "message" and output_item.get("role") == "assistant": - output_item["prompt_token_ids"] = prompt_token_ids - output_item["generation_token_ids"] = generation_token_ids - - generation_log_probs = [] - content = output_item.get("content", []) - new_content = [] - for content_item in content: - if content_item.get("type") == "output_text": - logprobs = content_item.get("logprobs", []) - for logprob_item in logprobs: - 
generation_log_probs.append(logprob_item.get("logprob", 0.0)) - new_content_item = { - "type": content_item["type"], - "text": content_item["text"], - "annotations": content_item.get("annotations", []), - } - new_content.append(new_content_item) - else: - new_content.append(content_item) - if new_content: - output_item["content"] = new_content - if generation_log_probs: - output_item["generation_log_probs"] = generation_log_probs - - vllm_response_dict.pop("input_messages", None) - vllm_response_dict.pop("output_messages", None) - - validated_response = NeMoGymResponse.model_validate(vllm_response_dict) - return JSONResponse( - content=validated_response.model_dump(mode="json", exclude_none=True), - status_code=200 + return await self._handle_native_responses_api(client, body) + + return await self._handle_chat_completions_responses(request, body) + + async def _handle_native_responses_api( + self, client: NeMoGymAsyncOpenAI, body: NeMoGymResponseCreateParamsNonStreaming + ) -> JSONResponse: + body_dict = body.model_dump(exclude_unset=True) + body_dict["model"] = self.config.model + + if self.config.return_token_id_information: + body_dict["enable_response_messages"] = True + body_dict["top_logprobs"] = 1 + if "include" not in body_dict: + body_dict["include"] = [] + if "message.output_text.logprobs" not in body_dict["include"]: + body_dict["include"].append("message.output_text.logprobs") + + if self.config.extra_body: + body_dict = {**self.config.extra_body, **body_dict} + + try: + vllm_response_dict = await client.create_response(**body_dict) + except ClientResponseError as e: + result_content_str = e.response_content.decode() + is_out_of_context_length = e.status == 400 and ( + "context length" in result_content_str or "max_tokens" in result_content_str ) + if is_out_of_context_length: + return NeMoGymResponse( + id=f"resp_{uuid4().hex}", + created_at=int(time()), + model=self.config.model, + object="response", + parallel_tool_calls=True, + tool_choice="auto", + tools=[], + output=[ + NeMoGymResponseOutputMessage( + id=f"msg_{uuid4().hex}", + role="assistant", + content=[NeMoGymResponseOutputText(type="output_text", text="", annotations=[])], + status="completed", + type="message", + ) + ], + incomplete_details={"reason": "max_output_tokens"}, + ) + else: + raise e - # Response Create Params -> Chat Completion Create Params + if self.config.return_token_id_information: + prompt_token_ids = vllm_response_dict["input_messages"][0]["tokens"] + generation_token_ids = vllm_response_dict["output_messages"][0]["tokens"] + + output = vllm_response_dict.get("output", []) + for output_item in output: + if output_item.get("type") == "message" and output_item.get("role") == "assistant": + output_item["prompt_token_ids"] = prompt_token_ids + output_item["generation_token_ids"] = generation_token_ids + + generation_log_probs = [] + content = output_item.get("content", []) + new_content = [] + for content_item in content: + if content_item.get("type") == "output_text": + logprobs = content_item.get("logprobs", []) + for logprob_item in logprobs: + generation_log_probs.append(logprob_item.get("logprob", 0.0)) + new_content_item = { + "type": content_item["type"], + "text": content_item["text"], + "annotations": content_item.get("annotations", []), + } + new_content.append(new_content_item) + else: + new_content.append(content_item) + if new_content: + output_item["content"] = new_content + if generation_log_probs: + output_item["generation_log_probs"] = generation_log_probs + + 
vllm_response_dict.pop("input_messages", None) + vllm_response_dict.pop("output_messages", None) + + validated_response = NeMoGymResponse.model_validate(vllm_response_dict) + return JSONResponse( + content=validated_response.model_dump(mode="json", exclude_none=True), + status_code=200 + ) + + async def _handle_chat_completions_responses( + self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming + ) -> NeMoGymResponse: chat_completion_create_params = self._converter.responses_to_chat_completion_create_params(body) body.model = self.config.model - # Chat Completion Create Params -> Chat Completion chat_completion_response = await self.chat_completions(request, chat_completion_create_params) choice = chat_completion_response.choices[0] @@ -208,7 +216,6 @@ async def responses( response_output = self._converter.postprocess_chat_response(choice) response_output_dicts = [item.model_dump() for item in response_output] - # Chat Completion -> Response return NeMoGymResponse( id=f"resp_{uuid4().hex}", created_at=int(time()), From e80ff1f03eabf6bd60e4ec4b9f850858d4fc1a7c Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 16:53:55 -0800 Subject: [PATCH 06/13] err pattern, ruff Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 9 ++++----- responses_api_models/vllm_model/tests/test_app.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 20c9b03f1..e8d7b706f 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -138,7 +138,9 @@ async def _handle_native_responses_api( except ClientResponseError as e: result_content_str = e.response_content.decode() is_out_of_context_length = e.status == 400 and ( - "context length" in result_content_str or "max_tokens" in result_content_str + "context length" in result_content_str + or "max_tokens" in result_content_str + or "max_model_len" in result_content_str ) if is_out_of_context_length: return NeMoGymResponse( @@ -198,10 +200,7 @@ async def _handle_native_responses_api( vllm_response_dict.pop("output_messages", None) validated_response = NeMoGymResponse.model_validate(vllm_response_dict) - return JSONResponse( - content=validated_response.model_dump(mode="json", exclude_none=True), - status_code=200 - ) + return JSONResponse(content=validated_response.model_dump(mode="json", exclude_none=True), status_code=200) async def _handle_chat_completions_responses( self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py index 17a8a719d..8f6ad5d20 100644 --- a/responses_api_models/vllm_model/tests/test_app.py +++ b/responses_api_models/vllm_model/tests/test_app.py @@ -2140,7 +2140,7 @@ def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch): } ], "status": "completed", - } + }, ], } From 7533e7729143ecb9dd14d0a3ab38c2a5c51227e5 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 16:56:58 -0800 Subject: [PATCH 07/13] trigger ci Signed-off-by: Christian Munley From 3826cf8b099f802edd58262f9b39fbc3bf73de35 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 17:12:44 -0800 Subject: [PATCH 08/13] return type ngr Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index e8d7b706f..f014c8c7a 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -20,7 +20,6 @@ from aiohttp.client_exceptions import ClientResponseError from fastapi import Request -from fastapi.responses import JSONResponse from pydantic import BaseModel, Field from nemo_gym.base_responses_api_model import ( @@ -118,7 +117,7 @@ async def responses( async def _handle_native_responses_api( self, client: NeMoGymAsyncOpenAI, body: NeMoGymResponseCreateParamsNonStreaming - ) -> JSONResponse: + ) -> NeMoGymResponse: body_dict = body.model_dump(exclude_unset=True) body_dict["model"] = self.config.model @@ -199,8 +198,7 @@ async def _handle_native_responses_api( vllm_response_dict.pop("input_messages", None) vllm_response_dict.pop("output_messages", None) - validated_response = NeMoGymResponse.model_validate(vllm_response_dict) - return JSONResponse(content=validated_response.model_dump(mode="json", exclude_none=True), status_code=200) + return NeMoGymResponse.model_validate(vllm_response_dict) async def _handle_chat_completions_responses( self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming From 0db0647c962ba4fdf7e97961a947f3374257df46 Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 17:22:34 -0800 Subject: [PATCH 09/13] test logprob none Signed-off-by: Christian Munley --- responses_api_models/vllm_model/tests/test_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py index 8f6ad5d20..523a0c4ab 100644 --- a/responses_api_models/vllm_model/tests/test_app.py +++ b/responses_api_models/vllm_model/tests/test_app.py @@ -2220,7 +2220,7 @@ def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch): assert output_item["generation_token_ids"] == [100, 200] assert "generation_log_probs" in output_item assert output_item["generation_log_probs"] == [-0.5, -1.2] - assert "logprobs" not in output_item["content"][0] + assert output_item["content"][0].get("logprobs") is None assert "input_messages" not in data assert "output_messages" not in data assert mock_create_response.called From e2581edcb3f9992af8512ff8937e655bd355265a Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 17:44:44 -0800 Subject: [PATCH 10/13] more tests, remove extra cfgs, small fixes Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 4 +- .../vllm_model/configs/vllm_model.yaml | 1 + .../configs/vllm_model_for_training.yaml | 1 + .../configs/vllm_model_native_responses.yaml | 10 - ...m_model_native_responses_for_training.yaml | 10 - .../vllm_model/tests/test_app.py | 227 +++++++++++++++++- 6 files changed, 230 insertions(+), 23 deletions(-) delete mode 100644 responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml delete mode 100644 responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index f014c8c7a..365fb3c32 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -162,7 +162,7 @@ async def _handle_native_responses_api( incomplete_details={"reason": "max_output_tokens"}, ) else: - raise e + raise if self.config.return_token_id_information: prompt_token_ids = vllm_response_dict["input_messages"][0]["tokens"] @@ 
-179,7 +179,7 @@ async def _handle_native_responses_api( new_content = [] for content_item in content: if content_item.get("type") == "output_text": - logprobs = content_item.get("logprobs", []) + logprobs = content_item.get("logprobs") or [] for logprob_item in logprobs: generation_log_probs.append(logprob_item.get("logprob", 0.0)) new_content_item = { diff --git a/responses_api_models/vllm_model/configs/vllm_model.yaml b/responses_api_models/vllm_model/configs/vllm_model.yaml index f7850d900..e4ab33e7f 100644 --- a/responses_api_models/vllm_model/configs/vllm_model.yaml +++ b/responses_api_models/vllm_model/configs/vllm_model.yaml @@ -7,3 +7,4 @@ policy_model: model: ${policy_model_name} return_token_id_information: false uses_reasoning_parser: true + use_native_responses_api: false diff --git a/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml b/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml index 70727036c..41f817bbe 100644 --- a/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml +++ b/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml @@ -7,3 +7,4 @@ policy_model: model: ${policy_model_name} return_token_id_information: true uses_reasoning_parser: true + use_native_responses_api: false diff --git a/responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml b/responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml deleted file mode 100644 index 9eef48f6f..000000000 --- a/responses_api_models/vllm_model/configs/vllm_model_native_responses.yaml +++ /dev/null @@ -1,10 +0,0 @@ -policy_model: - responses_api_models: - vllm_model: - entrypoint: app.py - base_url: ${policy_base_url} - api_key: ${policy_api_key} - model: ${policy_model_name} - return_token_id_information: false - uses_reasoning_parser: true - use_native_responses_api: true diff --git a/responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml b/responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml deleted file mode 100644 index 64e53c228..000000000 --- a/responses_api_models/vllm_model/configs/vllm_model_native_responses_for_training.yaml +++ /dev/null @@ -1,10 +0,0 @@ -policy_model: - responses_api_models: - vllm_model: - entrypoint: app.py - base_url: ${policy_base_url} - api_key: ${policy_api_key} - model: ${policy_model_name} - return_token_id_information: true - uses_reasoning_parser: true - use_native_responses_api: true diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py index 523a0c4ab..dfbdfeb0e 100644 --- a/responses_api_models/vllm_model/tests/test_app.py +++ b/responses_api_models/vllm_model/tests/test_app.py @@ -2109,7 +2109,6 @@ def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch): assert mock_create_response.call_args.kwargs["model"] == "dummy_model" def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch): - """Test that vLLM returns reasoning items natively (no client-side parsing needed).""" server = self._setup_server_native_api(monkeypatch) app = server.setup_webserver() client = TestClient(app) @@ -2259,6 +2258,232 @@ def test_native_responses_api_context_length_error(self, monkeypatch: MonkeyPatc assert data["output"][0]["type"] == "message" assert data["output"][0]["content"][0]["text"] == "" + def test_native_responses_api_max_model_len_error(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = 
server.setup_webserver() + client = TestClient(app) + + monkeypatch.setattr("responses_api_models.vllm_model.app.time", lambda: FIXED_TIME) + monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID()) + + error_content = b'{"error": "The engine prompt length 5000 exceeds the max_model_len 4096"}' + mock_error = ClientResponseError( + request_info=MagicMock(), + history=(), + status=400, + message="Bad Request", + ) + mock_error.response_content = error_content + + mock_create_response = AsyncMock(side_effect=mock_error) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?" * 1000) + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + assert data["output"][0]["content"][0]["text"] == "" + assert data["incomplete_details"] == {"reason": "max_output_tokens"} + + def test_native_responses_api_tool_calls(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = server.setup_webserver() + client = TestClient(app) + + mock_vllm_response = { + "id": "resp_native_tools", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "fc_123", + "type": "function_call", + "name": "get_weather", + "arguments": '{"location": "San Francisco"}', + "call_id": "call_123", + "status": "completed", + } + ], + } + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather in San Francisco?") + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + assert data["output"][0]["type"] == "function_call" + assert data["output"][0]["name"] == "get_weather" + assert data["output"][0]["arguments"] == '{"location": "San Francisco"}' + assert mock_create_response.called + + def test_native_responses_api_multiturn(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = server.setup_webserver() + client = TestClient(app) + + mock_vllm_response = { + "id": "resp_native_multiturn", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "msg_789", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "The capital of France is Paris.", + "annotations": [], + } + ], + "status": "completed", + } + ], + } + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming( + input=[ + {"role": "user", "content": "What is the capital of Germany?"}, + {"role": "assistant", "content": "The capital of Germany is Berlin."}, + {"role": "user", "content": "What about France?"}, + ] + ) + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + assert 
data["output"][0]["content"][0]["text"] == "The capital of France is Paris." + assert mock_create_response.called + # Verify the input was passed through + assert mock_create_response.call_args.kwargs["input"] == [ + {"role": "user", "content": "What is the capital of Germany?"}, + {"role": "assistant", "content": "The capital of Germany is Berlin."}, + {"role": "user", "content": "What about France?"}, + ] + + def test_native_responses_api_string_input(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = server.setup_webserver() + client = TestClient(app) + + mock_vllm_response = { + "id": "resp_native_str", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "msg_str", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "Hello there!", + "annotations": [], + } + ], + "status": "completed", + } + ], + } + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming(input="Hello") + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + assert data["output"][0]["content"][0]["text"] == "Hello there!" + assert mock_create_response.called + assert mock_create_response.call_args.kwargs["input"] == "Hello" + + def test_native_responses_api_with_instructions(self, monkeypatch: MonkeyPatch): + server = self._setup_server_native_api(monkeypatch) + app = server.setup_webserver() + client = TestClient(app) + + mock_vllm_response = { + "id": "resp_native_inst", + "created_at": FIXED_TIME, + "model": "dummy_model", + "object": "response", + "parallel_tool_calls": True, + "tool_choice": "auto", + "tools": [], + "output": [ + { + "id": "msg_inst", + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": "Ahoy! How can I help ye today?", + "annotations": [], + } + ], + "status": "completed", + } + ], + } + + mock_create_response = AsyncMock(return_value=mock_vllm_response) + monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) + + request_body = NeMoGymResponseCreateParamsNonStreaming( + input="Hello", + instructions="You are a pirate. Always respond like a pirate.", + ) + + response = client.post( + "/v1/responses", + json=request_body.model_dump(exclude_unset=True, mode="json"), + ) + assert response.status_code == 200 + + data = response.json() + assert "Ahoy" in data["output"][0]["content"][0]["text"] + assert mock_create_response.called + assert mock_create_response.call_args.kwargs["instructions"] == "You are a pirate. Always respond like a pirate." 
+ class TestVLLMConverter: def setup_method(self, _): From ec9cd23364beed00b99251b653b2799ad9dca6ff Mon Sep 17 00:00:00 2001 From: Christian Munley Date: Mon, 2 Feb 2026 17:50:56 -0800 Subject: [PATCH 11/13] tests Signed-off-by: Christian Munley --- responses_api_models/vllm_model/app.py | 2 + .../vllm_model/tests/test_app.py | 71 +------------------ 2 files changed, 5 insertions(+), 68 deletions(-) diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py index 365fb3c32..c1432cb65 100644 --- a/responses_api_models/vllm_model/app.py +++ b/responses_api_models/vllm_model/app.py @@ -206,6 +206,7 @@ async def _handle_chat_completions_responses( chat_completion_create_params = self._converter.responses_to_chat_completion_create_params(body) body.model = self.config.model + # Chat Completion Create Params -> Chat Completion chat_completion_response = await self.chat_completions(request, chat_completion_create_params) choice = chat_completion_response.choices[0] @@ -213,6 +214,7 @@ async def _handle_chat_completions_responses( response_output = self._converter.postprocess_chat_response(choice) response_output_dicts = [item.model_dump() for item in response_output] + # Chat Completion -> Response return NeMoGymResponse( id=f"resp_{uuid4().hex}", created_at=int(time()), diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py index dfbdfeb0e..07579cd01 100644 --- a/responses_api_models/vllm_model/tests/test_app.py +++ b/responses_api_models/vllm_model/tests/test_app.py @@ -16,7 +16,6 @@ from typing import Any, Union from unittest.mock import AsyncMock, MagicMock -from aiohttp.client_exceptions import ClientResponseError from fastapi.testclient import TestClient from pytest import MonkeyPatch, mark @@ -2224,72 +2223,6 @@ def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch): assert "output_messages" not in data assert mock_create_response.called - def test_native_responses_api_context_length_error(self, monkeypatch: MonkeyPatch): - server = self._setup_server_native_api(monkeypatch) - app = server.setup_webserver() - client = TestClient(app) - - monkeypatch.setattr("responses_api_models.vllm_model.app.time", lambda: FIXED_TIME) - monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID()) - - error_content = b'{"error": "This model\'s maximum context length is 4096 tokens"}' - mock_error = ClientResponseError( - request_info=MagicMock(), - history=(), - status=400, - message="Bad Request", - ) - mock_error.response_content = error_content - - mock_create_response = AsyncMock(side_effect=mock_error) - monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) - - request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?" 
* 1000) - - response = client.post( - "/v1/responses", - json=request_body.model_dump(exclude_unset=True, mode="json"), - ) - assert response.status_code == 200 - - data = response.json() - - assert data["model"] == "dummy_model" - assert data["output"][0]["type"] == "message" - assert data["output"][0]["content"][0]["text"] == "" - - def test_native_responses_api_max_model_len_error(self, monkeypatch: MonkeyPatch): - server = self._setup_server_native_api(monkeypatch) - app = server.setup_webserver() - client = TestClient(app) - - monkeypatch.setattr("responses_api_models.vllm_model.app.time", lambda: FIXED_TIME) - monkeypatch.setattr("responses_api_models.vllm_model.app.uuid4", lambda: FakeUUID()) - - error_content = b'{"error": "The engine prompt length 5000 exceeds the max_model_len 4096"}' - mock_error = ClientResponseError( - request_info=MagicMock(), - history=(), - status=400, - message="Bad Request", - ) - mock_error.response_content = error_content - - mock_create_response = AsyncMock(side_effect=mock_error) - monkeypatch.setattr(NeMoGymAsyncOpenAI, "create_response", mock_create_response) - - request_body = NeMoGymResponseCreateParamsNonStreaming(input="What is the weather?" * 1000) - - response = client.post( - "/v1/responses", - json=request_body.model_dump(exclude_unset=True, mode="json"), - ) - assert response.status_code == 200 - - data = response.json() - assert data["output"][0]["content"][0]["text"] == "" - assert data["incomplete_details"] == {"reason": "max_output_tokens"} - def test_native_responses_api_tool_calls(self, monkeypatch: MonkeyPatch): server = self._setup_server_native_api(monkeypatch) app = server.setup_webserver() @@ -2482,7 +2415,9 @@ def test_native_responses_api_with_instructions(self, monkeypatch: MonkeyPatch): data = response.json() assert "Ahoy" in data["output"][0]["content"][0]["text"] assert mock_create_response.called - assert mock_create_response.call_args.kwargs["instructions"] == "You are a pirate. Always respond like a pirate." + assert ( + mock_create_response.call_args.kwargs["instructions"] == "You are a pirate. Always respond like a pirate." 
+    )


 class TestVLLMConverter:

From 144c402bcc1c822a5dc85c916204a23d8a300643 Mon Sep 17 00:00:00 2001
From: Christian Munley
Date: Mon, 2 Feb 2026 17:53:52 -0800
Subject: [PATCH 12/13] tests helper

Signed-off-by: Christian Munley
---
 .../vllm_model/tests/test_app.py | 45 +++++++------------
 1 file changed, 15 insertions(+), 30 deletions(-)

diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py
index 07579cd01..72cb526b3 100644
--- a/responses_api_models/vllm_model/tests/test_app.py
+++ b/responses_api_models/vllm_model/tests/test_app.py
@@ -661,39 +661,24 @@ class FakeUUID:


 class TestApp:
-    def _setup_server(self, monkeypatch: MonkeyPatch):
-        config = VLLMModelConfig(
-            host="0.0.0.0",
-            port=8081,
-            base_url="http://api.openai.com/v1",
-            api_key="dummy_key",  # pragma: allowlist secret
-            model="dummy_model",
-            entrypoint="",
-            name="",
-            return_token_id_information=False,
-            uses_reasoning_parser=False,
-        )
-
-        get_global_config_dict_mock = MagicMock()
-        get_global_config_dict_mock.return_value = dict()
-        monkeypatch.setattr(nemo_gym.server_utils, "get_global_config_dict", get_global_config_dict_mock)
-
-        return VLLMModel(config=config, server_client=MagicMock(spec=ServerClient))
-
-    def _setup_server_native_api(
-        self, monkeypatch: MonkeyPatch, return_token_id_information: bool = False, uses_reasoning_parser: bool = False
+    def _setup_server(
+        self,
+        monkeypatch: MonkeyPatch,
+        return_token_id_information: bool = False,
+        uses_reasoning_parser: bool = False,
+        use_native_responses_api: bool = False,
     ):
         config = VLLMModelConfig(
             host="0.0.0.0",
             port=8081,
             base_url="http://api.openai.com/v1",
-            api_key="dummy_key",
+            api_key="dummy_key",  # pragma: allowlist secret
             model="dummy_model",
             entrypoint="",
             name="",
             return_token_id_information=return_token_id_information,
             uses_reasoning_parser=uses_reasoning_parser,
-            use_native_responses_api=True,
+            use_native_responses_api=use_native_responses_api,
         )

         get_global_config_dict_mock = MagicMock()
@@ -2062,7 +2047,7 @@ def test_responses_reasoning_parser(self, monkeypatch: MonkeyPatch):
         assert expected_messages == actual_messages

     def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch)
+        server = self._setup_server(monkeypatch, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2108,7 +2093,7 @@ def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.call_args.kwargs["model"] == "dummy_model"

     def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch)
+        server = self._setup_server(monkeypatch, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2162,7 +2147,7 @@ def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch):
         assert data["output"][1]["content"][0]["text"] == "Let me help you with that!"

     def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch, return_token_id_information=True)
+        server = self._setup_server(monkeypatch, return_token_id_information=True, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2224,7 +2209,7 @@ def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.called

     def test_native_responses_api_tool_calls(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch)
+        server = self._setup_server(monkeypatch, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2266,7 +2251,7 @@ def test_native_responses_api_tool_calls(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.called

     def test_native_responses_api_multiturn(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch)
+        server = self._setup_server(monkeypatch, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2323,7 +2308,7 @@ def test_native_responses_api_multiturn(self, monkeypatch: MonkeyPatch):
         ]

     def test_native_responses_api_string_input(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch)
+        server = self._setup_server(monkeypatch, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2369,7 +2354,7 @@ def test_native_responses_api_string_input(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.call_args.kwargs["input"] == "Hello"

     def test_native_responses_api_with_instructions(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server_native_api(monkeypatch)
+        server = self._setup_server(monkeypatch, use_native_responses_api=True)
         app = server.setup_webserver()
         client = TestClient(app)

From 3e8b41a9e833b66969e61fe5fe2dc408777444e9 Mon Sep 17 00:00:00 2001
From: Christian Munley
Date: Mon, 2 Feb 2026 21:42:18 -0800
Subject: [PATCH 13/13] rename some things

Signed-off-by: Christian Munley
---
 responses_api_models/vllm_model/app.py        | 12 ++++++------
 .../vllm_model/configs/vllm_model.yaml        |  2 +-
 .../configs/vllm_model_for_training.yaml      |  2 +-
 .../vllm_model/tests/test_app.py              | 18 +++++++++---------
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
index c1432cb65..feaca789c 100644
--- a/responses_api_models/vllm_model/app.py
+++ b/responses_api_models/vllm_model/app.py
@@ -67,7 +67,7 @@ class VLLMModelConfig(BaseResponsesAPIModelConfig):
     uses_reasoning_parser: bool
     replace_developer_role_with_system: bool = False

-    use_native_responses_api: bool = False
+    use_responses_endpoint: bool = False

     chat_template_kwargs: Optional[Dict[str, Any]] = None

@@ -110,12 +110,12 @@ async def responses(
             self._session_id_to_client[session_id] = client
         client = self._session_id_to_client[session_id]

-        if self.config.use_native_responses_api:
-            return await self._handle_native_responses_api(client, body)
+        if self.config.use_responses_endpoint:
+            return await self._call_responses(client, body)

-        return await self._handle_chat_completions_responses(request, body)
+        return await self._call_chat_completions(request, body)

-    async def _handle_native_responses_api(
+    async def _call_responses(
         self, client: NeMoGymAsyncOpenAI, body: NeMoGymResponseCreateParamsNonStreaming
     ) -> NeMoGymResponse:
         body_dict = body.model_dump(exclude_unset=True)
@@ -200,7 +200,7 @@ async def _handle_native_responses_api(

         return NeMoGymResponse.model_validate(vllm_response_dict)

-    async def _handle_chat_completions_responses(
+    async def _call_chat_completions(
         self, request: Request, body: NeMoGymResponseCreateParamsNonStreaming
     ) -> NeMoGymResponse:
         chat_completion_create_params = self._converter.responses_to_chat_completion_create_params(body)
diff --git a/responses_api_models/vllm_model/configs/vllm_model.yaml b/responses_api_models/vllm_model/configs/vllm_model.yaml
index e4ab33e7f..c77ddcbfe 100644
--- a/responses_api_models/vllm_model/configs/vllm_model.yaml
+++ b/responses_api_models/vllm_model/configs/vllm_model.yaml
@@ -7,4 +7,4 @@ policy_model:
       model: ${policy_model_name}
       return_token_id_information: false
       uses_reasoning_parser: true
-      use_native_responses_api: false
+      use_responses_endpoint: false
diff --git a/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml b/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
index 41f817bbe..be13f5ae2 100644
--- a/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
+++ b/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
@@ -7,4 +7,4 @@ policy_model:
       model: ${policy_model_name}
       return_token_id_information: true
       uses_reasoning_parser: true
-      use_native_responses_api: false
+      use_responses_endpoint: false
diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py
index 72cb526b3..5a0d23103 100644
--- a/responses_api_models/vllm_model/tests/test_app.py
+++ b/responses_api_models/vllm_model/tests/test_app.py
@@ -666,7 +666,7 @@ def _setup_server(
         monkeypatch: MonkeyPatch,
         return_token_id_information: bool = False,
         uses_reasoning_parser: bool = False,
-        use_native_responses_api: bool = False,
+        use_responses_endpoint: bool = False,
     ):
         config = VLLMModelConfig(
@@ -678,7 +678,7 @@ def _setup_server(
             name="",
             return_token_id_information=return_token_id_information,
             uses_reasoning_parser=uses_reasoning_parser,
-            use_native_responses_api=use_native_responses_api,
+            use_responses_endpoint=use_responses_endpoint,
         )

         get_global_config_dict_mock = MagicMock()
@@ -2047,7 +2047,7 @@ def test_responses_reasoning_parser(self, monkeypatch: MonkeyPatch):
         assert expected_messages == actual_messages

     def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2093,7 +2093,7 @@ def test_native_responses_api_basic(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.call_args.kwargs["model"] == "dummy_model"

     def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2147,7 +2147,7 @@ def test_native_responses_api_with_reasoning(self, monkeypatch: MonkeyPatch):
         assert data["output"][1]["content"][0]["text"] == "Let me help you with that!"

     def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, return_token_id_information=True, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, return_token_id_information=True, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2209,7 +2209,7 @@ def test_native_responses_api_with_token_ids(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.called

     def test_native_responses_api_tool_calls(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2251,7 +2251,7 @@ def test_native_responses_api_tool_calls(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.called

     def test_native_responses_api_multiturn(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2308,7 +2308,7 @@ def test_native_responses_api_multiturn(self, monkeypatch: MonkeyPatch):
         ]

     def test_native_responses_api_string_input(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)

@@ -2354,7 +2354,7 @@ def test_native_responses_api_string_input(self, monkeypatch: MonkeyPatch):
         assert mock_create_response.call_args.kwargs["input"] == "Hello"

     def test_native_responses_api_with_instructions(self, monkeypatch: MonkeyPatch):
-        server = self._setup_server(monkeypatch, use_native_responses_api=True)
+        server = self._setup_server(monkeypatch, use_responses_endpoint=True)
         app = server.setup_webserver()
         client = TestClient(app)
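
For context on how the renamed option is consumed: PATCH 13 only renames the VLLMModelConfig field, the two handler methods, the YAML defaults, and the test call sites; the way a caller opts in is otherwise unchanged. A minimal sketch of building a config with the new flag outside the test suite (the import path and the placeholder values below are assumptions for illustration, not part of the patch):

    # Sketch only: the module path and field values are illustrative assumptions.
    from responses_api_models.vllm_model.app import VLLMModelConfig

    config = VLLMModelConfig(
        host="0.0.0.0",
        port=8081,
        base_url="http://localhost:8000/v1",  # placeholder model server URL
        api_key="dummy_key",
        model="dummy_model",
        entrypoint="",
        name="",
        return_token_id_information=False,
        uses_reasoning_parser=True,
        # True: forward requests straight to the server's responses endpoint;
        # False: keep the existing chat-completions conversion path.
        use_responses_endpoint=True,
    )

With use_responses_endpoint=False the server keeps the prior behaviour via _call_chat_completions; setting it to True routes requests through _call_responses instead.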