From 30ebc6144609fca697ce1c188a8a9592c6885c5e Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 09:13:50 -0800 Subject: [PATCH 1/7] Chat format GD for tool calling with gptoss Signed-off-by: Alec Solder --- ...test_gptoss_structural_tags_integration.py | 576 +++++++++++++----- vllm/entrypoints/harmony_utils.py | 25 + vllm/entrypoints/openai/serving_responses.py | 3 +- vllm/reasoning/abs_reasoning_parsers.py | 3 +- vllm/reasoning/gptoss_reasoning_parser.py | 170 ++++-- 5 files changed, 580 insertions(+), 197 deletions(-) diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py index fbfae4f268d5..232dc4e7ddc1 100644 --- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py +++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py @@ -7,15 +7,36 @@ from unittest.mock import Mock import pytest +import pytest_asyncio +from openai import OpenAI from vllm.entrypoints.openai.protocol import ( StructuredOutputsParams, ) -from vllm.entrypoints.tool_server import ToolServer from vllm.reasoning.gptoss_reasoning_parser import ( GptOssReasoningParser, ) +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "openai/gpt-oss-20b" + +GET_WEATHER_SCHEMA = { + "type": "function", + "name": "get_weather", + "description": "Get current temperature for provided coordinates in celsius.", + "parameters": { + "type": "object", + "properties": { + "latitude": {"type": "number"}, + "longitude": {"type": "number"}, + }, + "required": ["latitude", "longitude"], + "additionalProperties": False, + }, + "strict": True, +} + class TestGptOssStructuralTagsIntegration: """Integration tests for structural tags in GPT-OSS tool calls.""" @@ -32,72 +53,60 @@ def gptoss_parser(self, mock_tokenizer): """Create a real GptOssReasoningParser instance.""" return GptOssReasoningParser(mock_tokenizer) - @pytest.fixture - def tool_server_with_python(self): - """Create a tool server with Python tool enabled.""" - tool_server = Mock(spec=ToolServer) - tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python") - return tool_server - - @pytest.fixture - def tool_server_empty(self): - """Create a tool server with no tools.""" - tool_server = Mock(spec=ToolServer) - tool_server.has_tool = Mock(return_value=False) - return tool_server - def test_end_to_end_no_tools(self, gptoss_parser): """Test end-to-end flow when no tools are available.""" - # Test the parser directly - result = gptoss_parser.prepare_structured_tag(None, None) + # Test the parser directly with no tools + result = gptoss_parser.prepare_structured_tag(None, []) parsed_result = json.loads(result) # Verify basic structure assert parsed_result["type"] == "structural_tag" assert parsed_result["format"]["type"] == "triggered_tags" - assert len(parsed_result["format"]["tags"]) == 1 - # Verify only analysis channel is allowed - analysis_tag = parsed_result["format"]["tags"][0] - assert analysis_tag["begin"] == "<|channel|>analysis<|message|>" - assert analysis_tag["content"]["type"] == "any_text" - assert analysis_tag["end"] == "<|end|>" + # With no tools, should have 5 BASE_TAGS (commentary excluded) + # (analysis, final, and variants - no commentary) + assert len(parsed_result["format"]["tags"]) == 5 + + # Verify BASE_TAGS are present (check for key channel types) + tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] + assert "<|channel|>analysis" in tag_begins + assert "<|channel|>commentary" not in 
tag_begins # Excluded without tools + assert "<|channel|>final" in tag_begins - # Verify triggers - assert parsed_result["format"]["triggers"] == ["<|channel|>analysis"] + # Verify all tags use regex content type and <|message|> end tag + for tag in parsed_result["format"]["tags"]: + assert tag["content"]["type"] == "regex" + assert tag["end"] == "<|message|>" + + # Verify triggers include both channel and assistant start + assert "<|channel|>" in parsed_result["format"]["triggers"] + assert "<|start|>assistant" in parsed_result["format"]["triggers"] assert parsed_result["format"]["stop_after_first"] is False - def test_end_to_end_with_python_tool(self, gptoss_parser, tool_server_with_python): - """Test end-to-end flow with Python tool enabled.""" - result = gptoss_parser.prepare_structured_tag(None, tool_server_with_python) + def test_structural_tag_with_python_tool(self, gptoss_parser): + """Test structural tag generation with Python tool enabled.""" + # Python is an analysis tool + result = gptoss_parser.prepare_structured_tag(None, ["python"]) parsed_result = json.loads(result) - # Should have analysis tag + 2 python tags - assert len(parsed_result["format"]["tags"]) == 3 + # Should have 5 BASE_TAGS (no commentary) + 2 python tags + assert len(parsed_result["format"]["tags"]) == 7 - # Verify all expected tags are present + # Verify python tags are present in analysis channel tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] - expected_begins = [ - "<|channel|>analysis<|message|>", - "<|channel|>commentary to=python", - "<|channel|>analysis to=python", - ] - - for expected in expected_begins: - assert expected in tag_begins + assert "<|channel|>analysis to=python" in tag_begins + assert "<|start|>assistant<|channel|>analysis to=python" in tag_begins + # Commentary BASE_TAG should not be present (no commentary tools) + assert "<|channel|>commentary" not in tag_begins - # Verify triggers include commentary - assert "<|channel|>analysis" in parsed_result["format"]["triggers"] - assert "<|channel|>commentary to=" in parsed_result["format"]["triggers"] + # Verify triggers are set + assert "<|channel|>" in parsed_result["format"]["triggers"] + assert "<|start|>assistant" in parsed_result["format"]["triggers"] - def test_structured_outputs_params_integration( - self, gptoss_parser, tool_server_with_python - ): + def test_structured_outputs_params_integration(self, gptoss_parser): """Test integration with StructuredOutputsParams.""" - # Generate structural tag - structural_tag = gptoss_parser.prepare_structured_tag( - None, tool_server_with_python - ) + # Generate structural tag with python tool + structural_tag = gptoss_parser.prepare_structured_tag(None, ["python"]) # Create StructuredOutputsParams params = StructuredOutputsParams(structural_tag=structural_tag) @@ -110,84 +119,65 @@ def test_structured_outputs_params_integration( assert parsed_tag["type"] == "structural_tag" @pytest.mark.parametrize( - "browser, python, container, expected_tags", + "tool_names, expected_tags", [ - # No tools - (False, False, False, 1), - # Single tool - (True, False, False, 3), - # Multiple tools - (True, True, False, 5), - # All tools - (True, True, True, 7), + # No tools: 5 BASE_TAGS (no commentary) + ([], 5), + # Single analysis tool: 5 BASE_TAGS (no commentary) + 2 tags + (["python"], 7), + # Multiple analysis tools: 5 BASE_TAGS (no commentary) + 4 tags + (["python", "browser.search"], 9), + # All builtin tool types: 5 BASE_TAGS (no commentary) + 6 tags + (["python", "browser.search", 
"container.exec"], 11), ], ) def test_tool_server_interaction_flow( - self, gptoss_parser, browser, python, container, expected_tags + self, gptoss_parser, tool_names, expected_tags ): - """Test the complete tool server interaction flow.""" - - # Create a mock ToolServer - tool_server = Mock(spec=ToolServer) - - # Simulate tool availability based on parameters - tool_server.has_tool = Mock( - side_effect=lambda tool: { - "browser": browser, - "python": python, - "container": container, - }.get(tool, False) - ) + """Test the complete tool interaction flow with different tool combinations.""" # Run the parser and verify results - result = gptoss_parser.prepare_structured_tag(None, tool_server) + result = gptoss_parser.prepare_structured_tag(None, tool_names) parsed_result = json.loads(result) - # Validate number of tags + # Validate number of tags (BASE_TAGS + 2 per tool) assert len(parsed_result["format"]["tags"]) == expected_tags # Verify tool-specific tags exist for enabled tools tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] - for tool, enabled in { - "browser": browser, - "python": python, - "container": container, - }.items(): - if enabled: - assert f"<|channel|>commentary to={tool}" in tag_begins - assert f"<|channel|>analysis to={tool}" in tag_begins - - def test_original_tag_preservation(self, gptoss_parser, tool_server_with_python): + for tool_name in tool_names: + # Each tool should have both first and last message variants + assert f"<|channel|>analysis to={tool_name}" in tag_begins + assert f"<|start|>assistant<|channel|>analysis to={tool_name}" in tag_begins + + def test_original_tag_preservation(self, gptoss_parser): """Test that original tags are preserved when provided.""" original_tag = '{"type": "custom_tag", "data": "preserved"}' - result = gptoss_parser.prepare_structured_tag( - original_tag, tool_server_with_python - ) + result = gptoss_parser.prepare_structured_tag(original_tag, ["python"]) # Should return original tag unchanged assert result == original_tag @pytest.mark.parametrize( - "tools", + "tool_names", [ [], - ["browser"], ["python"], - ["container"], - ["browser", "python"], - ["browser", "container"], - ["python", "container"], - ["browser", "python", "container"], + ["browser.search"], + ["container.exec"], + ["python", "browser.search"], + ["browser.search", "container.exec"], + ["python", "container.exec"], + ["python", "browser.search", "container.exec"], + ["functions.get_weather"], # Function tool + ["python", "functions.get_weather"], # Mixed builtin and function ], ) - def test_json_validity_comprehensive(self, gptoss_parser, tools): + def test_json_validity_comprehensive(self, gptoss_parser, tool_names): """Test JSON validity across all possible tool combinations.""" - tool_server = Mock(spec=ToolServer) - tool_server.has_tool = Mock(side_effect=lambda tool: tool in tools) - - result = gptoss_parser.prepare_structured_tag(None, tool_server) + result = gptoss_parser.prepare_structured_tag(None, tool_names) # Should be valid JSON parsed_result = json.loads(result) @@ -198,32 +188,35 @@ def test_json_validity_comprehensive(self, gptoss_parser, tools): assert "tags" in parsed_result["format"] assert "triggers" in parsed_result["format"] - # Tag count should be: 1 (analysis) + 2 * len(tools) - expected_tag_count = 1 + (2 * len(tools)) + # Tag count depends on whether there are commentary tools + has_commentary_tools = any(tool.startswith("functions") for tool in tool_names) + base_tag_count = 6 if has_commentary_tools else 5 + 
expected_tag_count = base_tag_count + (2 * len(tool_names)) assert len(parsed_result["format"]["tags"]) == expected_tag_count - def test_error_handling_invalid_tool_server(self, gptoss_parser): - """Test error handling with invalid tool server.""" - # Tool server that raises exceptions - tool_server = Mock(spec=ToolServer) - tool_server.has_tool = Mock(side_effect=Exception("Tool server error")) + def test_error_handling_empty_tool_names(self, gptoss_parser): + """Test handling of empty list for tool names.""" + # Empty list should be handled gracefully + result_empty = gptoss_parser.prepare_structured_tag(None, []) + parsed_empty = json.loads(result_empty) + + # Should have only BASE_TAGS (5 without commentary) + assert len(parsed_empty["format"]["tags"]) == 5 - # Should handle gracefully and still return a valid tag - with pytest.raises(Exception, match="Tool server error"): - gptoss_parser.prepare_structured_tag(None, tool_server) + # Verify it's valid JSON with correct structure + assert parsed_empty["type"] == "structural_tag" + assert "format" in parsed_empty + assert "tags" in parsed_empty["format"] def test_concurrent_requests_isolation(self, gptoss_parser): """Test that concurrent requests don't interfere with each other.""" - # Simulate concurrent requests with different tool servers - tool_server_1 = Mock(spec=ToolServer) - tool_server_1.has_tool = Mock(side_effect=lambda tool: tool == "python") - - tool_server_2 = Mock(spec=ToolServer) - tool_server_2.has_tool = Mock(side_effect=lambda tool: tool == "browser") + # Simulate concurrent requests with different tool configurations + tool_names_1 = ["python"] + tool_names_2 = ["browser.search"] # Generate tags concurrently - result_1 = gptoss_parser.prepare_structured_tag(None, tool_server_1) - result_2 = gptoss_parser.prepare_structured_tag(None, tool_server_2) + result_1 = gptoss_parser.prepare_structured_tag(None, tool_names_1) + result_2 = gptoss_parser.prepare_structured_tag(None, tool_names_2) # Parse results parsed_1 = json.loads(result_1) @@ -233,22 +226,19 @@ def test_concurrent_requests_isolation(self, gptoss_parser): tags_1 = [tag["begin"] for tag in parsed_1["format"]["tags"]] tags_2 = [tag["begin"] for tag in parsed_2["format"]["tags"]] - # Result 1 should have python tags - assert "<|channel|>commentary to=python" in tags_1 - assert "<|channel|>commentary to=browser" not in tags_1 + # Result 1 should have python tags but not browser tags + assert "<|channel|>analysis to=python" in tags_1 + assert "<|channel|>analysis to=browser.search" not in tags_1 - # Result 2 should have browser tags - assert "<|channel|>commentary to=browser" in tags_2 - assert "<|channel|>commentary to=python" not in tags_2 + # Result 2 should have browser tags but not python tags + assert "<|channel|>analysis to=browser.search" in tags_2 + assert "<|channel|>analysis to=python" not in tags_2 def test_tag_format_consistency(self, gptoss_parser): """Test that all generated tags follow consistent format.""" - tool_server = Mock(spec=ToolServer) - tool_server.has_tool = Mock( - side_effect=lambda tool: tool in ["python", "browser"] - ) + tool_names = ["python", "browser.search"] - result = gptoss_parser.prepare_structured_tag(None, tool_server) + result = gptoss_parser.prepare_structured_tag(None, tool_names) parsed_result = json.loads(result) # Verify all tags have required fields @@ -256,25 +246,333 @@ def test_tag_format_consistency(self, gptoss_parser): assert "begin" in tag assert "content" in tag assert "end" in tag - assert 
tag["content"]["type"] == "any_text" - assert tag["end"] == "<|end|>" + assert tag["content"]["type"] == "regex" - # Verify begin format - assert tag["begin"].startswith("<|channel|>") + # End tag should always end with <|message|> + # BASE_TAGS have just "<|message|>" + # Tool-specific tags have content type prefix like: + # " code<|message|>" or " <|constrain|>json<|message|>" + assert tag["end"].endswith("<|message|>") + + # Verify begin format starts with channel or assistant start + assert tag["begin"].startswith("<|channel|>") or tag["begin"].startswith( + "<|start|>assistant" + ) def test_trigger_configuration(self, gptoss_parser): """Test trigger configuration for different tool setups.""" # Test with no tools - result_no_tools = gptoss_parser.prepare_structured_tag(None, None) + result_no_tools = gptoss_parser.prepare_structured_tag(None, []) parsed_no_tools = json.loads(result_no_tools) - assert parsed_no_tools["format"]["triggers"] == ["<|channel|>analysis"] - # Test with tools - tool_server = Mock(spec=ToolServer) - tool_server.has_tool = Mock(side_effect=lambda tool: tool == "python") + # Triggers should include channel and assistant start + expected_triggers = ["<|channel|>", "<|start|>assistant"] + assert set(parsed_no_tools["format"]["triggers"]) == set(expected_triggers) - result_with_tools = gptoss_parser.prepare_structured_tag(None, tool_server) + # Test with tools - should have same triggers + result_with_tools = gptoss_parser.prepare_structured_tag(None, ["python"]) parsed_with_tools = json.loads(result_with_tools) - expected_triggers = ["<|channel|>analysis", "<|channel|>commentary to="] + # Triggers remain the same regardless of tools assert set(parsed_with_tools["format"]["triggers"]) == set(expected_triggers) + + def test_tool_names_with_namespaces(self, gptoss_parser): + """Test that tool names with namespace prefixes work correctly.""" + # Test with all namespace formats + tool_names = [ + "python", # Special case - no namespace + "browser.search", # Browser namespace + "browser.find", + "browser.open", + "container.exec", # Container namespace + "functions.get_weather", # Functions namespace + "functions.calculate", + ] + + result = gptoss_parser.prepare_structured_tag(None, tool_names) + parsed_result = json.loads(result) + + # Should have 6 BASE_TAGS + 2 * 7 tools = 20 tags + assert len(parsed_result["format"]["tags"]) == 20 + + # Verify all tool names appear in tags + tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] + for tool_name in tool_names: + # Check that tool appears in at least one tag + # Functions go to commentary channel, others to analysis + if tool_name.startswith("functions"): + assert f"<|channel|>commentary to={tool_name}" in tag_begins + else: + assert f"<|channel|>analysis to={tool_name}" in tag_begins + + def test_analysis_vs_commentary_tools(self, gptoss_parser): + """Test tools categorization into analysis vs commentary channels.""" + # Analysis tools: builtin tools (python, browser.*, container.*) + analysis_tools = ["python", "browser.search", "container.exec"] + + # Commentary tools: custom functions + commentary_tools = ["functions.get_weather", "functions.calculate"] + + # Test with both types + all_tools = analysis_tools + commentary_tools + result = gptoss_parser.prepare_structured_tag(None, all_tools) + parsed_result = json.loads(result) + + tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] + + # Verify analysis tools go to analysis channel + for tool in analysis_tools: + assert 
f"<|channel|>analysis to={tool}" in tag_begins + assert f"<|start|>assistant<|channel|>analysis to={tool}" in tag_begins + # Should NOT be in commentary + assert f"<|channel|>commentary to={tool}" not in tag_begins + + # Verify commentary tools go to commentary channel + for tool in commentary_tools: + assert f"<|channel|>commentary to={tool}" in tag_begins + assert f"<|start|>assistant<|channel|>commentary to={tool}" in tag_begins + # Should NOT be in analysis + assert f"<|channel|>analysis to={tool}" not in tag_begins + + def test_mixed_function_and_builtin_tools(self, gptoss_parser): + """Test structural tag generation with mixed function and builtin tools.""" + # Test with both function and builtin tools + tool_names = [ + "python", # Code interpreter (builtin) + "functions.get_weather", # Custom function + "functions.search_database", # Another custom function + ] + + result = gptoss_parser.prepare_structured_tag(None, tool_names) + parsed_result = json.loads(result) + + # Verify valid JSON structure + assert parsed_result["type"] == "structural_tag" + assert parsed_result["format"]["type"] == "triggered_tags" + + # Should have 6 BASE_TAGS + 2 * 3 tools = 12 tags + assert len(parsed_result["format"]["tags"]) == 12 + + tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] + + # Verify python (builtin) is in analysis channel + assert "<|channel|>analysis to=python" in tag_begins + assert "<|start|>assistant<|channel|>analysis to=python" in tag_begins + + # Verify functions are in commentary channel + assert "<|channel|>commentary to=functions.get_weather" in tag_begins + assert ( + "<|start|>assistant<|channel|>commentary to=functions.get_weather" + in tag_begins + ) + assert "<|channel|>commentary to=functions.search_database" in tag_begins + assert ( + "<|start|>assistant<|channel|>commentary to=functions.search_database" + in tag_begins + ) + + # Verify triggers are correct + assert "<|channel|>" in parsed_result["format"]["triggers"] + assert "<|start|>assistant" in parsed_result["format"]["triggers"] + + # Verify at_least_one and stop_after_first settings + assert parsed_result["format"]["at_least_one"] is True + assert parsed_result["format"]["stop_after_first"] is False + + +# E2E tests with real server and model inference +@pytest.fixture(scope="module") +def monkeypatch_module(): + from _pytest.monkeypatch import MonkeyPatch + + mpatch = MonkeyPatch() + yield mpatch + mpatch.undo() + + +@pytest.fixture(scope="module") +def server(monkeypatch_module: pytest.MonkeyPatch): + """Start vLLM server with gpt-oss model for e2e tests.""" + args = [ + "--enforce-eager", + "--tool-server", + "demo", + "--structured_outputs_config=" + '{"enable_in_reasoning": true, "reasoning_parser": "openai_gptoss"}', + ] + + with monkeypatch_module.context() as m: + m.setenv("VLLM_ENABLE_RESPONSES_API_STORE", "1") + m.setenv("PYTHON_EXECUTION_BACKEND", "dangerously_use_uv") + # Helps the model follow instructions better + m.setenv("VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS", "1") + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + """Get async OpenAI client for e2e tests.""" + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_e2e_mixed_function_and_builtin_tools(client: OpenAI, model_name: str): + """E2E test with both function tools and code_interpreter (builtin tool). 
+ + This test validates that the structural tags work correctly when both + function tools and builtin tools are available, ensuring proper channel + categorization (functions -> commentary, builtin -> analysis). + """ + tools = [ + GET_WEATHER_SCHEMA, # Function tool + {"type": "code_interpreter", "container": {"type": "auto"}}, # Builtin tool + ] + + # Make a request that could potentially use either tool + response = await client.responses.create( + model=model_name, + input=( + "Calculate 15 * 23 using code, then tell me what the weather " + "is like at coordinates 48.8566, 2.3522" + ), + tools=tools, + temperature=0.0, # More deterministic + ) + + assert response is not None + assert response.status == "completed" + + # Should have output items (reasoning, code_interpreter, function_call, or message) + assert len(response.output) > 0 + + # Validate that we can have different output types + output_types = [item.type for item in response.output] + print(f"Output types: {output_types}") + + # At least one output should exist + assert len(output_types) > 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_e2e_mixed_tools_streaming(client: OpenAI, model_name: str): + """E2E streaming test with both function and builtin tools. + + Validates that structural tags work correctly in streaming mode with + mixed tool types. + """ + tools = [ + GET_WEATHER_SCHEMA, # Function tool + {"type": "code_interpreter", "container": {"type": "auto"}}, # Builtin tool + ] + + stream_response = await client.responses.create( + model=model_name, + input="What is 17 * 19? Use code to calculate.", + tools=tools, + stream=True, + temperature=0.0, + ) + + assert stream_response is not None + + events = [] + async for event in stream_response: + events.append(event) + if event.type == "response.completed": + assert event.response.status == "completed" + assert len(event.response.output) > 0 + + # Verify we received events + assert len(events) > 0 + + # Verify we have a completed event + assert any(e.type == "response.completed" for e in events) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_e2e_function_tool_edge_case_names(client: OpenAI, model_name: str): + """E2E test with function tool having edge case name (100 alternating L's). + + Without guided decoding via structural tags, the model would struggle + to generate valid tool calls with unusual names like 100 alternating L's. + This test validates that structural tags enable correct generation. + """ + # Function tool with name that's alternating uppercase/lowercase L's + # 100 characters total - edge case that would typically cause issues + # without proper guided decoding + edge_case_name = "".join("Ll" for _ in range(50)) # "LlLlLl..." 100 chars + + edge_case_tool = { + "type": "function", + "name": edge_case_name, + "description": "A test function with alternating L's in the name", + "parameters": { + "type": "object", + "properties": { + "value": {"type": "number"}, + }, + "required": ["value"], + "additionalProperties": False, + }, + "strict": True, + } + + response = await client.responses.create( + model=model_name, + input="Use the LlLl tool with value 42", + instructions=( + "You must call the function tool available to you. " + "Just jump straight into calling it." 
+ ), + tools=[edge_case_tool], + temperature=0.0, + extra_body={"enable_response_messages": True}, + reasoning={"effort": "low"}, + ) + + assert response is not None + assert response.status == "completed" + + # Validate using output_messages like in test_response_api_mcp_tools.py + tool_call_found = False + tool_call_on_commentary = False + + for message in response.output_messages: + recipient = message.get("recipient") + channel = message.get("channel") + + # Check for function tool call with edge case name + expected_recipient = f"functions.{edge_case_name}" + if recipient and recipient == expected_recipient: + tool_call_found = True + # Function tools should go to commentary channel + assert channel == "commentary", ( + "Function tool call should be on commentary channel" + ) + tool_call_on_commentary = True + + # Verify the structural tags enabled successful tool call generation + assert tool_call_found, ( + f"Should have found a function tool call with edge case name " + f"(100 char alternating L's). " + f"Structural tags should enable this despite unusual name. " + f"Expected recipient: 'functions.{edge_case_name[:20]}...'" + ) + assert tool_call_on_commentary, "Function tool should be on commentary channel" + + # Verify input messages have developer message with function tool + developer_message_found = False + for message in response.input_messages: + author = message.get("author", {}) + if author.get("role") == "developer": + developer_message_found = True + break + + assert developer_message_found, ( + "Developer message with function tool should be present in input_messages" + ) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 7958d0317739..6e95c8a15594 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -502,6 +502,31 @@ def get_stop_tokens_for_assistant_actions() -> list[int]: return get_encoding().stop_tokens_for_assistant_actions() +def get_tool_names_from_messages(messages: list[Message]) -> set[str]: + """ + Returns a set of tool names for the purpose of guided decoding + """ + tool_names: set[str] = set() + for message in messages: + if message.author.role == Role.SYSTEM or message.author.role == Role.DEVELOPER: + assert len(message.content) == 1, ( + f"SYSTEM/DEVELOPER messages should have exactly 1 content item, " + f"got {len(message.content)}" + ) + message_content = message.content[0] + assert isinstance(message_content, (SystemContent, DeveloperContent)), ( + f"SYSTEM/DEVELOPER message content should be SystemContent or " + f"DeveloperContent, got {type(message_content).__name__}" + ) + tool_namespace_configs = ( + message_content.tools.values() if message_content.tools else [] + ) + for tool_namespace_config in tool_namespace_configs: + for tool in tool_namespace_config.tools: + tool_names.add(f"{tool_namespace_config.name}.{tool.name}") + return tool_names + + def get_streamable_parser_for_assistant() -> StreamableParser: return StreamableParser(get_encoding(), role=Role.ASSISTANT) diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 2ee8de5fba07..da10039541d9 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -68,6 +68,7 @@ get_developer_message, get_stop_tokens_for_assistant_actions, get_system_message, + get_tool_names_from_messages, get_user_message, has_custom_tools, parse_output_message, @@ -401,7 +402,7 @@ async def create_responses( 
             sampling_params.structured_outputs.structural_tag = (
                 reasoning_parser.prepare_structured_tag(
                     sampling_params.structured_outputs.structural_tag,
-                    self.tool_server,
+                    get_tool_names_from_messages(messages),
                 )
             )
         generator = self._generate_with_builtin_tools(
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
index ebd660ca5a84..2da56d52d698 100644
--- a/vllm/reasoning/abs_reasoning_parsers.py
+++ b/vllm/reasoning/abs_reasoning_parsers.py
@@ -7,7 +7,6 @@
 from functools import cached_property
 from typing import TYPE_CHECKING, Any
 
-from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import import_from_path
@@ -119,7 +118,7 @@ def extract_reasoning_content_streaming(
     def prepare_structured_tag(
         self,
         original_tag: str | None,
-        tool_server: ToolServer | None,
+        tool_names: list[str] | None = None,
     ) -> str:
         """
         Instance method that is implemented for preparing the structured tag
diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py
index e6766ddcbc68..bf81f094bc4e 100644
--- a/vllm/reasoning/gptoss_reasoning_parser.py
+++ b/vllm/reasoning/gptoss_reasoning_parser.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
 import json
 from collections.abc import Sequence
 
@@ -7,65 +8,133 @@
 from vllm.entrypoints.harmony_utils import parse_chat_output
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage
-from vllm.entrypoints.tool_server import ToolServer
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 
 logger = init_logger(__name__)
 
-no_func_reaonsing_tag = {
+TRIGGERS = ["<|channel|>", "<|start|>assistant"]
+BASE_TAGS = [
+    # Allow normal reasoning messages as the first message
+    {
+        "type": "tag",
+        "begin": "<|channel|>analysis",
+        "content": {"type": "regex", "pattern": "(?:)"},
+        "end": "<|message|>",
+    },
+    {
+        "type": "tag",
+        "begin": "<|channel|>commentary",
+        "content": {"type": "regex", "pattern": "(?:)"},
+        "end": "<|message|>",
+    },
+    # Allow final messages as the first message
+    {
+        "type": "tag",
+        "begin": "<|channel|>final",
+        "content": {"type": "regex", "pattern": "(?:)"},
+        "end": "<|message|>",
+    },
+    # Allow final messages as the last message
+    {
+        "type": "tag",
+        "begin": "<|start|>assistant<|channel|>final",
+        "content": {"type": "regex", "pattern": "(?:)"},
+        "end": "<|message|>",
+    },
+    # The same cases, but where the model will use <|constrain|>json
+    # when the user is asking for JSON output
+    {
+        "type": "tag",
+        "begin": "<|channel|>final <|constrain|>json",
+        "content": {"type": "regex", "pattern": "(?:)"},
+        "end": "<|message|>",
+    },
+    {
+        "type": "tag",
+        "begin": "<|start|>assistant<|channel|>final <|constrain|>json",
+        "content": {"type": "regex", "pattern": "(?:)"},
+        "end": "<|message|>",
+    },
+]
+
+
+STRUCTURAL_TAG_TEMPLATE = {
     "type": "structural_tag",
     "format": {
         "type": "triggered_tags",
-        "tags": [
-            {
-                "begin": "<|channel|>analysis<|message|>",
-                "content": {"type": "any_text"},
-                "end": "<|end|>",
-            }
-        ],
-        "triggers": ["<|channel|>analysis"],
+        "triggers": ["<|channel|>", "<|start|>assistant"],
+        "tags": [],
+        "at_least_one": True,
         "stop_after_first": False,
     },
 }
 
 
-def from_builtin_tool_to_tag(tool: str) -> list[dict]:
-    tag = [
+def create_tool_tags(channel_name: str, tool_name: 
str) -> list[dict]: + """ + Generate tool-specific tags based on channel name and tool name. + + Args: + channel_name: The channel name (e.g., "analysis", "commentary") + tool_name: The tool name (e.g., "python", "container") + + Returns: + List of two tag dictionaries for first and last message positions + """ + analysis_content_type = "code" + commentary_content_type = "<|constrain|>json" + content_type = ( + analysis_content_type if channel_name == "analysis" else commentary_content_type + ) + return [ + # Tool as first message { - "begin": f"<|channel|>commentary to={tool}", - "content": {"type": "any_text"}, - "end": "<|end|>", + "type": "tag", + "begin": f"<|channel|>{channel_name} to={tool_name}", + "content": {"type": "regex", "pattern": "(?:)"}, + "end": f" {content_type}<|message|>", }, + # Tool as last message + # It is critical to have this as the model often makes mistakes + # between `<|start|>assistant` and `<|channel|>` tags + # so there needs to be an extra case to prevent it { - "begin": f"<|channel|>analysis to={tool}", - "content": {"type": "any_text"}, - "end": "<|end|>", + "type": "tag", + "begin": f"<|start|>assistant<|channel|>{channel_name} to={tool_name}", + "content": {"type": "regex", "pattern": "(?:)"}, + "end": f" {content_type}<|message|>", }, ] - return tag -def tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list: list[str]) -> dict: - import copy +def get_structural_tags(analysis_tools: set[str], commentary_tools: set[str]): + # Start with base tags, but conditionally include commentary tag + if commentary_tools: + # Include all BASE_TAGS if there are commentary tools + tags = BASE_TAGS.copy() + else: + # Exclude commentary BASE_TAG if no commentary tools + tags = [tag for tag in BASE_TAGS if tag["begin"] != "<|channel|>commentary"] + + # Add tool-specific tags for commentary channel + for tool_name in commentary_tools: + if tool_name: # Skip empty strings from split + tags.extend(create_tool_tags("commentary", tool_name)) - new_tag = copy.deepcopy(no_func_reaonsing_tag) - new_tag["format"]["triggers"].append("<|channel|>commentary to=") + # Add tool-specific tags for analysis channel + for tool_name in analysis_tools: + if tool_name: # Skip empty strings from split + tags.extend(create_tool_tags("analysis", tool_name)) - for tool in builtin_tool_list: - new_tag["format"]["tags"].extend(from_builtin_tool_to_tag(tool)) - return new_tag + # Build the complete structural tag + structural_tags = copy.deepcopy(STRUCTURAL_TAG_TEMPLATE) + structural_tags["format"]["tags"] = tags + return json.dumps(structural_tags) @ReasoningParserManager.register_module("openai_gptoss") class GptOssReasoningParser(ReasoningParser): - """ - Reasoning parser for GptOss model. - - The GptOss model uses harmony to extract reasoning content and this parser - is only used for detecting the end of the reasoning content. 
- """ - def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) self.reasoning_end_token_ids = self.model_tokenizer.encode( @@ -128,30 +197,21 @@ def extract_reasoning_content( # This function prepares the structural tag to format reasoning output def prepare_structured_tag( - self, original_tag: str | None, tool_server: ToolServer | None + self, + original_tag: str | None, + tool_names: list[str] | None = None, ) -> str: - if original_tag is None: - if tool_server is None: - return json.dumps(no_func_reaonsing_tag) - else: - builtin_tool_list: list[str] = [] - if tool_server.has_tool("browser"): - builtin_tool_list.append("browser") - if tool_server.has_tool("python"): - builtin_tool_list.append("python") - if tool_server.has_tool("container"): - builtin_tool_list.append("container") - - if len(builtin_tool_list) > 0: - logger.info("Builtin_tool_list: %s", builtin_tool_list) - func_tag = json.dumps( - tag_with_builtin_funcs(no_func_reaonsing_tag, builtin_tool_list) - ) + # Easiest way to separate based on channel for now + analysis_tools = set() + commentary_tools = set() + if tool_names: + for tool_name in tool_names: + if tool_name.startswith("functions"): + commentary_tools.add(tool_name) else: - logger.info("Builtin_tool_list is empty") - func_tag = json.dumps(no_func_reaonsing_tag) - - return func_tag + analysis_tools.add(tool_name) + if original_tag is None: + return get_structural_tags(analysis_tools, commentary_tools) else: # There is potential risk for appending the tag to the original tag return original_tag From c3016e5d0e0087111098538be00f453974511dcb Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 09:23:37 -0800 Subject: [PATCH 2/7] Clean up reasoning parser logic Signed-off-by: Alec Solder --- vllm/reasoning/gptoss_reasoning_parser.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index bf81f094bc4e..7b9bd143c51f 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -201,6 +201,8 @@ def prepare_structured_tag( original_tag: str | None, tool_names: list[str] | None = None, ) -> str: + if original_tag is not None: + return original_tag # Easiest way to separate based on channel for now analysis_tools = set() commentary_tools = set() @@ -210,8 +212,4 @@ def prepare_structured_tag( commentary_tools.add(tool_name) else: analysis_tools.add(tool_name) - if original_tag is None: - return get_structural_tags(analysis_tools, commentary_tools) - else: - # There is potential risk for appending the tag to the original tag - return original_tag + return get_structural_tags(analysis_tools, commentary_tools) From 8bbbd3747366b02ca978d19fb8280285ac1dee7b Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 10:06:08 -0800 Subject: [PATCH 3/7] Require enable_in_reasoning Signed-off-by: Alec Solder --- .../test_gptoss_structural_tags_integration.py | 3 +-- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/serving_chat.py | 2 ++ vllm/entrypoints/openai/serving_responses.py | 17 ++++++++++++++--- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py index 232dc4e7ddc1..632b96a02a77 100644 --- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py +++ 
b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py @@ -397,8 +397,7 @@ def server(monkeypatch_module: pytest.MonkeyPatch): "--enforce-eager", "--tool-server", "demo", - "--structured_outputs_config=" - '{"enable_in_reasoning": true, "reasoning_parser": "openai_gptoss"}', + '--structured_outputs_config={"enable_in_reasoning": true}', ] with monkeypatch_module.context() as m: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index e184f22f3630..ebcf20525c0f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1776,11 +1776,11 @@ async def init_app_state( enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, tool_server=tool_server, - reasoning_parser=args.structured_outputs_config.reasoning_parser, enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, log_error_stack=args.log_error_stack, + structured_outputs_config=args.structured_outputs_config, ) if "generate" in supported_tasks else None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 25979d5502b0..ed5c384ec096 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -252,6 +252,8 @@ async def create_chat_completion( request_prompts, engine_prompts, ) = self._make_request_with_harmony(request) + # TODO: Add gptoss reasoning parser prepare_structured_tag + # here like in serving_responses except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") return self.create_error_response(f"{e} {e.__cause__}") diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index da10039541d9..49bf2a56a733 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -52,6 +52,7 @@ from openai_harmony import Message as OpenAIHarmonyMessage from vllm import envs +from vllm.config.structured_outputs import StructuredOutputsConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, @@ -135,7 +136,6 @@ def __init__( chat_template: str | None, chat_template_content_format: ChatTemplateContentFormatOption, return_tokens_as_token_ids: bool = False, - reasoning_parser: str = "", enable_auto_tools: bool = False, tool_parser: str | None = None, tool_server: ToolServer | None = None, @@ -143,6 +143,7 @@ def __init__( enable_force_include_usage: bool = False, enable_log_outputs: bool = False, log_error_stack: bool = False, + structured_outputs_config: StructuredOutputsConfig | None = None, ) -> None: super().__init__( engine_client=engine_client, @@ -157,8 +158,11 @@ def __init__( self.enable_log_outputs = enable_log_outputs self.reasoning_parser = self._get_reasoning_parser( - reasoning_parser_name=reasoning_parser + "" + if not structured_outputs_config + else structured_outputs_config.reasoning_parser ) + self.structured_outputs_config = structured_outputs_config self.enable_prompt_tokens_details = enable_prompt_tokens_details self.enable_force_include_usage = enable_force_include_usage self.default_sampling_params = self.model_config.get_diff_sampling_param() @@ -393,7 +397,14 @@ async def create_responses( else: context = SimpleContext() - if self.reasoning_parser is not None: + # Enable in 
reasoning must be true since structural tags are + # currently used to guide the harmony chat format + # which is technically in the reasoning, not the content + if ( + self.reasoning_parser is not None + and self.structured_outputs_config + and self.structured_outputs_config.enable_in_reasoning + ): reasoning_parser = self.reasoning_parser(tokenizer) if sampling_params.structured_outputs is None: sampling_params.structured_outputs = StructuredOutputsParams() From f5954b8eee67aefd119d305179c9dab1d8387213 Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 10:17:15 -0800 Subject: [PATCH 4/7] Accidentally removed docstring Signed-off-by: Alec Solder --- vllm/reasoning/gptoss_reasoning_parser.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 7b9bd143c51f..0e2834ea51e2 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -135,6 +135,12 @@ def get_structural_tags(analysis_tools: set[str], commentary_tools: set[str]): @ReasoningParserManager.register_module("openai_gptoss") class GptOssReasoningParser(ReasoningParser): + """ + Reasoning parser for GptOss model. + The GptOss model uses harmony to extract reasoning content and this parser + is only used for detecting the end of the reasoning content. + """ + def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) self.reasoning_end_token_ids = self.model_tokenizer.encode( From 644439c87bc8392ab7abad8fd265e697f853a04a Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 10:23:25 -0800 Subject: [PATCH 5/7] Change function signature Signed-off-by: Alec Solder --- vllm/reasoning/gptoss_reasoning_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 0e2834ea51e2..889f0f8f758d 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -205,7 +205,7 @@ def extract_reasoning_content( def prepare_structured_tag( self, original_tag: str | None, - tool_names: list[str] | None = None, + tool_names: set[str] | None = None, ) -> str: if original_tag is not None: return original_tag From 5540a7ed7c3b44655981365b6280b65a2c11363b Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 10:29:12 -0800 Subject: [PATCH 6/7] And the abs one too Signed-off-by: Alec Solder --- vllm/reasoning/abs_reasoning_parsers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 2da56d52d698..4432658bdfd3 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -118,7 +118,7 @@ def extract_reasoning_content_streaming( def prepare_structured_tag( self, original_tag: str | None, - tool_names: list[str] | None = None, + tool_names: set[str] | None = None, ) -> str: """ Instance method that is implemented for preparing the structured tag From e63c0efcaf73430334643eaadc892edbf2e52230 Mon Sep 17 00:00:00 2001 From: Alec Solder Date: Wed, 5 Nov 2025 14:16:04 -0800 Subject: [PATCH 7/7] When commentary is enabled, make commentary valid for analysis tools Signed-off-by: Alec Solder --- ...test_gptoss_structural_tags_integration.py | 157 +++++++++++------- vllm/entrypoints/harmony_utils.py | 4 + vllm/reasoning/gptoss_reasoning_parser.py | 25 ++- 
3 files changed, 120 insertions(+), 66 deletions(-) diff --git a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py index 632b96a02a77..be1cc553ea77 100644 --- a/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py +++ b/tests/entrypoints/openai/test_gptoss_structural_tags_integration.py @@ -189,9 +189,22 @@ def test_json_validity_comprehensive(self, gptoss_parser, tool_names): assert "triggers" in parsed_result["format"] # Tag count depends on whether there are commentary tools - has_commentary_tools = any(tool.startswith("functions") for tool in tool_names) - base_tag_count = 6 if has_commentary_tools else 5 - expected_tag_count = base_tag_count + (2 * len(tool_names)) + num_analysis_tools = sum( + 1 for tool in tool_names if not tool.startswith("functions") + ) + num_commentary_tools = sum( + 1 for tool in tool_names if tool.startswith("functions") + ) + + if num_commentary_tools > 0: + # With commentary: 6 BASE_TAGS + 4 per analysis + 2 per commentary + expected_tag_count = ( + 6 + (4 * num_analysis_tools) + (2 * num_commentary_tools) + ) + else: + # Without commentary: 5 BASE_TAGS + 2 per analysis + expected_tag_count = 5 + (2 * num_analysis_tools) + assert len(parsed_result["format"]["tags"]) == expected_tag_count def test_error_handling_empty_tool_names(self, gptoss_parser): @@ -292,18 +305,23 @@ def test_tool_names_with_namespaces(self, gptoss_parser): result = gptoss_parser.prepare_structured_tag(None, tool_names) parsed_result = json.loads(result) - # Should have 6 BASE_TAGS + 2 * 7 tools = 20 tags - assert len(parsed_result["format"]["tags"]) == 20 + # 5 analysis tools, 2 commentary tools + # 6 BASE_TAGS + (5 × 4) + (2 × 2) = 6 + 20 + 4 = 30 tags + assert len(parsed_result["format"]["tags"]) == 30 # Verify all tool names appear in tags tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] for tool_name in tool_names: - # Check that tool appears in at least one tag - # Functions go to commentary channel, others to analysis + # Functions go to commentary channel only + # Analysis tools go to BOTH channels (when commentary exists) if tool_name.startswith("functions"): assert f"<|channel|>commentary to={tool_name}" in tag_begins + # Should NOT be on analysis + assert f"<|channel|>analysis to={tool_name}" not in tag_begins else: + # Analysis tools on both channels when commentary exists assert f"<|channel|>analysis to={tool_name}" in tag_begins + assert f"<|channel|>commentary to={tool_name}" in tag_begins def test_analysis_vs_commentary_tools(self, gptoss_parser): """Test tools categorization into analysis vs commentary channels.""" @@ -319,21 +337,39 @@ def test_analysis_vs_commentary_tools(self, gptoss_parser): parsed_result = json.loads(result) tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]] + all_tags = parsed_result["format"]["tags"] - # Verify analysis tools go to analysis channel + # When commentary tools exist, analysis tools get BOTH channels for tool in analysis_tools: + # Analysis tools always on analysis channel assert f"<|channel|>analysis to={tool}" in tag_begins assert f"<|start|>assistant<|channel|>analysis to={tool}" in tag_begins - # Should NOT be in commentary - assert f"<|channel|>commentary to={tool}" not in tag_begins + # Also on commentary (to handle model flipping) + assert f"<|channel|>commentary to={tool}" in tag_begins + assert f"<|start|>assistant<|channel|>commentary to={tool}" in tag_begins + + # Verify 
content_type: analysis tools use "code" on BOTH channels
+            for tag in all_tags:
+                if f"to={tool}" in tag["begin"]:
+                    assert tag["end"] == " code<|message|>", (
+                        f"Analysis tool {tool} should use 'code' format on "
+                        f"{tag['begin'].split('|')[1]} channel"
+                    )
 
-        # Verify commentary tools go to commentary channel
+        # Verify commentary tools go to commentary channel only
         for tool in commentary_tools:
             assert f"<|channel|>commentary to={tool}" in tag_begins
             assert f"<|start|>assistant<|channel|>commentary to={tool}" in tag_begins
             # Should NOT be in analysis
             assert f"<|channel|>analysis to={tool}" not in tag_begins
 
+            # Verify content_type: function tools use "json" format
+            for tag in all_tags:
+                if f"to={tool}" in tag["begin"]:
+                    assert tag["end"] == " <|constrain|>json<|message|>", (
+                        f"Function tool {tool} should use 'json' format"
+                    )
+
     def test_mixed_function_and_builtin_tools(self, gptoss_parser):
         """Test structural tag generation with mixed function and builtin tools."""
         # Test with both function and builtin tools
@@ -350,16 +386,19 @@ def test_mixed_function_and_builtin_tools(self, gptoss_parser):
         assert parsed_result["type"] == "structural_tag"
         assert parsed_result["format"]["type"] == "triggered_tags"
 
-        # Should have 6 BASE_TAGS + 2 * 3 tools = 12 tags
-        assert len(parsed_result["format"]["tags"]) == 12
+        # 1 analysis tool, 2 commentary tools
+        # 6 BASE_TAGS + (1 × 4) + (2 × 2) = 6 + 4 + 4 = 14 tags
+        assert len(parsed_result["format"]["tags"]) == 14
 
         tag_begins = [tag["begin"] for tag in parsed_result["format"]["tags"]]
 
-        # Verify python (builtin) is in analysis channel
+        # Verify python (builtin) is on BOTH channels (commentary exists)
         assert "<|channel|>analysis to=python" in tag_begins
         assert "<|start|>assistant<|channel|>analysis to=python" in tag_begins
+        assert "<|channel|>commentary to=python" in tag_begins
+        assert "<|start|>assistant<|channel|>commentary to=python" in tag_begins
 
-        # Verify functions are in commentary channel
+        # Verify functions are in commentary channel only
         assert "<|channel|>commentary to=functions.get_weather" in tag_begins
         assert (
             "<|start|>assistant<|channel|>commentary to=functions.get_weather"
@@ -370,6 +409,9 @@ def test_mixed_function_and_builtin_tools(self, gptoss_parser):
             "<|start|>assistant<|channel|>commentary to=functions.search_database"
             in tag_begins
         )
+        # Functions should NOT be on analysis
+        assert "<|channel|>analysis to=functions.get_weather" not in tag_begins
+        assert "<|channel|>analysis to=functions.search_database" not in tag_begins
 
         # Verify triggers are correct
         assert "<|channel|>" in parsed_result["format"]["triggers"]
@@ -418,6 +460,7 @@ async def client(server):
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.skip(reason="gpt-oss 20b messes up too much, works on 120b though")
 async def test_e2e_mixed_function_and_builtin_tools(client: OpenAI, model_name: str):
     """E2E test with both function tools and code_interpreter (builtin tool). 
@@ -427,70 +470,64 @@ async def test_e2e_mixed_function_and_builtin_tools(client: OpenAI, model_name: """ tools = [ GET_WEATHER_SCHEMA, # Function tool - {"type": "code_interpreter", "container": {"type": "auto"}}, # Builtin tool + {"type": "code_interpreter", "container": {"type": "auto"}}, ] # Make a request that could potentially use either tool response = await client.responses.create( model=model_name, input=( - "Calculate 15 * 23 using code, then tell me what the weather " + "Execute the following code using the python tool: " + "import random; print(random.randint(1, 50), random.randint(1, 50))" + "\n Then tell me what the weather " "is like at coordinates 48.8566, 2.3522" ), tools=tools, - temperature=0.0, # More deterministic + instructions=( + "You must use the Python tool to execute code. Never simulate execution." + ), + extra_body={"enable_response_messages": True}, ) assert response is not None assert response.status == "completed" - # Should have output items (reasoning, code_interpreter, function_call, or message) - assert len(response.output) > 0 - - # Validate that we can have different output types - output_types = [item.type for item in response.output] - print(f"Output types: {output_types}") + # Validate using output_messages - check recipients and channels + python_tool_call_found = False + python_tool_response_found = False + function_tool_call_found = False - # At least one output should exist - assert len(output_types) > 0 - - -@pytest.mark.asyncio -@pytest.mark.parametrize("model_name", [MODEL_NAME]) -async def test_e2e_mixed_tools_streaming(client: OpenAI, model_name: str): - """E2E streaming test with both function and builtin tools. + for message in response.output_messages: + recipient = message.get("recipient") + channel = message.get("channel") + author = message.get("author", {}) - Validates that structural tags work correctly in streaming mode with - mixed tool types. - """ - tools = [ - GET_WEATHER_SCHEMA, # Function tool - {"type": "code_interpreter", "container": {"type": "auto"}}, # Builtin tool - ] + # Check for python tool call + if recipient and recipient.startswith("python"): + python_tool_call_found = True + + # Check for python tool response + if ( + author.get("role") == "tool" + and author.get("name") + and author.get("name").startswith("python") + ): + python_tool_response_found = True + + # Check for function tool call + if recipient and recipient.startswith("functions.get_weather"): + function_tool_call_found = True + assert channel == "commentary", ( + "Function tool call should be on commentary channel" + ) - stream_response = await client.responses.create( - model=model_name, - input="What is 17 * 19? 
Use code to calculate.", - tools=tools, - stream=True, - temperature=0.0, + # Verify both tool types were used + assert python_tool_call_found, "Should have found python tool call" + assert python_tool_response_found, "Should have found python tool response" + assert function_tool_call_found, ( + "Should have found function tool call on commentary channel" ) - assert stream_response is not None - - events = [] - async for event in stream_response: - events.append(event) - if event.type == "response.completed": - assert event.response.status == "completed" - assert len(event.response.output) > 0 - - # Verify we received events - assert len(events) > 0 - - # Verify we have a completed event - assert any(e.type == "response.completed" for e in events) - @pytest.mark.asyncio @pytest.mark.parametrize("model_name", [MODEL_NAME]) diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py index 6e95c8a15594..f7078b271bfd 100644 --- a/vllm/entrypoints/harmony_utils.py +++ b/vllm/entrypoints/harmony_utils.py @@ -522,6 +522,10 @@ def get_tool_names_from_messages(messages: list[Message]) -> set[str]: message_content.tools.values() if message_content.tools else [] ) for tool_namespace_config in tool_namespace_configs: + # gpt-oss special case for python tool not needing a namespace + if tool_namespace_config.name == "python": + tool_names.add("python") + continue for tool in tool_namespace_config.tools: tool_names.add(f"{tool_namespace_config.name}.{tool.name}") return tool_names diff --git a/vllm/reasoning/gptoss_reasoning_parser.py b/vllm/reasoning/gptoss_reasoning_parser.py index 889f0f8f758d..56692fb95540 100644 --- a/vllm/reasoning/gptoss_reasoning_parser.py +++ b/vllm/reasoning/gptoss_reasoning_parser.py @@ -71,22 +71,30 @@ } -def create_tool_tags(channel_name: str, tool_name: str) -> list[dict]: +def create_tool_tags( + channel_name: str, tool_name: str, content_type: str | None = None +) -> list[dict]: """ Generate tool-specific tags based on channel name and tool name. Args: channel_name: The channel name (e.g., "analysis", "commentary") tool_name: The tool name (e.g., "python", "container") + content_type: Optional explicit content type. If not provided, + inferred from channel. Returns: List of two tag dictionaries for first and last message positions """ - analysis_content_type = "code" - commentary_content_type = "<|constrain|>json" - content_type = ( - analysis_content_type if channel_name == "analysis" else commentary_content_type - ) + if content_type is None: + analysis_content_type = "code" + commentary_content_type = "<|constrain|>json" + content_type = ( + analysis_content_type + if channel_name == "analysis" + else commentary_content_type + ) + return [ # Tool as first message { @@ -126,6 +134,11 @@ def get_structural_tags(analysis_tools: set[str], commentary_tools: set[str]): for tool_name in analysis_tools: if tool_name: # Skip empty strings from split tags.extend(create_tool_tags("analysis", tool_name)) + # If commentary tools exist, also allow analysis tools on commentary + # This handles model training issue where it flips between channels + # Use "code" content type (analysis tools keep their format) + if commentary_tools: + tags.extend(create_tool_tags("commentary", tool_name, "code")) # Build the complete structural tag structural_tags = copy.deepcopy(STRUCTURAL_TAG_TEMPLATE)
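
For reviewers who want to sanity-check the channel routing this series ends up with, here is a minimal sketch (not part of the patch) exercising the module-level get_structural_tags helper in its final form. The tool names are illustrative; it assumes vLLM with all seven patches applied.

# Illustrative sketch only: "functions.*" tools route to the commentary
# channel, builtins like "python" route to analysis and, per PATCH 7,
# also become valid on commentary once any commentary tool exists.
import json

from vllm.reasoning.gptoss_reasoning_parser import get_structural_tags

tag = json.loads(get_structural_tags({"python"}, {"functions.get_weather"}))
begins = [t["begin"] for t in tag["format"]["tags"]]

assert "<|channel|>analysis to=python" in begins
assert "<|channel|>commentary to=python" in begins  # flip-tolerant extra case
assert "<|channel|>commentary to=functions.get_weather" in begins
assert "<|channel|>analysis to=functions.get_weather" not in begins
# 6 BASE_TAGS + 4 tags for python + 2 for the function tool
assert len(begins) == 12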