Skip to content

Commit a4fc218

Browse files
[Bugfix] Fix the first stream event still containing prompt_token_ids when return_token_ids=False. (vllm-project#27561)
Signed-off-by: chaunceyjiang <[email protected]>
1 parent a3e8611 commit a4fc218

File tree

2 files changed

+13
-3
lines changed

2 files changed

+13
-3
lines changed

tests/entrypoints/openai/test_return_token_ids.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,12 @@ def server():
2727

2828

2929
@pytest.mark.asyncio
30-
async def test_basic_completion_with_emoji(server):
30+
@pytest.mark.parametrize("return_token_ids", [True, False, None])
31+
async def test_basic_completion_with_emoji(server, return_token_ids: bool | None):
3132
"""Test basic completion with emoji to verify token_ids field."""
33+
extra_body = None
34+
if return_token_ids is not None:
35+
extra_body = {"return_token_ids": return_token_ids}
3236
async with server.get_async_client() as client:
3337
# Test with return_token_ids enabled
3438
completion = await client.completions.create(
@@ -37,14 +41,20 @@ async def test_basic_completion_with_emoji(server):
3741
max_tokens=10,
3842
temperature=0,
3943
logprobs=1,
40-
extra_body={"return_token_ids": True},
44+
extra_body=extra_body,
4145
)
4246

4347
# Check the raw response to see the structure
4448
completion_dict = completion.model_dump()
4549

4650
# Verify prompt_token_ids field is present in the completion response
4751
assert "prompt_token_ids" in completion_dict["choices"][0]
52+
if not return_token_ids:
53+
# If return_token_ids is False, token_ids should not be present
54+
assert completion_dict["choices"][0].get("token_ids") is None
55+
assert completion_dict["choices"][0].get("prompt_token_ids") is None
56+
# Skip further checks
57+
return
4858
assert isinstance(completion.choices[0].prompt_token_ids, list)
4959

5060
# Check against the expected prompt token IDs

vllm/entrypoints/openai/serving_completion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ async def completion_stream_generator(
399399

400400
# has_echoed[i] is reused here to indicate whether
401401
# we have already returned the prompt token IDs.
402-
if not has_echoed[i]:
402+
if not has_echoed[i] and request.return_token_ids:
403403
prompt_token_ids_to_return = prompt_token_ids
404404
has_echoed[i] = True
405405

0 commit comments

Comments
 (0)