Skip to content

Commit 985c60a

Browse files
mvanhorn and claude authored
fix(vlm): add max_tokens parameter to VLM completion calls to prevent vLLM rejection (#689)
* fix(vlm): add max_tokens parameter to VLM completion calls to prevent vLLM rejection

  Without max_tokens, vLLM allocates all context space to input tokens and assigns 0 output tokens, rejecting requests with "You passed N input tokens and requested 0 output tokens." Even when prompts fit, the model has no guaranteed output space, leading to truncated or empty responses.

  This adds max_tokens support across all VLM backends:
  - VLMConfig: new max_tokens field (default 4096)
  - VLMBase: reads max_tokens from config dict
  - OpenAI, VolcEngine, LiteLLM backends: pass max_tokens in API calls
  - Conditional inclusion (if self.max_tokens) so None disables the limit

  Fixes #674

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix(vlm): default max_tokens to None to preserve provider behavior

  Change default from 4096 to None so max_tokens is only sent when explicitly configured. Prevents silently truncating outputs on OpenAI/VolcEngine where omitting max_tokens lets the server choose. Also use `is not None` instead of truthiness for max_tokens guards.

  Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 63b78a3 commit 985c60a

5 files changed

Lines changed: 24 additions & 0 deletions

File tree

openviking/models/vlm/backends/litellm_vlm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,8 @@ def _build_kwargs(self, model: str, messages: list) -> dict[str, Any]:
199199
"messages": messages,
200200
"temperature": self.temperature,
201201
}
202+
if self.max_tokens is not None:
203+
kwargs["max_tokens"] = self.max_tokens
202204

203205
if self.api_key:
204206
kwargs["api_key"] = self.api_key

openviking/models/vlm/backends/openai_vlm.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str:
6262
"messages": [{"role": "user", "content": prompt}],
6363
"temperature": self.temperature,
6464
}
65+
if self.max_tokens is not None:
66+
kwargs["max_tokens"] = self.max_tokens
6567

6668
response = client.chat.completions.create(**kwargs)
6769
self._update_token_usage_from_response(response)
@@ -77,6 +79,8 @@ async def get_completion_async(
7779
"messages": [{"role": "user", "content": prompt}],
7880
"temperature": self.temperature,
7981
}
82+
if self.max_tokens is not None:
83+
kwargs["max_tokens"] = self.max_tokens
8084

8185
last_error = None
8286
for attempt in range(max_retries + 1):
@@ -165,6 +169,8 @@ def get_vision_completion(
165169
"messages": [{"role": "user", "content": content}],
166170
"temperature": self.temperature,
167171
}
172+
if self.max_tokens is not None:
173+
kwargs["max_tokens"] = self.max_tokens
168174

169175
response = client.chat.completions.create(**kwargs)
170176
self._update_token_usage_from_response(response)
@@ -189,6 +195,8 @@ async def get_vision_completion_async(
189195
"messages": [{"role": "user", "content": content}],
190196
"temperature": self.temperature,
191197
}
198+
if self.max_tokens is not None:
199+
kwargs["max_tokens"] = self.max_tokens
192200

193201
response = await client.chat.completions.create(**kwargs)
194202
self._update_token_usage_from_response(response)

openviking/models/vlm/backends/volcengine_vlm.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str:
6868
"temperature": self.temperature,
6969
"thinking": {"type": "disabled" if not thinking else "enabled"},
7070
}
71+
if self.max_tokens is not None:
72+
kwargs["max_tokens"] = self.max_tokens
7173

7274
response = client.chat.completions.create(**kwargs)
7375
self._update_token_usage_from_response(response)
@@ -84,6 +86,8 @@ async def get_completion_async(
8486
"temperature": self.temperature,
8587
"thinking": {"type": "disabled" if not thinking else "enabled"},
8688
}
89+
if self.max_tokens is not None:
90+
kwargs["max_tokens"] = self.max_tokens
8791

8892
last_error = None
8993
for attempt in range(max_retries + 1):
@@ -235,6 +239,8 @@ def get_vision_completion(
235239
"temperature": self.temperature,
236240
"thinking": {"type": "disabled" if not thinking else "enabled"},
237241
}
242+
if self.max_tokens is not None:
243+
kwargs["max_tokens"] = self.max_tokens
238244

239245
response = client.chat.completions.create(**kwargs)
240246
self._update_token_usage_from_response(response)
@@ -260,6 +266,8 @@ async def get_vision_completion_async(
260266
"temperature": self.temperature,
261267
"thinking": {"type": "disabled" if not thinking else "enabled"},
262268
}
269+
if self.max_tokens is not None:
270+
kwargs["max_tokens"] = self.max_tokens
263271

264272
response = await client.chat.completions.create(**kwargs)
265273
self._update_token_usage_from_response(response)

openviking/models/vlm/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ def __init__(self, config: Dict[str, Any]):
2525
self.api_base = config.get("api_base")
2626
self.temperature = config.get("temperature", 0.0)
2727
self.max_retries = config.get("max_retries", 2)
28+
self.max_tokens = config.get("max_tokens")
2829

2930
# Token usage tracking
3031
self._token_tracker = TokenUsageTracker()

openviking_cli/utils/config/vlm_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ class VLMConfig(BaseModel):
2626

2727
default_provider: Optional[str] = Field(default=None, description="Default provider name")
2828

29+
max_tokens: Optional[int] = Field(
30+
default=None, description="Maximum tokens for VLM completion output (None = provider default)"
31+
)
32+
2933
thinking: bool = Field(default=False, description="Enable thinking mode for VolcEngine models")
3034

3135
max_concurrent: int = Field(
@@ -134,6 +138,7 @@ def _build_vlm_config_dict(self) -> Dict[str, Any]:
134138
"max_retries": self.max_retries,
135139
"provider": name,
136140
"thinking": self.thinking,
141+
"max_tokens": self.max_tokens,
137142
}
138143

139144
if config:

0 commit comments

Comments (0)