Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions openviking/models/vlm/backends/litellm_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,8 @@ def _build_kwargs(self, model: str, messages: list) -> dict[str, Any]:
"messages": messages,
"temperature": self.temperature,
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

if self.api_key:
kwargs["api_key"] = self.api_key
Expand Down
8 changes: 8 additions & 0 deletions openviking/models/vlm/backends/openai_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str:
"messages": [{"role": "user", "content": prompt}],
"temperature": self.temperature,
}
if self.max_tokens:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Suggestion] (non-blocking)

Using `if self.max_tokens:` treats `0` the same as `None` (both are falsy). While `max_tokens=0` is never a valid API value, `if self.max_tokens is not None:` is semantically clearer and avoids any edge-case surprises. The same applies to all the other backends.

if self.max_tokens is not None:
    kwargs["max_tokens"] = self.max_tokens

kwargs["max_tokens"] = self.max_tokens

response = client.chat.completions.create(**kwargs)
self._update_token_usage_from_response(response)
Expand All @@ -77,6 +79,8 @@ async def get_completion_async(
"messages": [{"role": "user", "content": prompt}],
"temperature": self.temperature,
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

last_error = None
for attempt in range(max_retries + 1):
Expand Down Expand Up @@ -165,6 +169,8 @@ def get_vision_completion(
"messages": [{"role": "user", "content": content}],
"temperature": self.temperature,
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

response = client.chat.completions.create(**kwargs)
self._update_token_usage_from_response(response)
Expand All @@ -189,6 +195,8 @@ async def get_vision_completion_async(
"messages": [{"role": "user", "content": content}],
"temperature": self.temperature,
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

response = await client.chat.completions.create(**kwargs)
self._update_token_usage_from_response(response)
Expand Down
8 changes: 8 additions & 0 deletions openviking/models/vlm/backends/volcengine_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ def get_completion(self, prompt: str, thinking: bool = False) -> str:
"temperature": self.temperature,
"thinking": {"type": "disabled" if not thinking else "enabled"},
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

response = client.chat.completions.create(**kwargs)
self._update_token_usage_from_response(response)
Expand All @@ -84,6 +86,8 @@ async def get_completion_async(
"temperature": self.temperature,
"thinking": {"type": "disabled" if not thinking else "enabled"},
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

last_error = None
for attempt in range(max_retries + 1):
Expand Down Expand Up @@ -235,6 +239,8 @@ def get_vision_completion(
"temperature": self.temperature,
"thinking": {"type": "disabled" if not thinking else "enabled"},
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

response = client.chat.completions.create(**kwargs)
self._update_token_usage_from_response(response)
Expand All @@ -260,6 +266,8 @@ async def get_vision_completion_async(
"temperature": self.temperature,
"thinking": {"type": "disabled" if not thinking else "enabled"},
}
if self.max_tokens:
kwargs["max_tokens"] = self.max_tokens

response = await client.chat.completions.create(**kwargs)
self._update_token_usage_from_response(response)
Expand Down
1 change: 1 addition & 0 deletions openviking/models/vlm/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def __init__(self, config: Dict[str, Any]):
self.api_base = config.get("api_base")
self.temperature = config.get("temperature", 0.0)
self.max_retries = config.get("max_retries", 2)
self.max_tokens = config.get("max_tokens")

# Token usage tracking
self._token_tracker = TokenUsageTracker()
Expand Down
5 changes: 5 additions & 0 deletions openviking_cli/utils/config/vlm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ class VLMConfig(BaseModel):

default_provider: Optional[str] = Field(default=None, description="Default provider name")

max_tokens: Optional[int] = Field(
default=4096, description="Maximum tokens for VLM completion output"
)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Design] (blocking)

The default of 4096 changes behavior for all existing users, not just vLLM users. For the OpenAI / VolcEngine native APIs, omitting `max_tokens` lets the server choose its own (typically generous) default; forcing 4096 could silently truncate outputs that previously worked fine.

Suggestion: default to `None` so that `max_tokens` is only sent when the user explicitly configures it. vLLM users (who need this fix) can add `"max_tokens": 4096` to their config; everyone else remains unaffected.

max_tokens: Optional[int] = Field(
    default=None, description="Maximum tokens for VLM completion output"
)

You could also call this out more prominently in the config example in the PR description or docs, so vLLM users know to set it.


thinking: bool = Field(default=False, description="Enable thinking mode for VolcEngine models")

max_concurrent: int = Field(
Expand Down Expand Up @@ -134,6 +138,7 @@ def _build_vlm_config_dict(self) -> Dict[str, Any]:
"max_retries": self.max_retries,
"provider": name,
"thinking": self.thinking,
"max_tokens": self.max_tokens,
}

if config:
Expand Down
Loading