diff --git a/openviking/models/vlm/backends/volcengine_vlm.py b/openviking/models/vlm/backends/volcengine_vlm.py
index d8ecc701..f74cdb74 100644
--- a/openviking/models/vlm/backends/volcengine_vlm.py
+++ b/openviking/models/vlm/backends/volcengine_vlm.py
@@ -2,16 +2,13 @@
 # SPDX-License-Identifier: AGPL-3.0
 """VolcEngine VLM backend implementation."""
 
+import asyncio
 import base64
 import json
 import logging
 import time
-from collections import OrderedDict
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-from openviking.utils.model_retry import retry_async, retry_sync
-from openviking_cli.utils import run_async
+from typing import Any, Dict, List, Optional, Union
 
 from ..base import ToolCall, VLMResponse
 from .openai_vlm import OpenAIVLM
@@ -19,158 +16,59 @@
 logger = logging.getLogger(__name__)
 
 
-class LRUCache:
-    """Simple LRU cache implementation."""
-
-    def __init__(self, maxsize: int = 100):
-        self._cache = OrderedDict()
-        self._maxsize = maxsize
-
-    def get(self, key: str) -> Optional[str]:
-        if key in self._cache:
-            self._cache.move_to_end(key)
-            return self._cache[key]
-        return None
-
-    def set(self, key: str, value: str) -> None:
-        if key in self._cache:
-            self._cache.move_to_end(key)
-        self._cache[key] = value
-        if len(self._cache) > self._maxsize:
-            self._cache.popitem(last=False)
-
-    def clear(self) -> None:
-        self._cache.clear()
-
-
 class VolcEngineVLM(OpenAIVLM):
-    """VolcEngine VLM backend with prompt caching support."""
+    """VolcEngine VLM backend with Chat Completions API support."""
 
     def __init__(self, config: Dict[str, Any]):
         super().__init__(config)
         self._sync_client = None
         self._async_client = None
         self.provider = "volcengine"
-        self._response_cache = LRUCache(maxsize=100)
 
         if not self.api_base:
             self.api_base = "https://ark.cn-beijing.volces.com/api/v3"
         if not self.model:
             self.model = "doubao-seed-2-0-pro-260215"
 
-    def _get_response_id_cache_key(self, messages: List[Dict[str, Any]]) -> str:
-        """Generate cache key for response_id using JSON serialization."""
-        key_messages = []
-        for msg in messages:
-            filtered = {k: v for k, v in msg.items() if k != "cache_control"}
-            key_messages.append(filtered)
-        return json.dumps(key_messages, ensure_ascii=False, sort_keys=True)
-
-    def _parse_messages_with_breakpoints(
-        self, messages: List[Dict[str, Any]]
-    ) -> Tuple[List[List[Dict[str, Any]]], List[Dict[str, Any]]]:
-        """Split messages into cacheable prefixes and dynamic suffix."""
-        first_breakpoint_idx = -1
-        for i, msg in enumerate(messages):
-            if msg.get("cache_control"):
-                first_breakpoint_idx = i
-                break
-
-        if first_breakpoint_idx > 0:
-            static_segment = messages[: first_breakpoint_idx + 1]
-            dynamic_messages = messages[first_breakpoint_idx + 1 :]
-            return [static_segment], dynamic_messages
-
-        return [], messages
+    def _parse_tool_calls(self, message) -> List[ToolCall]:
+        """Collect ToolCall objects from a response message; string arguments are JSON-decoded."""
+        parsed: List[ToolCall] = []
+        for call in getattr(message, "tool_calls", None) or []:
+            fn = call.function
+            arguments = fn.arguments
+            if isinstance(arguments, str):
+                try:
+                    arguments = json.loads(arguments)
+                except json.JSONDecodeError:
+                    arguments = {"raw": arguments}
+            parsed.append(ToolCall(id=call.id, name=fn.name, arguments=arguments))
+        return parsed
 
-    async def _get_or_create_from_segments(
-        self, segments: List[List[Dict[str, Any]]], end_idx: int
-    ) -> Optional[str]:
-        """Recursively get or create cached prefixes."""
-        if end_idx <= 0:
-            return None
-
-        def segments_to_messages(segs: List[List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
-            msgs: List[Dict[str, Any]] = []
-            for seg in segs:
-                msgs.extend(seg)
-            return msgs
-
-        prefix = segments_to_messages(segments[:end_idx])
-        if end_idx == 1:
-            return await self._get_or_create_from_messages(prefix)
-
-        previous_response_id = await self._get_or_create_from_segments(segments, end_idx - 1)
-        return await self._get_or_create_from_messages(
-            segments_to_messages(segments[end_idx - 1 : end_idx]),
-            previous_response_id=previous_response_id,
-        )
-
-    async def _get_or_create_from_messages(
-        self, messages: List[Dict[str, Any]], previous_response_id: Optional[str] = None
-    ) -> Optional[str]:
-        """Create a cached prefix and return its response id."""
-        cache_key = self._get_response_id_cache_key(messages)
-        cached_id = self._response_cache.get(cache_key)
-        if cached_id is not None:
-            return cached_id
+    def _build_vlm_response(self, response, has_tools: bool) -> Union[str, VLMResponse]:
+        """Build response from Chat Completions response. Returns str or VLMResponse based on has_tools."""
+        choice = response.choices[0]
+        message = choice.message
 
-        client = self.get_async_client()
-        input_data = self._convert_messages_to_input(messages)
-        try:
-            response = await client.responses.create(
-                model=self.model,
-                previous_response_id=previous_response_id,
-                input=input_data,
-                caching={"type": "enabled", "prefix": True},
-                thinking={"type": "disabled"},
-            )
-            cached_id = response.id
-            self._response_cache.set(cache_key, cached_id)
-            return cached_id
-        except Exception as e:
-            logger.warning("[VolcEngineVLM] Failed to create cached prefix: %s", e)
-            return None
+        if has_tools:
+            usage = {}
+            if hasattr(response, "usage") and response.usage:
+                usage = {
+                    "prompt_tokens": response.usage.prompt_tokens,
+                    "completion_tokens": response.usage.completion_tokens,
+                    "total_tokens": response.usage.total_tokens,
+                    "prompt_tokens_details": getattr(response.usage, "prompt_tokens_details", None),
+                }
 
-    async def responseapi_prefixcache_completion(
-        self,
-        static_segments: List[List[Dict[str, Any]]],
-        dynamic_messages: List[Dict[str, Any]],
-        response_format: Optional[Dict[str, Any]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        tool_choice: Optional[str] = None,
-        thinking: bool = False,
-    ) -> Any:
-        """Call VolcEngine Responses API with optional prefix caching."""
-        if static_segments:
-            response_id = await self._get_or_create_from_segments(
-                static_segments, len(static_segments)
+            return VLMResponse(
+                content=message.content,
+                tool_calls=self._parse_tool_calls(message),
+                finish_reason=choice.finish_reason or "stop",
+                usage=usage,
             )
-        else:
-            response_id = None
-
-        client = self.get_async_client()
-        kwargs: Dict[str, Any] = {
-            "model": self.model,
-            "input": self._convert_messages_to_input(dynamic_messages),
-            "temperature": self.temperature,
-            "thinking": {"type": "enabled" if thinking else "disabled"},
-            "caching": {"type": "enabled"},
-        }
-        if self.max_tokens is not None:
-            kwargs["max_tokens"] = self.max_tokens
-        if response_format:
-            kwargs["text"] = {"format": response_format}
-        if response_id:
-            kwargs["previous_response_id"] = response_id
-        if tools:
-            kwargs["tools"] = self._convert_tools(tools)
-            kwargs["tool_choice"] = tool_choice or "auto"
-
-        return await client.responses.create(**kwargs)
+        return message.content or ""
 
     def get_client(self):
-        """Get sync client."""
+        """Return the sync client, creating it on first use."""
         if self._sync_client is None:
             try:
                 import volcenginesdkarkruntime
@@ -185,7 +83,7 @@ def get_client(self):
         return self._sync_client
 
     def get_async_client(self):
-        """Get async client."""
+        """Return the async client, creating it on first use."""
         if self._async_client is None:
             try:
                 import volcenginesdkarkruntime
@@ -199,109 +97,6 @@ def get_async_client(self):
             )
         return self._async_client
 
-    def _update_token_usage_from_response(
-        self,
-        response,
-        duration_seconds: float = 0.0,
-    ) -> None:
-        """Update token usage from either Responses API or chat completions."""
-        if hasattr(response, "usage") and response.usage:
-            usage = response.usage
-            if hasattr(usage, "input_tokens") or hasattr(usage, "output_tokens"):
-                prompt_tokens = getattr(usage, "input_tokens", 0) or 0
-                completion_tokens = getattr(usage, "output_tokens", 0) or 0
-                self.update_token_usage(
-                    model_name=self.model or "unknown",
-                    provider=self.provider,
-                    prompt_tokens=prompt_tokens,
-                    completion_tokens=completion_tokens,
-                    duration_seconds=duration_seconds,
-                )
-                return
-        super()._update_token_usage_from_response(response, duration_seconds=duration_seconds)
-
-    def _build_vlm_response(self, response, has_tools: bool) -> Union[str, VLMResponse]:
-        """Build a VLM response from Responses API or chat completions payloads."""
-        if hasattr(response, "choices"):
-            return super()._build_vlm_response(response, has_tools)
-
-        content = ""
-        tool_calls: List[ToolCall] = []
-        finish_reason = "stop"
-
-        if hasattr(response, "output") and response.output:
-            for item in response.output:
-                item_type = getattr(item, "type", None)
-                if item_type == "function_call":
-                    args = item.arguments
-                    if isinstance(args, str):
-                        try:
-                            args = json.loads(args)
-                        except json.JSONDecodeError:
-                            args = {"raw": args}
-                    tool_calls.append(
-                        ToolCall(id=item.call_id or "", name=item.name or "", arguments=args)
-                    )
-                    finish_reason = "tool_calls"
-                    continue
-
-                if item_type != "message":
-                    continue
-
-                if hasattr(item, "content"):
-                    if isinstance(item.content, list):
-                        text_parts = []
-                        for block in item.content:
-                            if getattr(block, "type", None) == "output_text":
-                                text_parts.append(block.text or "")
-                            elif hasattr(block, "text"):
-                                text_parts.append(block.text or "")
-                        content = "".join(text_parts)
-                    else:
-                        content = item.content or ""
-
-                if hasattr(item, "tool_calls") and item.tool_calls:
-                    for tc in item.tool_calls:
-                        args = tc.arguments
-                        if isinstance(args, str):
-                            try:
-                                args = json.loads(args)
-                            except json.JSONDecodeError:
-                                args = {"raw": args}
-                        tool_name = getattr(tc, "name", None)
-                        if not tool_name and hasattr(tc, "function"):
-                            tool_name = tc.function.name
-                        tool_calls.append(
-                            ToolCall(
-                                id=getattr(tc, "id", "") or "", name=tool_name or "", arguments=args
-                            )
-                        )
-
-                finish_reason = getattr(item, "finish_reason", "stop") or "stop"
-
-        usage: Dict[str, Any] = {}
-        if hasattr(response, "usage") and response.usage:
-            u = response.usage
-            usage = {
-                "prompt_tokens": getattr(u, "input_tokens", 0),
-                "completion_tokens": getattr(u, "output_tokens", 0),
-                "total_tokens": getattr(u, "total_tokens", 0),
-            }
-            input_details = getattr(u, "input_tokens_details", None)
-            if input_details:
-                usage["prompt_tokens_details"] = {
-                    "cached_tokens": getattr(input_details, "cached_tokens", 0),
-                }
-
-        if has_tools:
-            return VLMResponse(
-                content=content,
-                tool_calls=tool_calls,
-                finish_reason=finish_reason,
-                usage=usage,
-            )
-        return content
-
     def get_completion(
         self,
         prompt: str = "",
@@ -310,113 +105,29 @@ def get_completion(
         tool_choice: Optional[str] = None,
         messages: Optional[List[Dict[str, Any]]] = None,
     ) -> Union[str, VLMResponse]:
-        """Get text completion via the async Responses API implementation."""
-        return run_async(
-            self.get_completion_async(
-                prompt=prompt,
-                thinking=thinking,
-                tools=tools,
-                tool_choice=tool_choice,
-                messages=messages,
-            )
-        )
-
-    def _convert_messages_to_input(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Convert OpenAI-style messages to VolcEngine Responses API input format."""
-        input_messages = []
-        for msg in messages:
-            role = msg.get("role", "user")
-            content = msg.get("content", "")
-
-            if role == "tool_call" and isinstance(content, dict):
-                content_str = json.dumps(content, ensure_ascii=False)
-                role = "user"
-            else:
-                if isinstance(content, list):
-                    text_parts = []
-                    image_urls = []
-                    for block in content:
-                        if not isinstance(block, dict):
-                            continue
-                        block_type = block.get("type", "")
-                        if block_type == "text" or "text" in block:
-                            text = block.get("text", "")
-                            if text:
-                                text_parts.append(text)
-                        elif block_type == "image_url" or "image_url" in block:
-                            image_url = block.get("image_url", {})
-                            if isinstance(image_url, dict):
-                                url = image_url.get("url", "")
-                                if url:
-                                    image_urls.append(url)
-                        else:
-                            text = block.get("text", "")
-                            if text:
-                                text_parts.append(text)
-                    content = " ".join(text_parts)
-                    if image_urls:
-                        data_urls = [u for u in image_urls if u.startswith("data:")]
-                        if data_urls:
-                            content = (
-                                content
-                                + "\n[Images: "
-                                + ", ".join([f"data URL ({i + 1})" for i in range(len(data_urls))])
-                                + "]"
-                            )
-
-                content_str = str(content) if content else "[empty]"
-                if not content_str or content_str == "[empty]":
-                    continue
-
-                if role == "tool":
-                    content_str = f"[Tool Result]\n{content_str}"
-                    role = "user"
-
-            input_messages.append({"role": role, "content": content_str})
-
-        return input_messages
-
-    def _convert_tools(self, tools: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-        """Convert OpenAI-style tool format to VolcEngine Responses API format."""
-        converted = []
-        for tool in tools:
-            if not isinstance(tool, dict):
-                converted.append(tool)
-                continue
-
-            if tool.get("type") == "function" and "function" in tool:
-                func = tool["function"]
-                converted.append(
-                    {
-                        "type": "function",
-                        "name": func.get("name", ""),
-                        "description": func.get("description", ""),
-                        "parameters": func.get("parameters", {}),
-                    }
-                )
-            elif "function" in tool:
-                func = tool["function"]
-                converted.append(
-                    {
-                        "type": "function",
-                        "name": func.get("name", ""),
-                        "description": func.get("description", ""),
-                        "parameters": func.get("parameters", {}),
-                    }
-                )
-            elif tool.get("type") != "function":
-                converted.append(
-                    {
-                        "type": "function",
-                        "name": tool.get("name", ""),
-                        "description": tool.get("description", ""),
-                        "parameters": tool.get("parameters", {}),
-                    }
-                )
-            else:
-                converted.append(tool)
+        """Get text completion via Chat Completions API."""
+        kwargs_messages = messages or [{"role": "user", "content": prompt}]
+        kwargs = {
+            "model": self.model or "doubao-seed-2-0-pro-260215",
+            "messages": kwargs_messages,
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+        if self.max_tokens is not None:
+            kwargs["max_tokens"] = self.max_tokens
+        if tools:
+            kwargs["tools"] = tools
+            kwargs["tool_choice"] = tool_choice or "auto"
 
-        return converted
+        client = self.get_client()
+        t0 = time.perf_counter()
+        response = client.chat.completions.create(**kwargs)
+        elapsed = time.perf_counter() - t0
+        self._update_token_usage_from_response(response, duration_seconds=elapsed)
+        result = self._build_vlm_response(response, has_tools=bool(tools))
+        if tools:
+            return result
+        return self._clean_response(str(result))
 
     async def get_completion_async(
         self,
@@ -425,52 +136,10 @@ async def get_completion_async(
         tools: Optional[List[Dict[str, Any]]] = None,
         tool_choice: Optional[str] = None,
         messages: Optional[List[Dict[str, Any]]] = None,
+        max_retries: int = 0,
     ) -> Union[str, VLMResponse]:
-        """Get text completion with prompt caching support."""
+        """Get text completion asynchronously via Chat Completions API."""
         kwargs_messages = messages or [{"role": "user", "content": prompt}]
-        static_segments, dynamic_messages = self._parse_messages_with_breakpoints(kwargs_messages)
-
-        async def _call() -> Union[str, VLMResponse]:
-            response = await self.responseapi_prefixcache_completion(
-                static_segments=static_segments,
-                dynamic_messages=dynamic_messages,
-                response_format=None,
-                tools=tools,
-                tool_choice=tool_choice,
-                thinking=thinking,
-            )
-            self._update_token_usage_from_response(response, duration_seconds=0.0)
-            result = self._build_vlm_response(response, has_tools=bool(tools))
-            if tools:
-                return result
-            return self._clean_response(str(result))
-
-        return await retry_async(
-            _call,
-            max_retries=self.max_retries,
-            logger=logger,
-            operation_name="VolcEngine VLM async completion",
-        )
-
-    def _build_vision_kwargs(
-        self,
-        prompt: str = "",
-        images: Optional[List[Union[str, Path, bytes]]] = None,
-        thinking: bool = False,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        tool_choice: Optional[str] = None,
-        messages: Optional[List[Dict[str, Any]]] = None,
-    ) -> Dict[str, Any]:
-        if messages:
-            kwargs_messages = messages
-        else:
-            content = []
-            if images:
-                content.extend(self._prepare_image(img) for img in images)
-            if prompt:
-                content.append({"type": "text", "text": prompt})
-            kwargs_messages = [{"role": "user", "content": content}]
-
         kwargs = {
             "model": self.model or "doubao-seed-2-0-pro-260215",
             "messages": kwargs_messages,
@@ -482,57 +151,109 @@ def _build_vision_kwargs(
         if tools:
             kwargs["tools"] = tools
             kwargs["tool_choice"] = tool_choice or "auto"
-        return kwargs
+
+        client = self.get_async_client()
+
+        attempt = 0
+        while True:
+            try:
+                t0 = time.perf_counter()
+                response = await client.chat.completions.create(**kwargs)
+                elapsed = time.perf_counter() - t0
+                self._update_token_usage_from_response(response, duration_seconds=elapsed)
+                result = self._build_vlm_response(response, has_tools=bool(tools))
+                if tools:
+                    return result
+                return self._clean_response(str(result))
+            except Exception as e:
+                # Out of retries (or max_retries <= 0): re-raise, keeping the traceback.
+                if attempt >= max_retries:
+                    raise
+                logger.warning(
+                    "[VolcEngineVLM] retry %d/%d after error: %s", attempt + 1, max_retries, e
+                )
+                await asyncio.sleep(2**attempt)
+                attempt += 1
 
     def _detect_image_format(self, data: bytes) -> str:
-        """Detect image format from magic bytes."""
+        """Detect image format from magic bytes.
+
+        Returns the MIME type, or "image/png" if too short/unrecognized; raises ValueError for SVG.
+
+        Supported formats per VolcEngine docs:
+        https://www.volcengine.com/docs/82379/1362931
+        - JPEG, PNG, GIF, WEBP, BMP, TIFF, ICO, DIB, ICNS, SGI, JPEG2000, HEIC, HEIF
+        """
         if len(data) < 12:
+            logger.warning("[VolcEngineVLM] Image data too small: %d bytes", len(data))
             return "image/png"
 
+        # PNG: 89 50 4E 47 0D 0A 1A 0A
         if data[:8] == b"\x89PNG\r\n\x1a\n":
             return "image/png"
-        if data[:2] == b"\xff\xd8":
+        # JPEG: FF D8
+        elif data[:2] == b"\xff\xd8":
             return "image/jpeg"
-        if data[:6] in (b"GIF87a", b"GIF89a"):
+        # GIF: GIF87a or GIF89a
+        elif data[:6] in (b"GIF87a", b"GIF89a"):
             return "image/gif"
-        if data[:4] == b"RIFF" and len(data) >= 12 and data[8:12] == b"WEBP":
+        # WEBP: RIFF....WEBP
+        elif data[:4] == b"RIFF" and len(data) >= 12 and data[8:12] == b"WEBP":
             return "image/webp"
-        if data[:2] == b"BM":
+        # BMP: BM
+        elif data[:2] == b"BM":
             return "image/bmp"
-        if data[:4] == b"II*\x00" or data[:4] == b"MM\x00*":
+        # TIFF (little-endian): 49 49 2A 00
+        # TIFF (big-endian): 4D 4D 00 2A
+        elif data[:4] == b"II*\x00" or data[:4] == b"MM\x00*":
             return "image/tiff"
-        if data[:4] == b"\x00\x00\x01\x00":
+        # ICO: 00 00 01 00
+        elif data[:4] == b"\x00\x00\x01\x00":
             return "image/ico"
-        if data[:4] == b"icns":
+        # ICNS: 69 63 6E 73 ("icns")
+        elif data[:4] == b"icns":
             return "image/icns"
-        if data[:2] == b"\x01\xda":
+        # SGI: 01 DA
+        elif data[:2] == b"\x01\xda":
             return "image/sgi"
-        if data[:8] == b"\x00\x00\x00\x0cjP  " or data[:4] == b"\xff\x4f\xff\x51":
+        # JPEG2000: 00 00 00 0C 6A 50 20 20 (JP2 signature) or FF 4F FF 51 (raw codestream)
+        elif data[:8] == b"\x00\x00\x00\x0cjP  " or data[:4] == b"\xff\x4f\xff\x51":
             return "image/jp2"
-        if len(data) >= 12 and data[4:8] == b"ftyp":
+        # HEIC/HEIF: ftyp box whose major brand is heic, heif, or mif* (e.g. mif1)
+        # 00 00 00 XX 66 74 79 70 68 65 69 63 (heic)
+        # 00 00 00 XX 66 74 79 70 68 65 69 66 (heif)
+        elif len(data) >= 12 and data[4:8] == b"ftyp":
             brand = data[8:12]
             if brand == b"heic":
                 return "image/heic"
-            if brand == b"heif" or brand[:3] == b"mif":
+            elif brand == b"heif":
                 return "image/heif"
-        if data[:4] == b"<svg" or (data[:5] == b"<?xml" and b"<svg" in data[:100]):
+            elif brand[:3] == b"mif":
+                return "image/heif"
+        # SVG (not supported)
+        elif data[:4] == b"<svg" or (data[:5] == b"<?xml" and b"<svg" in data[:100]):
             raise ValueError(
                 "SVG format is not supported by VolcEngine VLM API. "
                 "Supported formats: JPEG, PNG, GIF, WEBP, BMP, TIFF, ICO, ICNS, SGI, JPEG2000, HEIC, HEIF"
             )
 
+        # Unknown format - log and default to PNG
+        logger.warning("[VolcEngineVLM] Unknown image format, magic bytes: %s", data[:16].hex())
         return "image/png"
 
     def _prepare_image(self, image: Union[str, Path, bytes]) -> Dict[str, Any]:
-        """Prepare image data for vision completion."""
+        """Convert bytes, a local path, or a URL into an image_url content block."""
         if isinstance(image, bytes):
             b64 = base64.b64encode(image).decode("utf-8")
             mime_type = self._detect_image_format(image)
+            # Lazy %-args: the message is only formatted when INFO is enabled.
+            msg = "[VolcEngineVLM] Preparing image from bytes, size=%d, detected mime=%s"
+            logger.info(msg, len(image), mime_type)
             return {
                 "type": "image_url",
                 "image_url": {"url": f"data:{mime_type};base64,{b64}"},
             }
-        if isinstance(image, Path) or (
+        elif isinstance(image, Path) or (
             isinstance(image, str) and not image.startswith(("http://", "https://"))
         ):
             path = Path(image)
@@ -565,7 +286,8 @@ def _prepare_image(self, image: Union[str, Path, bytes]) -> Dict[str, Any]:
                 "type": "image_url",
                 "image_url": {"url": f"data:{mime_type};base64,{b64}"},
             }
-        return {"type": "image_url", "image_url": {"url": image}}
+        else:
+            return {"type": "image_url", "image_url": {"url": image}}
 
     def get_vision_completion(
         self,
@@ -575,26 +297,38 @@ def get_vision_completion(
         tools: Optional[List[Dict[str, Any]]] = None,
         messages: Optional[List[Dict[str, Any]]] = None,
     ) -> Union[str, VLMResponse]:
-        """Get vision completion through chat completions."""
-        client = self.get_client()
-        kwargs = self._build_vision_kwargs(prompt, images, thinking, tools, None, messages)
+        """Get vision completion via Chat Completions API."""
+        if messages:
+            kwargs_messages = messages
+        else:
+            content = []
+            if images:
+                content.extend(self._prepare_image(img) for img in images)
+            if prompt:
+                content.append({"type": "text", "text": prompt})
+            kwargs_messages = [{"role": "user", "content": content}]
 
-        def _call() -> Union[str, VLMResponse]:
-            t0 = time.perf_counter()
-            response = client.chat.completions.create(**kwargs)
-            elapsed = time.perf_counter() - t0
-            self._update_token_usage_from_response(response, duration_seconds=elapsed)
-            result = self._build_vlm_response(response, has_tools=bool(tools))
-            if tools:
-                return result
-            return self._clean_response(str(result))
+        kwargs = {
+            "model": self.model or "doubao-seed-2-0-pro-260215",
+            "messages": kwargs_messages,
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+        if self.max_tokens is not None:
+            kwargs["max_tokens"] = self.max_tokens
+        if tools:
+            kwargs["tools"] = tools
+            kwargs["tool_choice"] = "auto"
 
-        return retry_sync(
-            _call,
-            max_retries=self.max_retries,
-            logger=logger,
-            operation_name="VolcEngine VLM vision completion",
-        )
+        client = self.get_client()
+        t0 = time.perf_counter()
+        response = client.chat.completions.create(**kwargs)
+        elapsed = time.perf_counter() - t0
+        self._update_token_usage_from_response(response, duration_seconds=elapsed)
+        result = self._build_vlm_response(response, has_tools=bool(tools))
+        if tools:
+            return result
+        return self._clean_response(str(result))
 
     async def get_vision_completion_async(
         self,
@@ -604,23 +338,35 @@ async def get_vision_completion_async(
         tools: Optional[List[Dict[str, Any]]] = None,
         messages: Optional[List[Dict[str, Any]]] = None,
     ) -> Union[str, VLMResponse]:
-        """Get vision completion asynchronously through chat completions."""
-        client = self.get_async_client()
-        kwargs = self._build_vision_kwargs(prompt, images, thinking, tools, None, messages)
+        """Get vision completion asynchronously via Chat Completions API."""
+        if messages:
+            kwargs_messages = messages
+        else:
+            content = []
+            if images:
+                content.extend(self._prepare_image(img) for img in images)
+            if prompt:
+                content.append({"type": "text", "text": prompt})
+            kwargs_messages = [{"role": "user", "content": content}]
 
-        async def _call() -> Union[str, VLMResponse]:
-            t0 = time.perf_counter()
-            response = await client.chat.completions.create(**kwargs)
-            elapsed = time.perf_counter() - t0
-            self._update_token_usage_from_response(response, duration_seconds=elapsed)
-            result = self._build_vlm_response(response, has_tools=bool(tools))
-            if tools:
-                return result
-            return self._clean_response(str(result))
+        kwargs = {
+            "model": self.model or "doubao-seed-2-0-pro-260215",
+            "messages": kwargs_messages,
+            "temperature": self.temperature,
+            "thinking": {"type": "disabled" if not thinking else "enabled"},
+        }
+        if self.max_tokens is not None:
+            kwargs["max_tokens"] = self.max_tokens
+        if tools:
+            kwargs["tools"] = tools
+            kwargs["tool_choice"] = "auto"
 
-        return await retry_async(
-            _call,
-            max_retries=self.max_retries,
-            logger=logger,
-            operation_name="VolcEngine VLM async vision completion",
-        )
+        client = self.get_async_client()
+        t0 = time.perf_counter()
+        response = await client.chat.completions.create(**kwargs)
+        elapsed = time.perf_counter() - t0
+        self._update_token_usage_from_response(response, duration_seconds=elapsed)
+        result = self._build_vlm_response(response, has_tools=bool(tools))
+        if tools:
+            return result
+        return self._clean_response(str(result))
diff --git a/openviking/session/memory/tools.py b/openviking/session/memory/tools.py
index 6818f0e1..61d08dd9 100644
--- a/openviking/session/memory/tools.py
+++ b/openviking/session/memory/tools.py
@@ -6,6 +6,7 @@
 Reference: bot/vikingbot/agent/tools/base.py design pattern
 """
 
+import json
 from abc import ABC, abstractmethod
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -65,7 +66,12 @@ def add_tool_call_pair_to_messages(
 ) -> None:
     """Add a tool call pair with optimized format to save tokens."""
     messages.append(
-        {"role": "user", "content": {"tool_call_name": tool_name, "args": params, "result": result}}
+        {
+            "role": "user",
+            "content": json.dumps(
+                {"tool_call_name": tool_name, "args": params, "result": result}, ensure_ascii=False
+            ),
+        }
     )