63 changes: 49 additions & 14 deletions src/copaw/local_models/tag_parser.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 """Parse special tags from model-generated text.
 
-Handles ``<think>...</think>`` (reasoning) and
+Handles ``<think>...</think>`` / ``<thought>...</thought>`` (reasoning) and
 ``<tool_call>...</tool_call>`` (function calling) tags that local models
 like Qwen3-Instruct embed in their raw text output.
 """
@@ -23,6 +23,9 @@
 THINK_START = "<think>"
 THINK_END = "</think>"
 
+THOUGHT_START = "<thought>"
+THOUGHT_END = "</thought>"
+
 TOOL_CALL_START = "<tool_call>"
 TOOL_CALL_END = "</tool_call>"
 
@@ -32,6 +35,12 @@
     re.DOTALL,
 )
 
+# Regex to find a complete <thought>...</thought> block (non-greedy).
+_THOUGHT_RE = re.compile(
+    r"<thought>(.*?)</thought>",
+    re.DOTALL,
+)
+
 # Regex to find complete <tool_call>...</tool_call> blocks (non-greedy).
 _TOOL_CALL_RE = re.compile(
     r"<tool_call>\s*(.*?)\s*</tool_call>",
@@ -269,19 +278,26 @@ def _parse_single_tool_call(raw_text: str) -> ParsedToolCall | None:
 
 
 def text_contains_think_tag(text: str) -> bool:
-    """Fast substring check for a ``<think>`` tag."""
-    return THINK_START in text
+    """Fast substring check for a ``<think>`` or ``<thought>`` tag."""
+    return THINK_START in text or THOUGHT_START in text
 
 
 def extract_thinking_from_text(text: str) -> TextWithThinking:
-    """Extract ``<think>...</think>`` content from *text*.
+    """Extract ``<think>...</think>`` or ``<thought>...</thought>`` content
+    from *text*.
+
+    Both tag variants are supported: ``<think>`` (used by models such as
+    Qwen3-Instruct) and ``<thought>`` (used by some other providers).
 
     Returns a :class:`TextWithThinking` with:
 
     * ``thinking`` – the reasoning content (empty if none found)
-    * ``remaining_text`` – everything outside the think tags
-    * ``has_open_tag`` – ``True`` if ``<think>`` opened but not closed yet
+    * ``remaining_text`` – everything outside the think/thought tags
+    * ``has_open_tag`` – ``True`` if an opening tag was found but the
+      matching closing tag has not yet been seen
+      (streaming scenario)
     """
+    # Try <think>...</think> first.
     match = _THINK_RE.search(text)
     if match:
         thinking = match.group(1).strip()
@@ -291,18 +307,37 @@ def extract_thinking_from_text(text: str) -> TextWithThinking:
             remaining_text=remaining,
         )
 
-    # No complete block — check for an unclosed <think>.
-    open_idx = text.find(THINK_START)
-    if open_idx != -1:
-        remaining = text[:open_idx].strip()
-        partial = text[open_idx + len(THINK_START) :]
+    # Try <thought>...</thought>.
+    match = _THOUGHT_RE.search(text)
+    if match:
+        thinking = match.group(1).strip()
+        remaining = (text[: match.start()] + text[match.end() :]).strip()
         return TextWithThinking(
-            thinking=partial.strip(),
+            thinking=thinking,
             remaining_text=remaining,
-            has_open_tag=True,
         )
 
-    return TextWithThinking(remaining_text=text)
+    # No complete block — check for an unclosed <think> or <thought>.
+    think_idx = text.find(THINK_START)
+    thought_idx = text.find(THOUGHT_START)
+
+    # Pick whichever open tag appears first (if both present).
+    if think_idx != -1 and (thought_idx == -1 or think_idx <= thought_idx):
+        open_idx = think_idx
+        open_tag_len = len(THINK_START)
+    elif thought_idx != -1:
+        open_idx = thought_idx
+        open_tag_len = len(THOUGHT_START)
+    else:
+        return TextWithThinking(remaining_text=text)
+
+    remaining = text[:open_idx].strip()
+    partial = text[open_idx + open_tag_len :]
+    return TextWithThinking(
+        thinking=partial.strip(),
+        remaining_text=remaining,
+        has_open_tag=True,
+    )
 
 
 def text_contains_tool_call_tag(text: str) -> bool:
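For reviewers, a minimal usage sketch of the new dual-tag behavior. It assumes only what the diff shows, plus one assumption: that `TextWithThinking` defaults `has_open_tag` to False when it is not passed explicitly.

    from copaw.local_models.tag_parser import extract_thinking_from_text

    # Complete <thought> block: reasoning is extracted, tags are stripped.
    result = extract_thinking_from_text(
        "<thought>Check the units first.</thought>The answer is 42.",
    )
    assert result.thinking == "Check the units first."
    assert result.remaining_text == "The answer is 42."
    assert result.has_open_tag is False

    # Streaming chunk with an unclosed <think>: the partial reasoning is
    # captured and has_open_tag signals that more content is coming.
    partial = extract_thinking_from_text("Intro text <think>step one, then")
    assert partial.thinking == "step one, then"
    assert partial.remaining_text == "Intro text"
    assert partial.has_open_tag is True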
84 changes: 44 additions & 40 deletions src/copaw/providers/capability_baseline.py
@@ -268,16 +268,17 @@ def _load_baseline(self) -> None:
                 note="GLM text/code models are text-only",
             ),
         )
-        self._register(
-            ExpectedCapability(
-                provider_id="zhipu-cn",
-                model_id="glm-5v-turbo",
-                expected_image=True,
-                expected_video=False,
-                doc_url=_zhipu_cn_doc,
-                note="GLM vision model supports image input",
-            ),
-        )
+        for mid in ("glm-5v-turbo", "glm-4v", "glm-4v-plus", "glm-4v-flash", "glm-4.6v-flash"):
+            self._register(
+                ExpectedCapability(
+                    provider_id="zhipu-cn",
+                    model_id=mid,
+                    expected_image=True,
+                    expected_video=False,
+                    doc_url=_zhipu_cn_doc,
+                    note="GLM vision model supports image input",
+                ),
+            )
 
         # ---------------------------------------------------------------
         # Zhipu Coding Plan (BigModel)
@@ -294,16 +295,17 @@ def _load_baseline(self) -> None:
                 note="GLM text/code models are text-only",
             ),
         )
-        self._register(
-            ExpectedCapability(
-                provider_id="zhipu-cn-codingplan",
-                model_id="glm-5v-turbo",
-                expected_image=True,
-                expected_video=False,
-                doc_url=_zhipu_cn_cp_doc,
-                note="GLM vision model supports image input",
-            ),
-        )
+        for mid in ("glm-5v-turbo", "glm-4v", "glm-4v-plus", "glm-4v-flash", "glm-4.6v-flash"):
+            self._register(
+                ExpectedCapability(
+                    provider_id="zhipu-cn-codingplan",
+                    model_id=mid,
+                    expected_image=True,
+                    expected_video=False,
+                    doc_url=_zhipu_cn_cp_doc,
+                    note="GLM vision model supports image input",
+                ),
+            )
 
         # ---------------------------------------------------------------
         # Zhipu (Z.AI)
@@ -320,16 +322,17 @@ def _load_baseline(self) -> None:
                 note="GLM text/code models are text-only",
             ),
         )
-        self._register(
-            ExpectedCapability(
-                provider_id="zhipu-intl",
-                model_id="glm-5v-turbo",
-                expected_image=True,
-                expected_video=False,
-                doc_url=_zhipu_intl_doc,
-                note="GLM vision model supports image input",
-            ),
-        )
+        for mid in ("glm-5v-turbo", "glm-4v", "glm-4v-plus", "glm-4v-flash", "glm-4.6v-flash"):
+            self._register(
+                ExpectedCapability(
+                    provider_id="zhipu-intl",
+                    model_id=mid,
+                    expected_image=True,
+                    expected_video=False,
+                    doc_url=_zhipu_intl_doc,
+                    note="GLM vision model supports image input",
+                ),
+            )
 
         # ---------------------------------------------------------------
         # Zhipu Coding Plan (Z.AI)
@@ -346,16 +349,17 @@ def _load_baseline(self) -> None:
                 note="GLM text/code models are text-only",
             ),
         )
-        self._register(
-            ExpectedCapability(
-                provider_id="zhipu-intl-codingplan",
-                model_id="glm-5v-turbo",
-                expected_image=True,
-                expected_video=False,
-                doc_url=_zhipu_intl_cp_doc,
-                note="GLM vision model supports image input",
-            ),
-        )
+        for mid in ("glm-5v-turbo", "glm-4v", "glm-4v-plus", "glm-4v-flash", "glm-4.6v-flash"):
+            self._register(
+                ExpectedCapability(
+                    provider_id="zhipu-intl-codingplan",
+                    model_id=mid,
+                    expected_image=True,
+                    expected_video=False,
+                    doc_url=_zhipu_intl_cp_doc,
+                    note="GLM vision model supports image input",
+                ),
+            )
 
         # ---------------------------------------------------------------
         # 4. OpenAI
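One design note: the same five vision model IDs are now repeated verbatim across four provider sections. A hedged sketch of how a follow-up could hoist them into shared constants — the `_GLM_VISION_*` names are hypothetical, not part of this PR, and the interleaved text-only registrations are omitted for brevity:

    # Hypothetical refactor: one shared tuple instead of four inline
    # copies of the same model IDs.
    _GLM_VISION_MODEL_IDS = (
        "glm-5v-turbo",
        "glm-4v",
        "glm-4v-plus",
        "glm-4v-flash",
        "glm-4.6v-flash",
    )

    _GLM_VISION_PROVIDERS = (
        ("zhipu-cn", _zhipu_cn_doc),
        ("zhipu-cn-codingplan", _zhipu_cn_cp_doc),
        ("zhipu-intl", _zhipu_intl_doc),
        ("zhipu-intl-codingplan", _zhipu_intl_cp_doc),
    )

    for provider_id, doc_url in _GLM_VISION_PROVIDERS:
        for mid in _GLM_VISION_MODEL_IDS:
            self._register(
                ExpectedCapability(
                    provider_id=provider_id,
                    model_id=mid,
                    expected_image=True,
                    expected_video=False,
                    doc_url=doc_url,
                    note="GLM vision model supports image input",
                ),
            )

This would keep the four provider sections from drifting apart when the next vision model is added.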
28 changes: 28 additions & 0 deletions src/copaw/providers/openai_chat_model_compat.py
@@ -13,7 +13,9 @@
 from pydantic import BaseModel
 
 from copaw.local_models.tag_parser import (
+    extract_thinking_from_text,
     parse_tool_calls_from_text,
+    text_contains_think_tag,
     text_contains_tool_call_tag,
 )
 
@@ -268,11 +270,29 @@ async def _parse_openai_stream_response(
         # --- 2. Scan text/content blocks ---
         # Some models emit <tool_call> tags directly in their
         # response text instead of (or in addition to) thinking.
+        # Others embed reasoning inside <think>/<thought> tags
+        # in the text rather than via reasoning_content.
         new_content: list | None = None
+        injected_thinking_blocks: list = []
         for i, block in enumerate(parsed.content):
             if block.get("type") != "text":
                 continue
             text = block.get("text") or ""
+
+            # --- 2a. Extract <think>/<thought> tags from text ---
+            if text_contains_think_tag(text):
+                think_result = extract_thinking_from_text(text)
+                if think_result.thinking or think_result.has_open_tag:
+                    injected_thinking_blocks.append(
+                        {
+                            "type": "thinking",
+                            "thinking": think_result.thinking,
+                        },
+                    )
+                    text = think_result.remaining_text
+                    block["text"] = text
+
+            # --- 2b. Extract <tool_call> tags from text ---
             if not text_contains_tool_call_tag(text):
                 continue
 
@@ -300,6 +320,14 @@ async def _parse_openai_stream_response(
             new_content = list(parsed.content)
             new_content[i] = None  # type: ignore[index]
 
+        if injected_thinking_blocks:
+            # Prepend extracted thinking blocks before existing content.
+            parsed.content = injected_thinking_blocks + list(
+                parsed.content,
+            )
+            # Rebuild new_content index offsets after prepending.
+            new_content = None
+
         if new_content is not None:
             parsed.content = [b for b in new_content if b is not None]
Review comment (Contributor, severity: medium) on lines +323 to 332:

The current logic for prepending injected_thinking_blocks resets new_content to None (line 329), which effectively bypasses the filtering of empty text blocks that were marked for removal earlier in the loop. If a text block becomes empty after extracting thinking or tool-call tags, it should be removed from the final content list.

The filtering should be applied to parsed.content before prepending the new blocks to ensure the response remains clean.

Suggested change
-        if injected_thinking_blocks:
-            # Prepend extracted thinking blocks before existing content.
-            parsed.content = injected_thinking_blocks + list(
-                parsed.content,
-            )
-            # Rebuild new_content index offsets after prepending.
-            new_content = None
-
-        if new_content is not None:
-            parsed.content = [b for b in new_content if b is not None]
+        if new_content is not None:
+            parsed.content = [b for b in new_content if b is not None]
+
+        if injected_thinking_blocks:
+            # Prepend extracted thinking blocks before existing content.
+            parsed.content = injected_thinking_blocks + list(parsed.content)


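To make the ordering issue concrete, a minimal self-contained illustration. The block values are hypothetical, and it assumes `parsed.content` is a plain list of dicts, as the surrounding code suggests:

    # State after the scan loop: block 0's text became empty once its
    # tags were extracted, so it was marked for removal via new_content.
    parsed_content = [{"type": "text", "text": ""}]
    new_content = [None]  # position 0 marked for removal
    injected = [{"type": "thinking", "thinking": "step one"}]

    # Order in the diff: prepend, then reset new_content. The marked
    # empty text block is never filtered and leaks into the output.
    buggy = injected + list(parsed_content)
    assert buggy == [
        {"type": "thinking", "thinking": "step one"},
        {"type": "text", "text": ""},  # stale empty block survives
    ]

    # Suggested order: filter the marked blocks first, then prepend.
    filtered = [b for b in new_content if b is not None]
    fixed = injected + filtered
    assert fixed == [{"type": "thinking", "thinking": "step one"}]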
28 changes: 28 additions & 0 deletions src/copaw/providers/provider_manager.py
@@ -172,6 +172,34 @@
         supports_video=False,
         probe_source="documentation",
     ),
+    ModelInfo(
+        id="glm-4v",
+        name="glm-4v",
+        supports_image=True,
+        supports_video=False,
+        probe_source="documentation",
+    ),
+    ModelInfo(
+        id="glm-4v-plus",
+        name="glm-4v-plus",
+        supports_image=True,
+        supports_video=False,
+        probe_source="documentation",
+    ),
+    ModelInfo(
+        id="glm-4v-flash",
+        name="glm-4v-flash",
+        supports_image=True,
+        supports_video=False,
+        probe_source="documentation",
+    ),
+    ModelInfo(
+        id="glm-4.6v-flash",
+        name="glm-4.6v-flash",
+        supports_image=True,
+        supports_video=False,
+        probe_source="documentation",
+    ),
 ]
 
 OPENAI_MODELS: List[ModelInfo] = [
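The same five vision IDs now live in the capability baseline, this model list, and the test below. A hedged sketch of a guard test a follow-up could add — `ModelInfo` fields are taken from the diff above, while the `ZHIPU_MODELS` list name is an assumption inferred from the `OPENAI_MODELS` naming:

    # Hypothetical guard test keeping the model list and the capability
    # baseline from drifting apart.
    GLM_VISION_IDS = {
        "glm-5v-turbo",
        "glm-4v",
        "glm-4v-plus",
        "glm-4v-flash",
        "glm-4.6v-flash",
    }

    def test_glm_vision_models_marked_as_image_capable() -> None:
        from copaw.providers.provider_manager import ZHIPU_MODELS
        listed = {m.id for m in ZHIPU_MODELS if m.supports_image}
        assert GLM_VISION_IDS <= listed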
4 changes: 4 additions & 0 deletions tests/unit/providers/test_provider_manager.py
@@ -126,6 +126,10 @@ def test_builtin_zhipu_providers_registered(isolated_secret_dir) -> None:
         "glm-5.1",
         "glm-5-turbo",
         "glm-5v-turbo",
+        "glm-4v",
+        "glm-4v-plus",
+        "glm-4v-flash",
+        "glm-4.6v-flash",
     ]
 
 