agentscope-ai · shaohuaxi · Apr 15, 2026
diff --git a/src/qwenpaw/app/channels/sip/__init__.py b/src/qwenpaw/app/channels/sip/__init__.py
diff --git a/src/qwenpaw/app/channels/sip/backend.py b/src/qwenpaw/app/channels/sip/backend.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+"""SipBackend Protocol -- abstraction for Dev / Production.
+
+* **PyVoIPBackend** (``sip_mode="dev"``)
+* **LiveKitBackend** (``sip_mode="livekit"``)
+"""
+from __future__ import annotations
+
+import asyncio
+from typing import (
+    Any,
+    Callable,
+    Coroutine,
+    Optional,
+    Protocol,
+    runtime_checkable,
+)
+
+# Result type shared by all callbacks: a coroutine that resolves to None.
+_AsyncNone = Coroutine[Any, Any, None]
+# Callable taking one ``bytes`` argument and returning such a coroutine.
+_SendBytes = Callable[[bytes], _AsyncNone]
+
+# Takes three strings, an ``asyncio.Queue`` and a ``bytes`` sender.
+# NOTE(review): presumably call id / caller / callee, an inbound-audio queue
+# and an outbound-audio sender -- confirm against the backends.
+IncomingCallCallback = Callable[
+    [str, str, str, asyncio.Queue, _SendBytes],
+    _AsyncNone,
+]
+
+# Takes one string. NOTE(review): presumably the call id -- confirm.
+CallEndedCallback = Callable[[str], _AsyncNone]
+
+
+@runtime_checkable
+class SipBackend(Protocol):
+    """Structural interface for a pluggable SIP/RTP backend.
+
+    ``runtime_checkable`` lets callers test implementations with isinstance.
+    """
+
+    # Async hooks; either may be None (presumably: no handler attached).
+    on_incoming_call: Optional[IncomingCallCallback]
+    on_call_ended: Optional[CallEndedCallback]
+
+    async def start(self) -> None:
+        """Bring the backend up."""
+
+    async def stop(self) -> None:
+        """Shut the backend down."""
+
+    async def play_audio(self, call_id: str, audio: bytes) -> None:
+        """Send raw PCM ``audio`` to the caller on call ``call_id``."""
diff --git a/src/qwenpaw/app/channels/sip/fake_stt_tts.py b/src/qwenpaw/app/channels/sip/fake_stt_tts.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+"""Fake STT/TTS for E2E testing -- no external API.
+
+FakeSTTStream: emits the next scripted transcript every N frames.
+fake_tts_wav(): returns a minimal valid WAV tone.
+
+Env vars:
+  FAKE_STT_SCRIPT  -- comma-separated sentences
+  FAKE_STT_FRAMES  -- frames per utterance (default 50)
+"""
+from __future__ import annotations
+
+import asyncio
+import io
+import logging
+import math
+import os
+import struct
+import wave
+from typing import Awaitable, Callable, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class FakeSTTStream:
+    """Scripted STT engine for tests: one sentence per N fed chunks."""
+
+    def __init__(self) -> None:
+        """Read env config; non-integer FAKE_STT_FRAMES raises ValueError."""
+        raw = os.environ.get(
+            "FAKE_STT_SCRIPT",
+            "\u4f60\u597d,"
+            "\u4eca\u5929\u5929\u6c14\u600e\u4e48\u6837,"
+            "\u518d\u89c1",
+        )
+        self._script = [s.strip() for s in raw.split(",") if s.strip()]
+        frames = int(os.environ.get("FAKE_STT_FRAMES", "50"))
+        # Clamp to >= 1 so the modulo in feed_audio() never divides by zero.
+        self._frames_per_utterance = max(1, frames)
+        self._frame_count = 0
+        self._utterance_idx = 0
+        self.on_transcript: Optional[Callable[[str], Awaitable[None]]] = None
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+    async def start(self) -> None:
+        """Record the running event loop and log the configuration."""
+        self._loop = asyncio.get_running_loop()
+        logger.info(
+            "FakeSTT started: script=%s, fpu=%d",
+            self._script,
+            self._frames_per_utterance,
+        )
+
+    async def feed_audio(self, _chunk: bytes) -> None:
+        """Count one chunk (content ignored); emit on every Nth chunk."""
+        self._frame_count += 1
+        if self._frame_count % self._frames_per_utterance:
+            return
+        if self._utterance_idx >= len(self._script):
+            return  # script exhausted: keep counting, emit nothing
+        text = self._script[self._utterance_idx]
+        self._utterance_idx += 1
+        logger.info("FakeSTT transcript #%d: %s", self._utterance_idx, text)
+        if self.on_transcript is not None:
+            await self.on_transcript(text)
+
+    async def stop(self) -> None:
+        """Log how many chunks were fed and sentences emitted."""
+        logger.info(
+            "FakeSTT stopped: %d frames, %d utts",
+            self._frame_count,
+            self._utterance_idx,
+        )
+
+
+def fake_tts_wav(
+    _text: str,
+    duration_ms: int = 200,
+    sample_rate: int = 8000,
+) -> bytes:
+    """Render a 440 Hz sine as mono 16-bit WAV bytes; ``_text`` is ignored.
+
+    Produces ``sample_rate * duration_ms // 1000`` little-endian samples.
+    """
+    amplitudes = [
+        int(16000 * math.sin(2 * math.pi * 440 * idx / sample_rate))
+        for idx in range(sample_rate * duration_ms // 1000)
+    ]
+    payload = struct.pack(f"<{len(amplitudes)}h", *amplitudes)
+
+    sink = io.BytesIO()
+    with wave.open(sink, "wb") as writer:
+        writer.setnchannels(1)
+        writer.setsampwidth(2)
+        writer.setframerate(sample_rate)
+        writer.writeframes(payload)
+    return sink.getvalue()