|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Extract voice data for on-device CosyVoice 3 using ONNX Runtime (CPU, no GPU). |
| 4 | +
|
| 5 | +Extracts from a reference audio clip: |
| 6 | + - speaker_embedding.bin (192 x float32) |
| 7 | + - prompt_tokens.bin (N x int64) |
| 8 | + - prompt_mel.bin (int32 header + frames*80 x float32) |
| 9 | + - prompt_text.txt (UTF-8 transcript) |
| 10 | +
|
| 11 | +Usage: |
| 12 | + python extract_voice_onnx.py \ |
| 13 | + --audio prompts/en_female_nova_greeting.wav \ |
| 14 | + --text "Hello, my name is Sarah." \ |
| 15 | + --voice-id test_female \ |
| 16 | + --models-dir models \ |
| 17 | + --output-dir ../App/Resources/voices/ |
| 18 | +""" |
| 19 | + |
| 20 | +import argparse |
| 21 | +import struct |
| 22 | +import sys |
| 23 | +from pathlib import Path |
| 24 | + |
| 25 | +import numpy as np |
| 26 | +import librosa |
| 27 | +import onnxruntime as ort |
| 28 | + |
| 29 | + |
def extract_speaker_embedding(campplus_session: ort.InferenceSession, audio_path: str) -> np.ndarray:
    """Run the CAMPPlus ONNX model on a reference clip and return its 192-dim speaker embedding."""
    # CAMPPlus operates on 16 kHz audio.
    waveform, _ = librosa.load(audio_path, sr=16000)
    waveform = waveform.astype(np.float32)

    # 80-band mel spectrogram approximating Kaldi fbank framing
    # (25 ms window / 10 ms hop at 16 kHz -> n_fft=400, hop_length=160).
    # NOTE(review): librosa's Slaney mel scale and power spectrum differ
    # slightly from Kaldi fbank -- confirm this matches the model's training frontend.
    spec = librosa.feature.melspectrogram(
        y=waveform, sr=16000, n_fft=400, hop_length=160,
        n_mels=80, fmin=20, fmax=7600
    )
    features = np.log(np.maximum(spec, 1e-10)).T  # -> [frames, 80]

    # Mean normalization across time (Kaldi-style CMN).
    features -= features.mean(axis=0, keepdims=True)

    # The model expects a batch dimension: [1, frames, 80].
    batch = features[np.newaxis].astype(np.float32)

    feed_name = campplus_session.get_inputs()[0].name
    outputs = campplus_session.run(None, {feed_name: batch})

    # First output holds the embedding; flatten e.g. [1, 192] -> [192].
    return outputs[0].flatten().astype(np.float32)
| 55 | + |
| 56 | + |
def extract_speech_tokens(tokenizer_session: ort.InferenceSession, audio_path: str) -> np.ndarray:
    """Run the Speech Tokenizer v3 ONNX model and return the prompt's speech tokens as int64."""
    # The tokenizer front-end works on 16 kHz audio.
    waveform, _ = librosa.load(audio_path, sr=16000)
    waveform = waveform.astype(np.float32)

    # 128-band mel spectrogram with Whisper-style framing
    # (25 ms window / 10 ms hop at 16 kHz).
    spec = librosa.feature.melspectrogram(
        y=waveform, sr=16000, n_fft=400, hop_length=160,
        n_mels=128, fmin=0, fmax=8000
    )

    # Whisper-style normalization: log10, clamp the dynamic range to
    # 8 units below the global peak, then rescale toward [-1, 1].
    log_spec = np.log10(np.maximum(spec, 1e-10))
    log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
    log_spec = (log_spec + 4.0) / 4.0

    # Model inputs: features as [1, 128, frames] plus the frame count.
    batch = log_spec[np.newaxis].astype(np.float32)
    batch_len = np.array([batch.shape[2]], dtype=np.int32)

    names = [inp.name for inp in tokenizer_session.get_inputs()]
    feeds = {names[0]: batch, names[1]: batch_len}
    tokens = tokenizer_session.run(None, feeds)[0]

    # Flatten whatever batch shape comes back into a 1-D int64 id sequence.
    return tokens.flatten().astype(np.int64)
| 84 | + |
| 85 | + |
def extract_prompt_mel(audio_path: str) -> tuple[np.ndarray, int]:
    """Compute the flow-conditioning mel spectrogram of the prompt audio.

    Returns a (flat_mel, frame_count) pair, where flat_mel is the row-major
    [frames, 80] float32 matrix flattened to frames*80 values.
    """
    # The flow model runs at CosyVoice's native 24 kHz rate.
    waveform, _ = librosa.load(audio_path, sr=24000)
    waveform = waveform.astype(np.float32)

    # CosyVoice flow front-end parameters: 80 mels, 1024-point FFT,
    # 256-sample hop, full 0-12 kHz band.
    spec = librosa.feature.melspectrogram(
        y=waveform, sr=24000, n_fft=1024, hop_length=256,
        n_mels=80, fmin=0, fmax=12000
    )
    log_spec = np.log(np.maximum(spec, 1e-10))

    # Transpose to [frames, 80] so flattening yields frame-major order.
    frames = log_spec.T.astype(np.float32)
    return frames.flatten(), frames.shape[0]
| 104 | + |
| 105 | + |
def save_voice_data(
    output_dir: Path,
    speaker_embedding: np.ndarray,
    prompt_tokens: np.ndarray,
    prompt_mel_flat: np.ndarray,
    prompt_mel_frames: int,
    prompt_text: str,
):
    """Save extracted voice data in the binary format expected by OnDeviceTTSEngine.

    On-disk layout:
      - speaker_embedding.bin: raw float32 values
      - prompt_tokens.bin:     raw int64 token ids
      - prompt_mel.bin:        little-endian int32 frame count, then frames*80 float32
      - prompt_text.txt:       UTF-8 transcript
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Coerce dtypes before writing: ndarray.tofile dumps raw memory, so a
    # caller passing e.g. float64 or int32 arrays would silently corrupt the
    # binary format the reader expects. (No-op when dtypes already match.)
    # NOTE(review): tofile writes native byte order; the int32 header is
    # explicitly little-endian -- confirm target devices are little-endian.
    speaker_embedding = np.ascontiguousarray(speaker_embedding, dtype=np.float32)
    prompt_tokens = np.ascontiguousarray(prompt_tokens, dtype=np.int64)
    prompt_mel_flat = np.ascontiguousarray(prompt_mel_flat, dtype=np.float32)

    # speaker_embedding.bin: raw float32, 192 elements
    emb_path = output_dir / "speaker_embedding.bin"
    speaker_embedding.tofile(str(emb_path))
    print(f" speaker_embedding.bin: shape=({speaker_embedding.shape[0]},) size={emb_path.stat().st_size} bytes")

    # prompt_tokens.bin: raw int64
    tok_path = output_dir / "prompt_tokens.bin"
    prompt_tokens.tofile(str(tok_path))
    print(f" prompt_tokens.bin: shape=({prompt_tokens.shape[0]},) size={tok_path.stat().st_size} bytes")

    # prompt_mel.bin: int32 frame count header + float32 data
    mel_path = output_dir / "prompt_mel.bin"
    with open(mel_path, "wb") as f:
        f.write(struct.pack("<i", prompt_mel_frames))
        prompt_mel_flat.tofile(f)
    print(f" prompt_mel.bin: frames={prompt_mel_frames} size={mel_path.stat().st_size} bytes")

    # prompt_text.txt: UTF-8
    text_path = output_dir / "prompt_text.txt"
    text_path.write_text(prompt_text, encoding="utf-8")
    print(f" prompt_text.txt: {len(prompt_text)} chars")
| 138 | + |
| 139 | + |
def main():
    """CLI entry point: parse arguments, run the three extraction stages, write outputs."""
    parser = argparse.ArgumentParser(description="Extract voice data using ONNX Runtime (CPU)")
    parser.add_argument("--audio", required=True, help="Path to prompt audio WAV file")
    parser.add_argument("--text", required=True, help="Transcript of the prompt audio")
    parser.add_argument("--voice-id", required=True, help="Voice ID (e.g., longjiaxin_v3)")
    parser.add_argument("--models-dir", default="models", help="Directory with campplus.onnx and speech_tokenizer_v3.onnx")
    parser.add_argument("--output-dir", default="../App/Resources/voices", help="Output base directory")
    args = parser.parse_args()

    model_root = Path(args.models_dir)
    campplus_path = model_root / "campplus.onnx"
    tokenizer_path = model_root / "speech_tokenizer_v3.onnx"

    # Both ONNX models must exist before any work starts.
    for model_path in (campplus_path, tokenizer_path):
        if not model_path.exists():
            print(f"ERROR: {model_path} not found. Download from HuggingFace first.")
            sys.exit(1)

    print("Loading ONNX models...")
    campplus = ort.InferenceSession(str(campplus_path), providers=["CPUExecutionProvider"])
    tokenizer = ort.InferenceSession(str(tokenizer_path), providers=["CPUExecutionProvider"])

    print(f"\nExtracting voice data for '{args.voice_id}' from: {args.audio}")

    # Stage 1: global voice identity vector.
    print("\n1. Extracting speaker embedding (CAMPPlus)...")
    embedding = extract_speaker_embedding(campplus, args.audio)
    print(f" Shape: ({embedding.shape[0]},) | Min: {embedding.min():.4f} Max: {embedding.max():.4f}")

    # Stage 2: discrete speech tokens of the prompt.
    print("\n2. Extracting speech tokens (Speech Tokenizer v3)...")
    tokens = extract_speech_tokens(tokenizer, args.audio)
    print(f" Shape: ({tokens.shape[0]},) | Token range: [{tokens.min()}, {tokens.max()}]")

    # Stage 3: mel conditioning for the flow model.
    print("\n3. Extracting prompt mel spectrogram...")
    mel_flat, mel_frames = extract_prompt_mel(args.audio)
    print(f" Frames: {mel_frames} | Total floats: {mel_flat.shape[0]}")

    voice_dir = Path(args.output_dir) / args.voice_id
    print(f"\nSaving to: {voice_dir}")
    save_voice_data(voice_dir, embedding, tokens, mel_flat, mel_frames, args.text)

    print(f"\nDone! Voice data for '{args.voice_id}' saved.")
| 183 | + |
| 184 | + |
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
0 commit comments