Prospector/test_speech.py at main · fuseai-fellowship/Prospector · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import tempfile
import soundfile as sf
from pathlib import Path
from src.utils.speech_service import SpeechService


def test_tts_asr_roundtrip():
    """Round-trip a sentence through text-to-speech and back through speech-to-text.

    Synthesizes a fixed sentence with ``SpeechService.text_to_speech``, saves
    the audio to the system temp directory for manual inspection, transcribes
    the same in-memory bytes with ``SpeechService.transcribe_audio`` and prints
    how closely the transcription matches the original text.

    Raises:
        AssertionError: If the transcription comes back empty. An inexact
            transcription is only reported as a warning, because ASR output
            is not expected to be a perfect match.
    """
    # The cache directory must be the one containing your Vosk model.
    service = SpeechService(cache_dir="speech_models", preload_voices=["af_bella"])

    text = "Hello there, how are you today?"
    print(f"\n🗣️ Original text: {text}")

    # ---- TEXT TO SPEECH ----
    wav_bytes = service.text_to_speech(text, voice="af_bella")

    # The file is written only so a human can listen to the result; the
    # transcription below works on the in-memory bytes, not on this file.
    output_path = Path(tempfile.gettempdir()) / "kokoro_test.wav"
    output_path.write_bytes(wav_bytes)
    print(f"💾 Saved synthesized audio to: {output_path}")

    # ---- SPEECH TO TEXT ----
    print("🎧 Transcribing generated audio...")
    # NOTE(review): the original author states transcribe_audio resamples
    # 24kHz -> 16kHz internally; confirm against SpeechService.
    result_text = service.transcribe_audio(wav_bytes)
    print(f"📝 Transcribed text: {result_text}")

    # ---- Simple Accuracy Check ----
    # Vosk often drops punctuation and capitalization, so compare normalized text.
    normalized_original = text.lower().replace(",", "").replace("?", "").strip()
    normalized_result = result_text.strip().lower()

    # An empty transcription is a complete failure, so it must fail the test
    # instead of only being printed (otherwise this test can never fail).
    if not normalized_result:
        print("❌ Transcription returned empty text!")
        raise AssertionError("Transcription returned empty text")

    # Compare whole words, not substrings: a plain `word in some_string` test
    # would wrongly accept "are" inside "care". Punctuation on the edges of a
    # transcribed token is stripped so "today?" still counts as "today".
    expected_words = normalized_original.split()
    heard_words = {token.strip(".,?!") for token in normalized_result.split()}

    # We expect a high degree of match, but not 100% due to ASR imperfections.
    if normalized_result == normalized_original:
        print("✅ Transcription succeeded (exact match).")
    elif all(word in heard_words for word in expected_words):
        print("✅ Transcription succeeded (all words found).")
    else:
        print(
            f"⚠️ Transcription mismatch! Expected '{normalized_original}', Got '{normalized_result}'. This is often due to low ASR model accuracy or voice differences."
        )


# Allow running the round-trip check directly as a script, without a test runner.
if __name__ == "__main__":
    test_tts_asr_roundtrip()