1 change: 1 addition & 0 deletions comps/asr/src/requirements.txt
@@ -3,6 +3,7 @@ aiohttp
datasets
docarray[full]
fastapi
faster-whisper
mcp
opentelemetry-api
opentelemetry-exporter-otlp
97 changes: 97 additions & 0 deletions comps/third_parties/whisper/src/faster_whisper_model.py
@@ -0,0 +1,97 @@
# Copyright (C) 2025 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os
import uuid
import wave

from fastapi import WebSocket
from faster_whisper import WhisperModel

BYTES_PER_SAMPLE = 2 # 16-bit audio
SAMPLE_RATE = 16000 # Hz


class FasterWhisperModel:
"""Leverage FasterWhisper to do realtime transcription on CPU."""

def __init__(self, model_size_or_path="tiny", device="cpu", compute_type="int8") -> None:
self.model = WhisperModel(
model_size_or_path=model_size_or_path, device=device, compute_type=compute_type, local_files_only=False
)
self.model_size_or_path = model_size_or_path

async def audio2text_streaming(
self,
websocket: WebSocket,
audio_data: bytes,
item_id: int,
event_id: str,
vad_filter: bool = True,
beam_size: int = 5,
language: str = "en",
min_silence_duration_ms: int = 500,
is_final: bool = False,
):
"""Process the audio data chunk."""
if not audio_data:
return
        # accumulate the full transcription text across segments
transcription = ""

# generate a temporary file name
uid = str(uuid.uuid4())
temp_filename = f"{uid}.wav"

try:
# create a WAV file
with wave.open(temp_filename, "wb") as wav_file:
wav_file.setnchannels(1) # single channel
wav_file.setsampwidth(BYTES_PER_SAMPLE) # 16-bit
wav_file.setframerate(SAMPLE_RATE)
wav_file.writeframes(audio_data)

# use faster-whisper for speech recognition
segments, _ = self.model.transcribe(
temp_filename,
language=None if language == "auto" else language,
beam_size=beam_size,
vad_filter=vad_filter,
vad_parameters=dict(min_silence_duration_ms=min_silence_duration_ms),
)

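            # transcribe() returns a lazy generator; materializing it here runs the
            # full decode before any delta event is sent over the WebSocket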
segments = list(segments)
            for i, segment in enumerate(segments):
print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
response = {
"content_index": i,
"type": "conversation.item.input_audio_transcription.delta",
"event_id": event_id,
"item_id": "item_" + f"{item_id:03}",
"delta": segment.text,
}
transcription += segment.text
await websocket.send_json(response)

# if it is the last chunk of data, send the completion signal
if is_final:
# Send completion signal for streaming format
await websocket.send_json(
{
"content_index": 0,
"type": "conversation.item.input_audio_transcription.completion",
"event_id": event_id,
"item_id": "item_" + f"{item_id:03}",
"transcript": transcription,
}
)
transcription = ""

except Exception as e:
print(f"Error in transcription: {e}")
await websocket.send_json({"status": "error", "error": str(e)})
finally:
# clean up the temporary file
if os.path.exists(temp_filename):
os.remove(temp_filename)