
Commit 22fcf61

feat: disable vad

1 parent 407e81f

File tree

3 files changed: 148 additions & 61 deletions

apps/whisper-api/README.md

Lines changed: 6 additions & 0 deletions
@@ -90,6 +90,12 @@ export RUST_LOG=debug
 
 # Force CPU usage
 export CANDLE_FORCE_CPU=1
+
+# Disable VAD (Voice Activity Detection) - process entire audio directly
+export DISABLE_VAD=true
+
+# Set VAD threshold (0.0-1.0, default: 0.15)
+export VAD_THRESHOLD=0.2
 ```
 
 ## Acknowledgements

apps/whisper-api/src/main.rs

Lines changed: 15 additions & 5 deletions
@@ -20,16 +20,16 @@ use crate::{
 
 mod api;
 mod audio_manager;
+mod huggingface;
 mod router;
 mod vad;
 mod whisper;
-mod huggingface;
-
 
 // Application state with dynamic model loading
 struct AppState {
     vad: Arc<Mutex<VADProcessor>>,
     device: Device,
+    vad_enabled: bool,
     // Use RwLock for read-heavy workload (checking cache)
     whisper_models: Arc<RwLock<HashMap<String, Arc<Mutex<WhisperProcessor>>>>>,
 }
@@ -50,20 +50,30 @@ impl AppState {
 
         println!("🚀 Using device: {device:?}");
 
-        // Get VAD threshold from environment or use default
+        // Check if VAD is enabled
+        let vad_enabled = std::env::var("DISABLE_VAD")
+            .map(|s| s.to_lowercase() != "true" && s != "1")
+            .unwrap_or(true);
+
+        println!("🎯 VAD enabled: {vad_enabled}");
+
+        // Get VAD threshold from environment or use default (lowered for better detection)
         let vad_threshold = std::env::var("VAD_THRESHOLD")
            .ok()
            .and_then(|s| s.parse().ok())
-           .unwrap_or(0.3);
+           .unwrap_or(0.15);
 
-        println!("🎯 VAD threshold: {vad_threshold}");
+        if vad_enabled {
+            println!("🎯 VAD threshold: {vad_threshold}");
+        }
 
         // Initialize VAD processor (always use CPU for VAD)
         let vad = VADProcessor::new(candle_core::Device::Cpu, vad_threshold)?;
 
         Ok(Self {
             vad: Arc::new(Mutex::new(vad)),
             device,
+            vad_enabled,
             whisper_models: Arc::new(RwLock::new(HashMap::new())),
         })
     }
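
Note (not part of the commit): the DISABLE_VAD parsing above treats "true" case-insensitively and also accepts "1", while every other value, including "false" and "0", leaves VAD enabled, as does an unset variable. A minimal sketch of those semantics, with vad_enabled_from as a hypothetical helper:

// Minimal sketch of the DISABLE_VAD parsing above; `vad_enabled_from` is a
// hypothetical helper, not a function in this codebase.
fn vad_enabled_from(value: Option<&str>) -> bool {
    value
        .map(|s| s.to_lowercase() != "true" && s != "1")
        .unwrap_or(true)
}

fn main() {
    assert!(!vad_enabled_from(Some("true")));  // DISABLE_VAD=true -> VAD off
    assert!(!vad_enabled_from(Some("TRUE")));  // case-insensitive -> VAD off
    assert!(!vad_enabled_from(Some("1")));     // DISABLE_VAD=1    -> VAD off
    assert!(vad_enabled_from(Some("false")));  // any other value  -> VAD on
    assert!(vad_enabled_from(Some("0")));      // note: "0" does NOT disable VAD
    assert!(vad_enabled_from(None));           // unset            -> VAD on
}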

apps/whisper-api/src/router.rs

Lines changed: 127 additions & 56 deletions
@@ -1,5 +1,6 @@
 use std::{
     collections::HashMap,
+    pin::Pin,
     sync::Arc,
     time::{Duration, Instant},
 };
@@ -15,7 +16,7 @@ use axum::{
         sse::{Event, KeepAlive, Sse},
     },
 };
-use futures::stream::{self, Stream};
+use futures::stream::{self, Stream, StreamExt};
 use symphonia::{
     core::{
         audio::{AudioBufferRef, Signal},
@@ -266,16 +267,31 @@ async fn transcribe_audio_complete(
     let whisper_processor = state.get_whisper_processor(&model_name).await?;
     processing_stats.model_loading_duration = model_loading_start.elapsed();
 
+    // Check if VAD is enabled
+    if !state.vad_enabled {
+        println!("🔇 VAD disabled, processing entire audio directly");
+        let whisper_start = Instant::now();
+        let mut whisper = whisper_processor.lock().await;
+        let transcript = whisper.transcribe(&audio_data)?;
+        processing_stats.whisper_transcription_duration = whisper_start.elapsed();
+        processing_stats.vad_processing_duration = Duration::ZERO;
+
+        return Ok(transcript);
+    }
+
     // Process audio through VAD and Whisper
     let mut vad = state.vad.lock().await;
     let mut whisper = whisper_processor.lock().await;
-    let mut audio_buffer = AudioBuffer::new(10000, 100, 500, sample_rate);
+    // Use more lenient parameters: max_duration=10s, min_speech=50ms, min_silence=300ms
+    let mut audio_buffer = AudioBuffer::new(10000, 50, 300, sample_rate);
 
     let mut transcripts = Vec::new();
     let mut frame_buffer = Vec::<f32>::new();
 
     let vad_start = Instant::now();
     let mut whisper_total_time = Duration::ZERO;
+    let mut speech_frame_count = 0;
+    let mut total_frame_count = 0;
 
     // Process in chunks
     for chunk in audio_data.chunks(1024) {
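
Note (not part of the commit): AudioBuffer's internals are not shown in this diff; assuming the units named in the commit's comment (milliseconds, at the 16 kHz sample rate used in this file), the relaxed parameters translate to sample counts as in this rough sketch:

// Rough sketch: the commit's AudioBuffer parameters converted to sample
// counts, assuming millisecond units at 16 kHz (the buffer's internals are
// not shown in this diff).
fn main() {
    let sample_rate: u32 = 16_000;
    let ms_to_samples = |ms: u32| ms * sample_rate / 1000;

    assert_eq!(ms_to_samples(10_000), 160_000); // max_duration: cap a segment at 10 s
    assert_eq!(ms_to_samples(50), 800);         // min_speech: 50 ms (was 100 ms)
    assert_eq!(ms_to_samples(300), 4_800);      // min_silence: 300 ms (was 500 ms)
}

Lowering min_speech keeps shorter utterances, and lowering min_silence closes segments sooner, which pairs with the whole-audio fallback added below.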
@@ -287,6 +303,11 @@
         let speech_prob = vad.process_chunk(&frame)?;
         let is_speech = vad.is_speech(speech_prob);
 
+        total_frame_count += 1;
+        if is_speech {
+            speech_frame_count += 1;
+        }
+
         if let Some(complete_audio) = audio_buffer.add_chunk(&frame, is_speech) {
             // Measure Whisper transcription time
             let whisper_start = Instant::now();
@@ -300,9 +321,24 @@
         }
     }
 
+    println!("🎯 VAD Stats: {speech_frame_count}/{total_frame_count} frames detected as speech ({:.1}%)", speech_frame_count as f32 / total_frame_count as f32 * 100.0);
+
     processing_stats.vad_processing_duration = vad_start.elapsed() - whisper_total_time;
     processing_stats.whisper_transcription_duration = whisper_total_time;
 
+    // Fallback: If no segments were detected by VAD, process the entire audio
+    if transcripts.is_empty() && !audio_data.is_empty() {
+        println!("⚠️ No VAD segments detected, processing entire audio as fallback");
+        let whisper_start = Instant::now();
+        let transcript = whisper.transcribe(&audio_data)?;
+        let whisper_fallback_time = whisper_start.elapsed();
+        processing_stats.whisper_transcription_duration += whisper_fallback_time;
+
+        if !transcript.trim().is_empty() && !transcript.contains("[BLANK_AUDIO]") {
+            transcripts.push(transcript.trim().to_string());
+        }
+    }
+
     Ok(transcripts.join(" "))
 }
 
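Note (not part of the commit): the non-streaming path now has three outcomes: VAD disabled, per-segment transcription, and the whole-clip fallback. A distilled sketch of that control flow, where whisper and vad_segments are hypothetical stand-ins for the locked processors above:

// Distilled control flow of transcribe_audio_complete after this commit;
// `whisper` and `vad_segments` are hypothetical stand-ins, not real APIs here.
fn transcribe_complete(audio: &[f32], vad_enabled: bool) -> String {
    // 1. VAD disabled: skip segmentation and transcribe the whole clip.
    if !vad_enabled {
        return whisper(audio);
    }
    // 2. VAD enabled: transcribe each detected speech segment.
    let mut parts: Vec<String> = vad_segments(audio)
        .iter()
        .map(|seg| whisper(seg).trim().to_string())
        .filter(|t| !t.is_empty() && !t.contains("[BLANK_AUDIO]"))
        .collect();
    // 3. Fallback: VAD found no segments, so transcribe the whole clip once.
    if parts.is_empty() && !audio.is_empty() {
        let t = whisper(audio);
        if !t.trim().is_empty() && !t.contains("[BLANK_AUDIO]") {
            parts.push(t.trim().to_string());
        }
    }
    parts.join(" ")
}

// Hypothetical stubs so the sketch compiles.
fn whisper(_audio: &[f32]) -> String { String::new() }
fn vad_segments(_audio: &[f32]) -> Vec<Vec<f32>> { Vec::new() }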
@@ -312,7 +348,7 @@ async fn create_transcription_stream(
     model_name: String, // Change to owned String
     audio_data: Vec<f32>,
     mut processing_stats: ProcessingStats,
-) -> Result<impl Stream<Item = Result<Event, anyhow::Error>>, (StatusCode, Json<ErrorResponse>)> {
+) -> Result<Pin<Box<dyn Stream<Item = Result<Event, anyhow::Error>> + Send>>, (StatusCode, Json<ErrorResponse>)> {
     let stream_start = Instant::now();
 
     // Get the appropriate Whisper processor for this model with timing
@@ -337,68 +373,103 @@
 
     let sample_rate = 16000;
 
-    Ok(stream::unfold((state, whisper_processor, audio_data, 0, AudioBuffer::new(10000, 100, 500, sample_rate), processing_stats, stream_start), move |(state, whisper_processor, audio_data, mut processed, mut audio_buffer, mut stats, stream_start)| async move {
-        if processed >= audio_data.len() {
-            // Print final statistics for streaming
-            stats.total_duration = stream_start.elapsed();
-            stats.print_summary();
-            return None;
-        }
+    // If VAD is disabled, process entire audio directly and return as single stream event
+    if !state.vad_enabled {
+        println!("🔇 VAD disabled for streaming, processing entire audio directly");
+        let whisper_start = Instant::now();
+        let mut whisper = whisper_processor.lock().await;
+        let transcript = match whisper.transcribe(&audio_data) {
+            Ok(text) => text,
+            Err(e) => {
+                return Err((
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(ErrorResponse {
+                        error: ErrorDetail {
+                            message: format!("Transcription failed: {e}"),
+                            error_type: "server_error".to_string(),
+                            param: None,
+                            code: None,
+                        },
+                    }),
+                ));
+            },
+        };
+        processing_stats.whisper_transcription_duration = whisper_start.elapsed();
+        processing_stats.vad_processing_duration = Duration::ZERO;
+        processing_stats.total_duration = stream_start.elapsed();
+        processing_stats.print_summary();
 
-        // Process audio in chunks suitable for VAD (512 samples at a time)
-        let chunk_size = 512.min(audio_data.len() - processed);
-        let chunk = &audio_data[processed..processed + chunk_size];
-        processed += chunk_size;
+        let event_data = StreamChunk { text: transcript, timestamp: Some(audio_data.len() as f64 / f64::from(sample_rate)) };
+        let event = Event::default().json_data(event_data).unwrap();
 
-        // Process through VAD and Whisper processors
-        let mut whisper_result = None;
+        return Ok(stream::once(async move { Ok(event) }).boxed());
+    }
 
-        // Process through VAD
-        let vad_chunk_start = Instant::now();
-        let mut vad = state.vad.lock().await;
-        if let Ok(speech_prob) = vad.process_chunk(chunk) {
-            let is_speech = vad.is_speech(speech_prob);
+    Ok(
+        stream::unfold((state, whisper_processor, audio_data, 0, AudioBuffer::new(10000, 50, 300, sample_rate), processing_stats, stream_start), move |(state, whisper_processor, audio_data, mut processed, mut audio_buffer, mut stats, stream_start)| async move {
+            if processed >= audio_data.len() {
+                // Print final statistics for streaming
+                stats.total_duration = stream_start.elapsed();
+                stats.print_summary();
+                return None;
+            }
 
-            // Add to audio buffer and check if we have complete audio
-            if let Some(complete_audio) = audio_buffer.add_chunk(chunk, is_speech) {
-                // Release VAD lock before acquiring Whisper lock
-                drop(vad);
-                let vad_chunk_time = vad_chunk_start.elapsed();
-                stats.vad_processing_duration += vad_chunk_time;
-
-                // Process complete audio through Whisper
-                let whisper_chunk_start = Instant::now();
-                let mut whisper = whisper_processor.lock().await;
-                if let Ok(transcript) = whisper.transcribe(&complete_audio) {
-                    let whisper_chunk_time = whisper_chunk_start.elapsed();
-                    stats.whisper_transcription_duration += whisper_chunk_time;
-
-                    if !transcript.trim().is_empty() && !transcript.contains("[BLANK_AUDIO]") {
-                        whisper_result = Some(transcript.trim().to_string());
-                        println!("🎯 Chunk transcribed in {:.2}ms: \"{}\"", whisper_chunk_time.as_secs_f64() * 1000.0, transcript.trim());
+            // Process audio in chunks suitable for VAD (512 samples at a time)
+            let chunk_size = 512.min(audio_data.len() - processed);
+            let chunk = &audio_data[processed..processed + chunk_size];
+            processed += chunk_size;
+
+            // Process through VAD and Whisper processors
+            let mut whisper_result = None;
+
+            // Process through VAD
+            let vad_chunk_start = Instant::now();
+            let mut vad = state.vad.lock().await;
+            if let Ok(speech_prob) = vad.process_chunk(chunk) {
+                let is_speech = vad.is_speech(speech_prob);
+
+                // Add to audio buffer and check if we have complete audio
+                if let Some(complete_audio) = audio_buffer.add_chunk(chunk, is_speech) {
+                    // Release VAD lock before acquiring Whisper lock
+                    drop(vad);
+                    let vad_chunk_time = vad_chunk_start.elapsed();
+                    stats.vad_processing_duration += vad_chunk_time;
+
+                    // Process complete audio through Whisper
+                    let whisper_chunk_start = Instant::now();
+                    let mut whisper = whisper_processor.lock().await;
+                    if let Ok(transcript) = whisper.transcribe(&complete_audio) {
+                        let whisper_chunk_time = whisper_chunk_start.elapsed();
+                        stats.whisper_transcription_duration += whisper_chunk_time;
+
+                        if !transcript.trim().is_empty() && !transcript.contains("[BLANK_AUDIO]") {
+                            whisper_result = Some(transcript.trim().to_string());
+                            println!("🎯 Chunk transcribed in {:.2}ms: \"{}\"", whisper_chunk_time.as_secs_f64() * 1000.0, transcript.trim());
+                        }
                     }
                 }
+            } else {
+                stats.vad_processing_duration += vad_chunk_start.elapsed();
             }
-        } else {
-            stats.vad_processing_duration += vad_chunk_start.elapsed();
-        }
 
-        // Create event with actual transcription or progress update
-        #[allow(clippy::option_if_let_else)]
-        let event_data = if let Some(transcript) = whisper_result {
-            #[allow(clippy::cast_precision_loss)]
-            StreamChunk { text: transcript, timestamp: Some(processed as f64 / f64::from(sample_rate)) }
-        } else {
-            StreamChunk {
-                #[allow(clippy::cast_precision_loss)]
-                text: format!("Processing... ({:.1}%)", (processed as f64 / audio_data.len() as f64) * 100.0),
+            // Create event with actual transcription or progress update
+            #[allow(clippy::option_if_let_else)]
+            let event_data = if let Some(transcript) = whisper_result {
                 #[allow(clippy::cast_precision_loss)]
-                timestamp: Some(processed as f64 / f64::from(sample_rate)),
-            }
-        };
+                StreamChunk { text: transcript, timestamp: Some(processed as f64 / f64::from(sample_rate)) }
+            } else {
+                StreamChunk {
+                    #[allow(clippy::cast_precision_loss)]
+                    text: format!("Processing... ({:.1}%)", (processed as f64 / audio_data.len() as f64) * 100.0),
+                    #[allow(clippy::cast_precision_loss)]
+                    timestamp: Some(processed as f64 / f64::from(sample_rate)),
+                }
+            };
 
-        let event = Event::default().json_data(event_data).unwrap();
+            let event = Event::default().json_data(event_data).unwrap();
 
-        Some((Ok(event), (state.clone(), whisper_processor.clone(), audio_data, processed, audio_buffer, stats, stream_start)))
-    }))
+            Some((Ok(event), (state.clone(), whisper_processor.clone(), audio_data, processed, audio_buffer, stats, stream_start)))
+        })
+        .boxed(),
+    )
 }
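
Note (not part of the commit): the return-type change above is forced by the new early return. stream::once(...) and stream::unfold(...) are different concrete types, so impl Stream can no longer describe both branches; boxing each with StreamExt::boxed yields the common Pin<Box<dyn Stream + Send>> type, which is also why pin::Pin and StreamExt are newly imported. A minimal sketch of the same pattern:

use std::pin::Pin;
use futures::stream::{self, Stream, StreamExt};

// Minimal sketch: two branches produce different concrete Stream types, so
// both are boxed into a single trait-object type via .boxed().
fn numbers(single: bool) -> Pin<Box<dyn Stream<Item = u32> + Send>> {
    if single {
        // One concrete type: a one-item stream.
        stream::once(async { 42 }).boxed()
    } else {
        // A different concrete type: an unfold-based stream; .boxed() unifies them.
        stream::unfold(0u32, |n| async move { (n < 3).then(|| (n, n + 1)) }).boxed()
    }
}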
