@@ -1,5 +1,6 @@
 use std::{
     collections::HashMap,
+    pin::Pin,
     sync::Arc,
     time::{Duration, Instant},
 };
@@ -15,7 +16,7 @@ use axum::{
         sse::{Event, KeepAlive, Sse},
     },
 };
-use futures::stream::{self, Stream};
+use futures::stream::{self, Stream, StreamExt};
 use symphonia::{
     core::{
         audio::{AudioBufferRef, Signal},
@@ -266,16 +267,31 @@ async fn transcribe_audio_complete(
     let whisper_processor = state.get_whisper_processor(&model_name).await?;
     processing_stats.model_loading_duration = model_loading_start.elapsed();

+    // Check if VAD is enabled
+    if !state.vad_enabled {
+        println!("🔇 VAD disabled, processing entire audio directly");
+        let whisper_start = Instant::now();
+        let mut whisper = whisper_processor.lock().await;
+        let transcript = whisper.transcribe(&audio_data)?;
+        processing_stats.whisper_transcription_duration = whisper_start.elapsed();
+        processing_stats.vad_processing_duration = Duration::ZERO;
+
+        return Ok(transcript);
+    }
+
     // Process audio through VAD and Whisper
     let mut vad = state.vad.lock().await;
     let mut whisper = whisper_processor.lock().await;
-    let mut audio_buffer = AudioBuffer::new(10000, 100, 500, sample_rate);
+    // Use more lenient parameters: max_duration=10s, min_speech=50ms, min_silence=300ms
+    let mut audio_buffer = AudioBuffer::new(10000, 50, 300, sample_rate);

     let mut transcripts = Vec::new();
     let mut frame_buffer = Vec::<f32>::new();

     let vad_start = Instant::now();
     let mut whisper_total_time = Duration::ZERO;
+    let mut speech_frame_count = 0;
+    let mut total_frame_count = 0;

     // Process in chunks
     for chunk in audio_data.chunks(1024) {
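The early return above skips VAD entirely when it is disabled; otherwise the segmenter is retuned from 100 ms minimum speech / 500 ms minimum silence down to 50 ms / 300 ms, so shorter utterances close into segments sooner. `AudioBuffer` itself is defined elsewhere in the crate and is not shown in this diff; the sketch below is only a guess at the policy its four parameters drive, written to make the retuning legible. Every name in it is hypothetical.

```rust
/// Hypothetical stand-in for the crate's `AudioBuffer`, illustrating how
/// `new(max_duration_ms, min_speech_ms, min_silence_ms, sample_rate)` could behave.
struct Segmenter {
    samples: Vec<f32>,
    speech_ms: u32,
    silence_ms: u32,
    max_duration_ms: u32,
    min_speech_ms: u32,
    min_silence_ms: u32,
    sample_rate: u32,
}

impl Segmenter {
    /// Emits a finished segment once enough speech has accumulated and a
    /// contiguous run of silence follows it, or when the duration cap is hit.
    fn add_chunk(&mut self, chunk: &[f32], is_speech: bool) -> Option<Vec<f32>> {
        self.samples.extend_from_slice(chunk);
        let chunk_ms = chunk.len() as u32 * 1000 / self.sample_rate;
        if is_speech {
            self.speech_ms += chunk_ms;
            self.silence_ms = 0; // silence must be contiguous to end a segment
        } else {
            self.silence_ms += chunk_ms;
        }
        let buffered_ms = self.samples.len() as u32 * 1000 / self.sample_rate;
        let segment_done =
            self.speech_ms >= self.min_speech_ms && self.silence_ms >= self.min_silence_ms;
        if segment_done || buffered_ms >= self.max_duration_ms {
            self.speech_ms = 0;
            self.silence_ms = 0;
            Some(std::mem::take(&mut self.samples))
        } else {
            None
        }
    }
}
```

Under a policy like this, dropping the silence threshold from 500 ms to 300 ms means a pause of a third of a second is already enough to flush a segment to Whisper, which is the "more lenient" behavior the new comment describes.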
@@ -287,6 +303,11 @@ async fn transcribe_audio_complete(
             let speech_prob = vad.process_chunk(&frame)?;
             let is_speech = vad.is_speech(speech_prob);

+            total_frame_count += 1;
+            if is_speech {
+                speech_frame_count += 1;
+            }
+
             if let Some(complete_audio) = audio_buffer.add_chunk(&frame, is_speech) {
                 // Measure Whisper transcription time
                 let whisper_start = Instant::now();
@@ -300,9 +321,24 @@ async fn transcribe_audio_complete(
         }
     }

+    println!("🎯 VAD Stats: {speech_frame_count}/{total_frame_count} frames detected as speech ({:.1}%)", speech_frame_count as f32 / total_frame_count as f32 * 100.0);
+
     processing_stats.vad_processing_duration = vad_start.elapsed() - whisper_total_time;
     processing_stats.whisper_transcription_duration = whisper_total_time;

+    // Fallback: If no segments were detected by VAD, process the entire audio
+    if transcripts.is_empty() && !audio_data.is_empty() {
+        println!("⚠️ No VAD segments detected, processing entire audio as fallback");
+        let whisper_start = Instant::now();
+        let transcript = whisper.transcribe(&audio_data)?;
+        let whisper_fallback_time = whisper_start.elapsed();
+        processing_stats.whisper_transcription_duration += whisper_fallback_time;
+
+        if !transcript.trim().is_empty() && !transcript.contains("[BLANK_AUDIO]") {
+            transcripts.push(transcript.trim().to_string());
+        }
+    }
+
     Ok(transcripts.join(" "))
 }

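The stats line and the whole-audio fallback are both safety nets against VAD silently dropping everything. One caveat worth noting: if this function were ever handed audio shorter than a single VAD frame, `total_frame_count` would stay at zero and the ratio would print as `NaN`. A guarded variant (my addition, not part of the commit):

```rust
// Avoid printing "NaN%" when no frames were processed at all.
let speech_pct = if total_frame_count == 0 {
    0.0
} else {
    speech_frame_count as f32 / total_frame_count as f32 * 100.0
};
println!("🎯 VAD Stats: {speech_frame_count}/{total_frame_count} frames detected as speech ({speech_pct:.1}%)");
```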
@@ -312,7 +348,7 @@ async fn create_transcription_stream(
     model_name: String, // Change to owned String
     audio_data: Vec<f32>,
     mut processing_stats: ProcessingStats,
-) -> Result<impl Stream<Item = Result<Event, anyhow::Error>>, (StatusCode, Json<ErrorResponse>)> {
+) -> Result<Pin<Box<dyn Stream<Item = Result<Event, anyhow::Error>> + Send>>, (StatusCode, Json<ErrorResponse>)> {
     let stream_start = Instant::now();

     // Get the appropriate Whisper processor for this model with timing
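The signature change is forced by the new early-return path below: the function can now produce either a one-shot `stream::once` or a long-running `stream::unfold`, and those are two different concrete types, which `impl Stream` cannot express. Boxing both behind `Pin<Box<dyn Stream + Send>>` erases the difference; this is what `futures::StreamExt::boxed` returns, and it is why `Pin` and `StreamExt` are imported at the top of the diff. A minimal standalone illustration of the pattern:

```rust
use std::pin::Pin;

use futures::stream::{self, Stream, StreamExt};

// Two branches, two concrete stream types; `.boxed()` unifies them behind
// a single trait object so one return type covers both.
fn numbers(single: bool) -> Pin<Box<dyn Stream<Item = u32> + Send>> {
    if single {
        stream::once(async { 42 }).boxed()
    } else {
        stream::iter(0..3).boxed()
    }
}
```

`futures` also provides the `BoxStream<'static, T>` alias for exactly this type; the diff simply spells it out.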
@@ -337,68 +373,103 @@ async fn create_transcription_stream(

     let sample_rate = 16000;

-    Ok(stream::unfold((state, whisper_processor, audio_data, 0, AudioBuffer::new(10000, 100, 500, sample_rate), processing_stats, stream_start), move |(state, whisper_processor, audio_data, mut processed, mut audio_buffer, mut stats, stream_start)| async move {
-        if processed >= audio_data.len() {
-            // Print final statistics for streaming
-            stats.total_duration = stream_start.elapsed();
-            stats.print_summary();
-            return None;
-        }
+    // If VAD is disabled, process entire audio directly and return as single stream event
+    if !state.vad_enabled {
+        println!("🔇 VAD disabled for streaming, processing entire audio directly");
+        let whisper_start = Instant::now();
+        let mut whisper = whisper_processor.lock().await;
+        let transcript = match whisper.transcribe(&audio_data) {
+            Ok(text) => text,
+            Err(e) => {
+                return Err((
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    Json(ErrorResponse {
+                        error: ErrorDetail {
+                            message: format!("Transcription failed: {e}"),
+                            error_type: "server_error".to_string(),
+                            param: None,
+                            code: None,
+                        },
+                    }),
+                ));
+            },
+        };
+        processing_stats.whisper_transcription_duration = whisper_start.elapsed();
+        processing_stats.vad_processing_duration = Duration::ZERO;
+        processing_stats.total_duration = stream_start.elapsed();
+        processing_stats.print_summary();

-        // Process audio in chunks suitable for VAD (512 samples at a time)
-        let chunk_size = 512.min(audio_data.len() - processed);
-        let chunk = &audio_data[processed..processed + chunk_size];
-        processed += chunk_size;
+        let event_data = StreamChunk { text: transcript, timestamp: Some(audio_data.len() as f64 / f64::from(sample_rate)) };
+        let event = Event::default().json_data(event_data).unwrap();

-        // Process through VAD and Whisper processors
-        let mut whisper_result = None;
+        return Ok(stream::once(async move { Ok(event) }).boxed());
+    }

-        // Process through VAD
-        let vad_chunk_start = Instant::now();
-        let mut vad = state.vad.lock().await;
-        if let Ok(speech_prob) = vad.process_chunk(chunk) {
-            let is_speech = vad.is_speech(speech_prob);
+    Ok(
+        stream::unfold((state, whisper_processor, audio_data, 0, AudioBuffer::new(10000, 50, 300, sample_rate), processing_stats, stream_start), move |(state, whisper_processor, audio_data, mut processed, mut audio_buffer, mut stats, stream_start)| async move {
+            if processed >= audio_data.len() {
+                // Print final statistics for streaming
+                stats.total_duration = stream_start.elapsed();
+                stats.print_summary();
+                return None;
+            }

-            // Add to audio buffer and check if we have complete audio
-            if let Some(complete_audio) = audio_buffer.add_chunk(chunk, is_speech) {
-                // Release VAD lock before acquiring Whisper lock
-                drop(vad);
-                let vad_chunk_time = vad_chunk_start.elapsed();
-                stats.vad_processing_duration += vad_chunk_time;
-
-                // Process complete audio through Whisper
-                let whisper_chunk_start = Instant::now();
-                let mut whisper = whisper_processor.lock().await;
-                if let Ok(transcript) = whisper.transcribe(&complete_audio) {
-                    let whisper_chunk_time = whisper_chunk_start.elapsed();
-                    stats.whisper_transcription_duration += whisper_chunk_time;
-
-                    if !transcript.trim().is_empty() && !transcript.contains("[BLANK_AUDIO]") {
-                        whisper_result = Some(transcript.trim().to_string());
-                        println!("🎯 Chunk transcribed in {:.2}ms: \"{}\"", whisper_chunk_time.as_secs_f64() * 1000.0, transcript.trim());
+            // Process audio in chunks suitable for VAD (512 samples at a time)
+            let chunk_size = 512.min(audio_data.len() - processed);
+            let chunk = &audio_data[processed..processed + chunk_size];
+            processed += chunk_size;
+
+            // Process through VAD and Whisper processors
+            let mut whisper_result = None;
+
+            // Process through VAD
+            let vad_chunk_start = Instant::now();
+            let mut vad = state.vad.lock().await;
+            if let Ok(speech_prob) = vad.process_chunk(chunk) {
+                let is_speech = vad.is_speech(speech_prob);
+
+                // Add to audio buffer and check if we have complete audio
+                if let Some(complete_audio) = audio_buffer.add_chunk(chunk, is_speech) {
+                    // Release VAD lock before acquiring Whisper lock
+                    drop(vad);
+                    let vad_chunk_time = vad_chunk_start.elapsed();
+                    stats.vad_processing_duration += vad_chunk_time;
+
+                    // Process complete audio through Whisper
+                    let whisper_chunk_start = Instant::now();
+                    let mut whisper = whisper_processor.lock().await;
+                    if let Ok(transcript) = whisper.transcribe(&complete_audio) {
+                        let whisper_chunk_time = whisper_chunk_start.elapsed();
+                        stats.whisper_transcription_duration += whisper_chunk_time;
+
+                        if !transcript.trim().is_empty() && !transcript.contains("[BLANK_AUDIO]") {
+                            whisper_result = Some(transcript.trim().to_string());
+                            println!("🎯 Chunk transcribed in {:.2}ms: \"{}\"", whisper_chunk_time.as_secs_f64() * 1000.0, transcript.trim());
+                        }
                     }
                 }
+            } else {
+                stats.vad_processing_duration += vad_chunk_start.elapsed();
             }
-        } else {
-            stats.vad_processing_duration += vad_chunk_start.elapsed();
-        }

-        // Create event with actual transcription or progress update
-        #[allow(clippy::option_if_let_else)]
-        let event_data = if let Some(transcript) = whisper_result {
-            #[allow(clippy::cast_precision_loss)]
-            StreamChunk { text: transcript, timestamp: Some(processed as f64 / f64::from(sample_rate)) }
-        } else {
-            StreamChunk {
-                #[allow(clippy::cast_precision_loss)]
-                text: format!("Processing... ({:.1}%)", (processed as f64 / audio_data.len() as f64) * 100.0),
+            // Create event with actual transcription or progress update
+            #[allow(clippy::option_if_let_else)]
+            let event_data = if let Some(transcript) = whisper_result {
                 #[allow(clippy::cast_precision_loss)]
-                timestamp: Some(processed as f64 / f64::from(sample_rate)),
-            }
-        };
+                StreamChunk { text: transcript, timestamp: Some(processed as f64 / f64::from(sample_rate)) }
+            } else {
+                StreamChunk {
+                    #[allow(clippy::cast_precision_loss)]
+                    text: format!("Processing... ({:.1}%)", (processed as f64 / audio_data.len() as f64) * 100.0),
+                    #[allow(clippy::cast_precision_loss)]
+                    timestamp: Some(processed as f64 / f64::from(sample_rate)),
+                }
+            };

-        let event = Event::default().json_data(event_data).unwrap();
+            let event = Event::default().json_data(event_data).unwrap();

-        Some((Ok(event), (state.clone(), whisper_processor.clone(), audio_data, processed, audio_buffer, stats, stream_start)))
-    }))
+            Some((Ok(event), (state.clone(), whisper_processor.clone(), audio_data, processed, audio_buffer, stats, stream_start)))
+        })
+        .boxed(),
+    )
 }
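Downstream, either branch hands the caller the same boxed SSE event stream. The route handler is not part of this diff, but the `Sse`/`KeepAlive` imports already present in the file suggest the usual axum wiring, roughly as sketched below. The handler and route names are illustrative, not from the commit.

```rust
use std::{convert::Infallible, pin::Pin};

use axum::{
    response::sse::{Event, KeepAlive, Sse},
    routing::get,
    Router,
};
use futures::stream::{self, Stream, StreamExt};

// Any boxed stream of `Result<Event, _>` can be served as SSE; the keep-alive
// comments stop idle proxies from closing the connection between events.
async fn events() -> Sse<Pin<Box<dyn Stream<Item = Result<Event, Infallible>> + Send>>> {
    let stream = stream::iter(["partial", "final"])
        .map(|text| Ok(Event::default().data(text)))
        .boxed();
    Sse::new(stream).keep_alive(KeepAlive::default())
}

fn app() -> Router {
    Router::new().route("/events", get(events))
}
```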