@@ -21,7 +21,7 @@ use symphonia::{
21
21
audio:: { AudioBufferRef , Signal } ,
22
22
codecs:: DecoderOptions ,
23
23
formats:: FormatOptions ,
24
- io:: MediaSourceStream ,
24
+ io:: { MediaSourceStream , MediaSourceStreamOptions } ,
25
25
meta:: MetadataOptions ,
26
26
probe:: Hint ,
27
27
} ,
@@ -46,7 +46,7 @@ struct ProcessingStats {
46
46
}
47
47
48
48
impl ProcessingStats {
49
- fn new ( ) -> Self {
49
+ const fn new ( ) -> Self {
50
50
Self {
51
51
total_duration : Duration :: ZERO ,
52
52
audio_conversion_duration : Duration :: ZERO ,
@@ -66,7 +66,7 @@ impl ProcessingStats {
66
66
println ! ( " ⏱️ Total processing: {:.2}ms" , self . total_duration. as_secs_f64( ) * 1000.0 ) ;
67
67
println ! ( " 🎵 Audio length: {:.2}s" , self . audio_length_seconds) ;
68
68
if self . audio_length_seconds > 0.0 {
69
- let real_time_factor = self . total_duration . as_secs_f64 ( ) / self . audio_length_seconds as f64 ;
69
+ let real_time_factor = self . total_duration . as_secs_f64 ( ) / f64 :: from ( self . audio_length_seconds ) ;
70
70
println ! ( " ⚡ Real-time factor: {real_time_factor:.2}x" ) ;
71
71
}
72
72
}
@@ -77,7 +77,7 @@ pub async fn transcribe_audio(
77
77
mut multipart : Multipart ,
78
78
) -> Result < Response , ( StatusCode , Json < ErrorResponse > ) > {
79
79
let start_time = Instant :: now ( ) ;
80
- let mut stats = ProcessingStats :: new ( ) ;
80
+ let mut processing_stats = ProcessingStats :: new ( ) ;
81
81
82
82
// Extract both audio file and parameters from multipart form
83
83
let ( audio_data, params) = match extract_multipart_data ( & mut multipart) . await {
@@ -102,8 +102,7 @@ pub async fn transcribe_audio(
102
102
// Parse streaming parameter from form data
103
103
let stream_enabled = params
104
104
. get ( "stream" )
105
- . map ( |s| s. parse :: < bool > ( ) . unwrap_or ( false ) )
106
- . unwrap_or ( false ) ;
105
+ . is_some_and ( |s| s. parse :: < bool > ( ) . unwrap_or ( false ) ) ;
107
106
108
107
// Get model name from parameters and clone it to make it owned
109
108
let model_name = params
@@ -130,27 +129,28 @@ pub async fn transcribe_audio(
130
129
) ) ;
131
130
} ,
132
131
} ;
133
- stats. audio_conversion_duration = conversion_start. elapsed ( ) ;
134
- stats. audio_length_seconds = pcm_data. len ( ) as f32 / 16000.0 ; // Assuming 16kHz sample rate
135
132
136
- println ! ( "Audio data length: {} samples ({:.2}s)" , pcm_data. len( ) , stats. audio_length_seconds) ;
133
+ processing_stats. audio_conversion_duration = conversion_start. elapsed ( ) ;
134
+ processing_stats. audio_length_seconds = pcm_data. len ( ) as f32 / 16000.0 ; // Assuming 16kHz sample rate
135
+
136
+ println ! ( "Audio data length: {} samples ({:.2}s)" , pcm_data. len( ) , processing_stats. audio_length_seconds) ;
137
137
138
138
if stream_enabled {
139
139
// Return streaming response
140
- let stream = create_transcription_stream ( state, model_name, pcm_data, stats ) . await ?;
140
+ let stream = create_transcription_stream ( state, model_name, pcm_data, processing_stats ) . await ?;
141
141
let sse = Sse :: new ( stream) . keep_alive ( KeepAlive :: default ( ) ) ;
142
142
Ok ( sse. into_response ( ) )
143
143
} else {
144
144
// Return single response
145
- match transcribe_audio_complete ( state, model_name, pcm_data, & mut stats ) . await {
145
+ match transcribe_audio_complete ( state, model_name, pcm_data, & mut processing_stats ) . await {
146
146
Ok ( transcript) => {
147
- stats . total_duration = start_time. elapsed ( ) ;
148
- stats . print_summary ( ) ;
147
+ processing_stats . total_duration = start_time. elapsed ( ) ;
148
+ processing_stats . print_summary ( ) ;
149
149
Ok ( Json ( TranscriptionResponse { text : transcript } ) . into_response ( ) )
150
150
} ,
151
151
Err ( e) => {
152
- stats . total_duration = start_time. elapsed ( ) ;
153
- stats . print_summary ( ) ;
152
+ processing_stats . total_duration = start_time. elapsed ( ) ;
153
+ processing_stats . print_summary ( ) ;
154
154
Err ( (
155
155
StatusCode :: INTERNAL_SERVER_ERROR ,
156
156
Json ( ErrorResponse {
@@ -192,15 +192,16 @@ async fn extract_multipart_data(multipart: &mut Multipart) -> Result<(Vec<u8>, H
192
192
}
193
193
194
194
// Convert various audio formats to PCM
195
+ #[ allow( clippy:: unused_async) ]
195
196
async fn convert_audio_to_pcm ( audio_data : & [ u8 ] ) -> Result < Vec < f32 > > {
196
197
let cursor = std:: io:: Cursor :: new ( audio_data. to_vec ( ) ) ;
197
- let media_source = MediaSourceStream :: new ( Box :: new ( cursor) , Default :: default ( ) ) ;
198
+ let media_source = MediaSourceStream :: new ( Box :: new ( cursor) , MediaSourceStreamOptions :: default ( ) ) ;
198
199
199
200
let mut hint = Hint :: new ( ) ;
200
201
hint. mime_type ( "audio/wav" ) ; // You might want to detect this automatically
201
202
202
- let meta_opts: MetadataOptions = Default :: default ( ) ;
203
- let fmt_opts: FormatOptions = Default :: default ( ) ;
203
+ let meta_opts = MetadataOptions :: default ( ) ;
204
+ let fmt_opts = FormatOptions :: default ( ) ;
204
205
205
206
let probed = get_probe ( ) . format ( & hint, media_source, & fmt_opts, & meta_opts) ?;
206
207
@@ -211,7 +212,7 @@ async fn convert_audio_to_pcm(audio_data: &[u8]) -> Result<Vec<f32>> {
211
212
. find ( |t| t. codec_params . codec != symphonia:: core:: codecs:: CODEC_TYPE_NULL )
212
213
. ok_or_else ( || anyhow:: anyhow!( "No audio track found" ) ) ?;
213
214
214
- let dec_opts: DecoderOptions = Default :: default ( ) ;
215
+ let dec_opts = DecoderOptions :: default ( ) ;
215
216
let mut decoder = symphonia:: default:: get_codecs ( ) . make ( & track. codec_params , & dec_opts) ?;
216
217
217
218
let track_id = track. id ;
@@ -236,6 +237,7 @@ async fn convert_audio_to_pcm(audio_data: &[u8]) -> Result<Vec<f32>> {
236
237
} ,
237
238
AudioBufferRef :: S32 ( buf) => {
238
239
for & sample in buf. chan ( 0 ) {
240
+ #[ allow( clippy:: cast_precision_loss) ]
239
241
pcm_data. push ( sample as f32 / i32:: MAX as f32 ) ;
240
242
}
241
243
} ,
@@ -253,14 +255,14 @@ async fn transcribe_audio_complete(
253
255
state : Arc < AppState > ,
254
256
model_name : String , // Change to owned String
255
257
audio_data : Vec < f32 > ,
256
- stats : & mut ProcessingStats ,
258
+ processing_stats : & mut ProcessingStats ,
257
259
) -> Result < String > {
258
260
let sample_rate = 16000 ;
259
261
260
262
// Get the appropriate Whisper processor for this model with timing
261
263
let model_loading_start = Instant :: now ( ) ;
262
264
let whisper_processor = state. get_whisper_processor ( & model_name) . await ?;
263
- stats . model_loading_duration = model_loading_start. elapsed ( ) ;
265
+ processing_stats . model_loading_duration = model_loading_start. elapsed ( ) ;
264
266
265
267
// Process audio through VAD and Whisper
266
268
let mut vad = state. vad . lock ( ) . await ;
@@ -296,8 +298,8 @@ async fn transcribe_audio_complete(
296
298
}
297
299
}
298
300
299
- stats . vad_processing_duration = vad_start. elapsed ( ) - whisper_total_time;
300
- stats . whisper_transcription_duration = whisper_total_time;
301
+ processing_stats . vad_processing_duration = vad_start. elapsed ( ) - whisper_total_time;
302
+ processing_stats . whisper_transcription_duration = whisper_total_time;
301
303
302
304
Ok ( transcripts. join ( " " ) )
303
305
}
@@ -307,7 +309,7 @@ async fn create_transcription_stream(
307
309
state : Arc < AppState > ,
308
310
model_name : String , // Change to owned String
309
311
audio_data : Vec < f32 > ,
310
- mut stats : ProcessingStats ,
312
+ mut processing_stats : ProcessingStats ,
311
313
) -> Result < impl Stream < Item = Result < Event , anyhow:: Error > > , ( StatusCode , Json < ErrorResponse > ) > {
312
314
let stream_start = Instant :: now ( ) ;
313
315
@@ -329,11 +331,11 @@ async fn create_transcription_stream(
329
331
) ) ;
330
332
} ,
331
333
} ;
332
- stats . model_loading_duration = model_loading_start. elapsed ( ) ;
334
+ processing_stats . model_loading_duration = model_loading_start. elapsed ( ) ;
333
335
334
336
let sample_rate = 16000 ;
335
337
336
- Ok ( stream:: unfold ( ( state, whisper_processor, audio_data, 0 , AudioBuffer :: new ( 10000 , 100 , 500 , sample_rate) , stats , stream_start) , move |( state, whisper_processor, audio_data, mut processed, mut audio_buffer, mut stats, stream_start) | async move {
338
+ Ok ( stream:: unfold ( ( state, whisper_processor, audio_data, 0 , AudioBuffer :: new ( 10000 , 100 , 500 , sample_rate) , processing_stats , stream_start) , move |( state, whisper_processor, audio_data, mut processed, mut audio_buffer, mut stats, stream_start) | async move {
337
339
if processed >= audio_data. len ( ) {
338
340
// Print final statistics for streaming
339
341
stats. total_duration = stream_start. elapsed ( ) ;
@@ -381,11 +383,14 @@ async fn create_transcription_stream(
381
383
382
384
// Create event with actual transcription or progress update
383
385
let event_data = if let Some ( transcript) = whisper_result {
384
- StreamChunk { text : transcript, timestamp : Some ( processed as f64 / sample_rate as f64 ) }
386
+ #[ allow( clippy:: cast_precision_loss) ]
387
+ StreamChunk { text : transcript, timestamp : Some ( processed as f64 / f64:: from ( sample_rate) ) }
385
388
} else {
386
389
StreamChunk {
387
- text : format ! ( "Processing... ({:.1}%)" , ( processed as f64 / audio_data. len( ) as f64 ) * 100.0 ) ,
388
- timestamp : Some ( processed as f64 / sample_rate as f64 ) ,
390
+ #[ allow( clippy:: cast_precision_loss) ]
391
+ text : format ! ( "Processing... ({:.1}%)" , ( processed as f64 / audio_data. len( ) as f64 ) * 100.0 ) ,
392
+ #[ allow( clippy:: cast_precision_loss) ]
393
+ timestamp : Some ( processed as f64 / f64:: from ( sample_rate) ) ,
389
394
}
390
395
} ;
391
396
0 commit comments