@@ -16,22 +16,17 @@ use axum::{
16
16
} ,
17
17
} ;
18
18
use futures:: stream:: { self , Stream } ;
19
- use symphonia:: {
20
- core:: {
21
- audio:: { AudioBufferRef , Signal } ,
22
- codecs:: DecoderOptions ,
23
- formats:: FormatOptions ,
24
- io:: { MediaSourceStream , MediaSourceStreamOptions } ,
25
- meta:: MetadataOptions ,
26
- probe:: Hint ,
27
- } ,
28
- default:: get_probe,
19
+ use serde_json:: json;
20
+ use symphonia:: core:: {
21
+ audio:: { AudioBufferRef , Signal } ,
22
+ codecs:: DecoderOptions ,
29
23
} ;
30
24
31
25
use crate :: {
32
26
AppState ,
33
27
api:: { ErrorDetail , ErrorResponse , StreamChunk , TranscriptionResponse } ,
34
28
audio_manager:: AudioBuffer ,
29
+ whisper:: WhichWhisperModel ,
35
30
} ;
36
31
37
32
// Performance statistics struct
@@ -72,6 +67,46 @@ impl ProcessingStats {
72
67
}
73
68
}
74
69
70
+ pub async fn list_models ( ) -> Result < Response , ( StatusCode , Json < ErrorResponse > ) > {
71
+ // List available models (this is a placeholder, implement actual model listing logic)
72
+ let models = vec ! [
73
+ WhichWhisperModel :: Tiny ,
74
+ WhichWhisperModel :: TinyEn ,
75
+ WhichWhisperModel :: Base ,
76
+ WhichWhisperModel :: BaseEn ,
77
+ WhichWhisperModel :: Small ,
78
+ WhichWhisperModel :: SmallEn ,
79
+ WhichWhisperModel :: Medium ,
80
+ WhichWhisperModel :: MediumEn ,
81
+ WhichWhisperModel :: Large ,
82
+ WhichWhisperModel :: LargeV2 ,
83
+ WhichWhisperModel :: LargeV3 ,
84
+ WhichWhisperModel :: LargeV3Turbo ,
85
+ WhichWhisperModel :: DistilMediumEn ,
86
+ WhichWhisperModel :: DistilLargeV2 ,
87
+ WhichWhisperModel :: LiteWhisperLargeV3Turbo ,
88
+ WhichWhisperModel :: LiteWhisperLargeV3TurboAcc ,
89
+ WhichWhisperModel :: LiteWhisperLargeV3TurboFast ,
90
+ ]
91
+ . iter ( )
92
+ . map ( |model| {
93
+ json ! ( {
94
+ "id" : model. to_string( ) ,
95
+ "type" : "object" ,
96
+ "owned_by" : "candle-examples" ,
97
+ } )
98
+ } )
99
+ . collect :: < Vec < _ > > ( ) ;
100
+
101
+ Ok (
102
+ Json ( json ! ( {
103
+ "object" : "list" ,
104
+ "data" : models,
105
+ } ) )
106
+ . into_response ( ) ,
107
+ )
108
+ }
109
+
75
110
#[ allow( clippy:: cast_precision_loss) ]
76
111
pub async fn transcribe_audio (
77
112
State ( state) : State < Arc < AppState > > ,
@@ -192,59 +227,77 @@ async fn extract_multipart_data(multipart: &mut Multipart) -> Result<(Vec<u8>, H
192
227
Ok ( ( audio, params) )
193
228
}
194
229
195
- // Convert various audio formats to PCM
196
- #[ allow( clippy:: unused_async) ]
230
+ use symphonia:: core:: { codecs:: CODEC_TYPE_NULL , conv:: FromSample } ;
231
+
232
+ fn conv < T > (
233
+ samples : & mut Vec < f32 > ,
234
+ data : std:: borrow:: Cow < symphonia:: core:: audio:: AudioBuffer < T > > ,
235
+ ) where
236
+ T : symphonia:: core:: sample:: Sample ,
237
+ f32 : symphonia:: core:: conv:: FromSample < T > ,
238
+ {
239
+ samples. extend ( data. chan ( 0 ) . iter ( ) . map ( |v| f32:: from_sample ( * v) ) )
240
+ }
241
+
197
242
async fn convert_audio_to_pcm ( audio_data : & [ u8 ] ) -> Result < Vec < f32 > > {
198
243
let cursor = std:: io:: Cursor :: new ( audio_data. to_vec ( ) ) ;
199
- let media_source = MediaSourceStream :: new ( Box :: new ( cursor) , MediaSourceStreamOptions :: default ( ) ) ;
200
244
201
- let mut hint = Hint :: new ( ) ;
202
- hint . mime_type ( "audio/wav" ) ; // You might want to detect this automatically
245
+ // Create the media source stream.
246
+ let mss = symphonia :: core :: io :: MediaSourceStream :: new ( Box :: new ( cursor ) , Default :: default ( ) ) ;
203
247
204
- let meta_opts = MetadataOptions :: default ( ) ;
205
- let fmt_opts = FormatOptions :: default ( ) ;
248
+ // Create a probe hint using the file's extension. [Optional]
249
+ let hint = symphonia :: core :: probe :: Hint :: new ( ) ;
206
250
207
- let probed = get_probe ( ) . format ( & hint, media_source, & fmt_opts, & meta_opts) ?;
251
+ // Use the default options for metadata and format readers.
252
+ let meta_opts: symphonia:: core:: meta:: MetadataOptions = Default :: default ( ) ;
253
+ let fmt_opts: symphonia:: core:: formats:: FormatOptions = Default :: default ( ) ;
208
254
255
+ // Probe the media source.
256
+ let probed = symphonia:: default:: get_probe ( ) . format ( & hint, mss, & fmt_opts, & meta_opts) ?;
257
+ // Get the instantiated format reader.
209
258
let mut format = probed. format ;
259
+
260
+ // Find the first audio track with a known (decodeable) codec.
210
261
let track = format
211
262
. tracks ( )
212
263
. iter ( )
213
- . find ( |t| t. codec_params . codec != symphonia:: core:: codecs:: CODEC_TYPE_NULL )
214
- . ok_or_else ( || anyhow:: anyhow!( "No audio track found" ) ) ?;
264
+ . find ( |t| t. codec_params . codec != CODEC_TYPE_NULL )
265
+ . expect ( "no supported audio tracks" ) ;
266
+
267
+ // Use the default options for the decoder.
268
+ let dec_opts: DecoderOptions = Default :: default ( ) ;
269
+ let mut dec_params = track. codec_params . clone ( ) ;
270
+ dec_params. max_frames_per_packet = Some ( 1 ) ; // Decode one frame at a time
215
271
216
- let dec_opts = DecoderOptions :: default ( ) ;
217
- let mut decoder = symphonia:: default:: get_codecs ( ) . make ( & track. codec_params , & dec_opts) ?;
272
+ // Create a decoder for the track.
273
+ let mut decoder = symphonia:: default:: get_codecs ( )
274
+ . make ( & dec_params, & dec_opts)
275
+ . expect ( "unsupported codec" ) ;
218
276
219
277
let track_id = track. id ;
220
278
let mut pcm_data = Vec :: new ( ) ;
221
-
222
- // Decode the audio
279
+ // The decode loop.
223
280
while let Ok ( packet) = format. next_packet ( ) {
281
+ // Consume any new metadata that has been read since the last packet.
282
+ while !format. metadata ( ) . is_latest ( ) {
283
+ format. metadata ( ) . pop ( ) ;
284
+ }
285
+
286
+ // If the packet does not belong to the selected track, skip over it.
224
287
if packet. track_id ( ) != track_id {
225
288
continue ;
226
289
}
227
-
228
290
match decoder. decode ( & packet) ? {
229
- AudioBufferRef :: F32 ( buf) => {
230
- for & sample in buf. chan ( 0 ) {
231
- pcm_data. push ( sample) ;
232
- }
233
- } ,
234
- AudioBufferRef :: S16 ( buf) => {
235
- for & sample in buf. chan ( 0 ) {
236
- pcm_data. push ( f32:: from ( sample) / f32:: from ( i16:: MAX ) ) ;
237
- }
238
- } ,
239
- AudioBufferRef :: S32 ( buf) => {
240
- for & sample in buf. chan ( 0 ) {
241
- #[ allow( clippy:: cast_precision_loss) ]
242
- pcm_data. push ( sample as f32 / i32:: MAX as f32 ) ;
243
- }
244
- } ,
245
- _ => {
246
- anyhow:: bail!( "Unsupported audio format" ) ;
247
- } ,
291
+ AudioBufferRef :: F32 ( buf) => pcm_data. extend ( buf. chan ( 0 ) ) ,
292
+ AudioBufferRef :: U8 ( data) => conv ( & mut pcm_data, data) ,
293
+ AudioBufferRef :: U16 ( data) => conv ( & mut pcm_data, data) ,
294
+ AudioBufferRef :: U24 ( data) => conv ( & mut pcm_data, data) ,
295
+ AudioBufferRef :: U32 ( data) => conv ( & mut pcm_data, data) ,
296
+ AudioBufferRef :: S8 ( data) => conv ( & mut pcm_data, data) ,
297
+ AudioBufferRef :: S16 ( data) => conv ( & mut pcm_data, data) ,
298
+ AudioBufferRef :: S24 ( data) => conv ( & mut pcm_data, data) ,
299
+ AudioBufferRef :: S32 ( data) => conv ( & mut pcm_data, data) ,
300
+ AudioBufferRef :: F64 ( data) => conv ( & mut pcm_data, data) ,
248
301
}
249
302
}
250
303
0 commit comments