Skip to content

Commit f7a26b8

Browse files
committed
chore: update
1 parent ad0f511 commit f7a26b8

File tree

8 files changed

+165
-79
lines changed

8 files changed

+165
-79
lines changed

apps/silero-vad-whisper-realtime-ort/src/main.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,5 +131,3 @@ async fn transcribe_audio(
131131
println!("📝 Transcript ({:.2}s): \"{}\"", duration.as_secs_f32(), transcript.trim());
132132
Ok(())
133133
}
134-
135-
// ... (Your save_audio_chunk_to_file and write_wav_file functions remain the same)

apps/silero-vad-whisper-realtime-ort/src/whisper_processor.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
// src/whisper_processor.rs
2-
31
use anyhow::{Result, anyhow};
42
use byteorder::{ByteOrder, LittleEndian};
53
use ndarray::{Array1, Array2, s};
@@ -60,12 +58,9 @@ impl WhisperProcessor {
6058

6159
// 2. Compute the Short-Time Fourier Transform (STFT)
6260
let stft = self.stft(&pcm_data);
63-
6461
// 3. Apply the mel filter bank
6562
let mel_spectrogram = self.mel_filters.dot(&stft);
66-
6763
// 4. Apply logarithmic scaling
68-
6964
self.log_mel_spectrogram(&mel_spectrogram)
7065
}
7166

@@ -80,19 +75,16 @@ impl WhisperProcessor {
8075
// Pad the input data
8176
let mut padded_data = Array1::zeros(pcm_data.len() + N_FFT);
8277

83-
// --- FIX 1: Correct slicing for unsigned types ---
8478
let end = padded_data.len() - N_FFT / 2;
8579
padded_data
8680
.slice_mut(s![N_FFT / 2..end])
8781
.assign(pcm_data);
8882

89-
// --- FIX 2: Call .into_iter() before .step_by() ---
9083
let frames = padded_data
9184
.windows(N_FFT)
9285
.into_iter()
9386
.step_by(HOP_LENGTH);
9487

95-
// Initialize FFT planner
9688
let mut planner = FftPlanner::<f32>::new();
9789
let fft = planner.plan_fft_forward(N_FFT);
9890

apps/whisper-api/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ hf-hub = "0.4.2"
1616
rand = "0.9.1"
1717
rubato = "0.16.2"
1818
serde_json = "1.0.140"
19-
symphonia = "0.5.4"
19+
symphonia = { version = "0.5.4", features = ["mkv"] }
2020
tokenizers = "0.21.1"
2121
tracing-chrome = "0.7.2"
2222
tracing-subscriber = "0.3.19"

apps/whisper-api/src/main.rs

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use tower::ServiceBuilder;
1313
use tower_http::cors::CorsLayer;
1414

1515
use crate::{
16-
router::transcribe_audio,
16+
router::{list_models, transcribe_audio},
1717
vad::VADProcessor,
1818
whisper::{WhichWhisperModel, WhisperProcessor},
1919
};
@@ -102,24 +102,8 @@ impl AppState {
102102

103103
// Parse model name string to WhichWhisperModel enum
104104
fn parse_model_name(model_name: &str) -> Result<WhichWhisperModel> {
105-
match model_name.to_lowercase().as_str() {
106-
"tiny" => Ok(WhichWhisperModel::Tiny),
107-
"tiny.en" => Ok(WhichWhisperModel::TinyEn),
108-
"base" => Ok(WhichWhisperModel::Base),
109-
"base.en" => Ok(WhichWhisperModel::BaseEn),
110-
"small" => Ok(WhichWhisperModel::Small),
111-
"small.en" => Ok(WhichWhisperModel::SmallEn),
112-
"medium" => Ok(WhichWhisperModel::Medium),
113-
"medium.en" => Ok(WhichWhisperModel::MediumEn),
114-
"large" => Ok(WhichWhisperModel::Large),
115-
"large-v2" => Ok(WhichWhisperModel::LargeV2),
116-
"large-v3" => Ok(WhichWhisperModel::LargeV3),
117-
"large-v3-turbo" => Ok(WhichWhisperModel::LargeV3Turbo),
118-
"distil-medium.en" => Ok(WhichWhisperModel::DistilMediumEn),
119-
"distil-large-v2" => Ok(WhichWhisperModel::DistilLargeV2),
120-
"lite-whisper-large-v3-turbo" => Ok(WhichWhisperModel::LiteWhisperLargeV3Turbo),
121-
"lite-whisper-large-v3-turbo-acc" => Ok(WhichWhisperModel::LiteWhisperLargeV3TurboAcc),
122-
"lite-whisper-large-v3-turbo-fast" => Ok(WhichWhisperModel::LiteWhisperLargeV3TurboFast),
105+
match WhichWhisperModel::from_str(model_name) {
106+
Some(model) => Ok(model),
123107
_ => anyhow::bail!("Unsupported Whisper model: {}. Supported models: tiny, base, small, medium, large, large-v2, large-v3, large-v3-turbo, distil-medium.en, distil-large-v2, lite-whisper-large-v3-turbo, lite-whisper-large-v3-turbo-acc, lite-whisper-large-v3-turbo-fast", model_name),
124108
}
125109
}
@@ -136,6 +120,7 @@ async fn main() -> Result<()> {
136120
// Build application routes
137121
let app = Router::new()
138122
.route("/healthz", get(health_check))
123+
.route("/v1/models", get(list_models))
139124
.route("/v1/audio/transcriptions", post(transcribe_audio))
140125
.layer(
141126
ServiceBuilder::new()

apps/whisper-api/src/router.rs

Lines changed: 97 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,17 @@ use axum::{
1616
},
1717
};
1818
use futures::stream::{self, Stream};
19-
use symphonia::{
20-
core::{
21-
audio::{AudioBufferRef, Signal},
22-
codecs::DecoderOptions,
23-
formats::FormatOptions,
24-
io::{MediaSourceStream, MediaSourceStreamOptions},
25-
meta::MetadataOptions,
26-
probe::Hint,
27-
},
28-
default::get_probe,
19+
use serde_json::json;
20+
use symphonia::core::{
21+
audio::{AudioBufferRef, Signal},
22+
codecs::DecoderOptions,
2923
};
3024

3125
use crate::{
3226
AppState,
3327
api::{ErrorDetail, ErrorResponse, StreamChunk, TranscriptionResponse},
3428
audio_manager::AudioBuffer,
29+
whisper::WhichWhisperModel,
3530
};
3631

3732
// Performance statistics struct
@@ -72,6 +67,46 @@ impl ProcessingStats {
7267
}
7368
}
7469

70+
pub async fn list_models() -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
71+
// Return the static list of supported Whisper models in an OpenAI-compatible `/v1/models` format.
72+
let models = vec![
73+
WhichWhisperModel::Tiny,
74+
WhichWhisperModel::TinyEn,
75+
WhichWhisperModel::Base,
76+
WhichWhisperModel::BaseEn,
77+
WhichWhisperModel::Small,
78+
WhichWhisperModel::SmallEn,
79+
WhichWhisperModel::Medium,
80+
WhichWhisperModel::MediumEn,
81+
WhichWhisperModel::Large,
82+
WhichWhisperModel::LargeV2,
83+
WhichWhisperModel::LargeV3,
84+
WhichWhisperModel::LargeV3Turbo,
85+
WhichWhisperModel::DistilMediumEn,
86+
WhichWhisperModel::DistilLargeV2,
87+
WhichWhisperModel::LiteWhisperLargeV3Turbo,
88+
WhichWhisperModel::LiteWhisperLargeV3TurboAcc,
89+
WhichWhisperModel::LiteWhisperLargeV3TurboFast,
90+
]
91+
.iter()
92+
.map(|model| {
93+
json!({
94+
"id": model.to_string(),
95+
"type": "object",
96+
"owned_by": "candle-examples",
97+
})
98+
})
99+
.collect::<Vec<_>>();
100+
101+
Ok(
102+
Json(json!({
103+
"object": "list",
104+
"data": models,
105+
}))
106+
.into_response(),
107+
)
108+
}
109+
75110
#[allow(clippy::cast_precision_loss)]
76111
pub async fn transcribe_audio(
77112
State(state): State<Arc<AppState>>,
@@ -192,59 +227,77 @@ async fn extract_multipart_data(multipart: &mut Multipart) -> Result<(Vec<u8>, H
192227
Ok((audio, params))
193228
}
194229

195-
// Convert various audio formats to PCM
196-
#[allow(clippy::unused_async)]
230+
use symphonia::core::{codecs::CODEC_TYPE_NULL, conv::FromSample};
231+
232+
fn conv<T>(
233+
samples: &mut Vec<f32>,
234+
data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>,
235+
) where
236+
T: symphonia::core::sample::Sample,
237+
f32: symphonia::core::conv::FromSample<T>,
238+
{
239+
samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
240+
}
241+
197242
async fn convert_audio_to_pcm(audio_data: &[u8]) -> Result<Vec<f32>> {
198243
let cursor = std::io::Cursor::new(audio_data.to_vec());
199-
let media_source = MediaSourceStream::new(Box::new(cursor), MediaSourceStreamOptions::default());
200244

201-
let mut hint = Hint::new();
202-
hint.mime_type("audio/wav"); // You might want to detect this automatically
245+
// Create the media source stream.
246+
let mss = symphonia::core::io::MediaSourceStream::new(Box::new(cursor), Default::default());
203247

204-
let meta_opts = MetadataOptions::default();
205-
let fmt_opts = FormatOptions::default();
248+
// Create a probe hint using the file's extension. [Optional]
249+
let hint = symphonia::core::probe::Hint::new();
206250

207-
let probed = get_probe().format(&hint, media_source, &fmt_opts, &meta_opts)?;
251+
// Use the default options for metadata and format readers.
252+
let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
253+
let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
208254

255+
// Probe the media source.
256+
let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?;
257+
// Get the instantiated format reader.
209258
let mut format = probed.format;
259+
260+
// Find the first audio track with a known (decodeable) codec.
210261
let track = format
211262
.tracks()
212263
.iter()
213-
.find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
214-
.ok_or_else(|| anyhow::anyhow!("No audio track found"))?;
264+
.find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
265+
.expect("no supported audio tracks");
266+
267+
// Use the default options for the decoder.
268+
let dec_opts: DecoderOptions = Default::default();
269+
let mut dec_params = track.codec_params.clone();
270+
dec_params.max_frames_per_packet = Some(1); // Decode one frame at a time
215271

216-
let dec_opts = DecoderOptions::default();
217-
let mut decoder = symphonia::default::get_codecs().make(&track.codec_params, &dec_opts)?;
272+
// Create a decoder for the track.
273+
let mut decoder = symphonia::default::get_codecs()
274+
.make(&dec_params, &dec_opts)
275+
.expect("unsupported codec");
218276

219277
let track_id = track.id;
220278
let mut pcm_data = Vec::new();
221-
222-
// Decode the audio
279+
// The decode loop.
223280
while let Ok(packet) = format.next_packet() {
281+
// Consume any new metadata that has been read since the last packet.
282+
while !format.metadata().is_latest() {
283+
format.metadata().pop();
284+
}
285+
286+
// If the packet does not belong to the selected track, skip over it.
224287
if packet.track_id() != track_id {
225288
continue;
226289
}
227-
228290
match decoder.decode(&packet)? {
229-
AudioBufferRef::F32(buf) => {
230-
for &sample in buf.chan(0) {
231-
pcm_data.push(sample);
232-
}
233-
},
234-
AudioBufferRef::S16(buf) => {
235-
for &sample in buf.chan(0) {
236-
pcm_data.push(f32::from(sample) / f32::from(i16::MAX));
237-
}
238-
},
239-
AudioBufferRef::S32(buf) => {
240-
for &sample in buf.chan(0) {
241-
#[allow(clippy::cast_precision_loss)]
242-
pcm_data.push(sample as f32 / i32::MAX as f32);
243-
}
244-
},
245-
_ => {
246-
anyhow::bail!("Unsupported audio format");
247-
},
291+
AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
292+
AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
293+
AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
294+
AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
295+
AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
296+
AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
297+
AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
298+
AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
299+
AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
300+
AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
248301
}
249302
}
250303

apps/whisper-api/src/whisper.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::fmt::Display;
2+
13
use anyhow::Result;
24
use byteorder::{ByteOrder, LittleEndian};
35
use candle_core::{Device, IndexOp, Tensor};
@@ -75,7 +77,61 @@ pub enum WhichWhisperModel {
7577
LiteWhisperLargeV3TurboFast,
7678
}
7779

80+
impl Display for WhichWhisperModel {
81+
fn fmt(
82+
&self,
83+
f: &mut std::fmt::Formatter<'_>,
84+
) -> std::fmt::Result {
85+
write!(
86+
f,
87+
"{}",
88+
match self {
89+
Self::Tiny => "tiny",
90+
Self::TinyEn => "tiny.en",
91+
Self::Base => "base",
92+
Self::BaseEn => "base.en",
93+
Self::Small => "small",
94+
Self::SmallEn => "small.en",
95+
Self::Medium => "medium",
96+
Self::MediumEn => "medium.en",
97+
Self::Large => "large",
98+
Self::LargeV2 => "large-v2",
99+
Self::LargeV3 => "large-v3",
100+
Self::LargeV3Turbo => "large-v3-turbo",
101+
Self::DistilMediumEn => "distil-medium.en",
102+
Self::DistilLargeV2 => "distil-large-v2",
103+
Self::LiteWhisperLargeV3Turbo => "lite-whisper-large-v3-turbo",
104+
Self::LiteWhisperLargeV3TurboAcc => "lite-whisper-large-v3-turbo-acc",
105+
Self::LiteWhisperLargeV3TurboFast => "lite-whisper-large-v3-turbo-fast",
106+
}
107+
)
108+
}
109+
}
110+
78111
impl WhichWhisperModel {
112+
pub fn from_str(s: &str) -> Option<Self> {
113+
match s.to_lowercase().as_str() {
114+
"tiny" => Some(Self::Tiny),
115+
"tiny.en" => Some(Self::TinyEn),
116+
"base" => Some(Self::Base),
117+
"base.en" => Some(Self::BaseEn),
118+
"small" => Some(Self::Small),
119+
"small.en" => Some(Self::SmallEn),
120+
"medium" => Some(Self::Medium),
121+
"medium.en" => Some(Self::MediumEn),
122+
"large" => Some(Self::Large),
123+
"large-v2" => Some(Self::LargeV2),
124+
"large-v3" => Some(Self::LargeV3),
125+
"large-v3-turbo" => Some(Self::LargeV3Turbo),
126+
"distil-medium.en" => Some(Self::DistilMediumEn),
127+
"distil-large-v2" => Some(Self::DistilLargeV2),
128+
"lite-whisper-large-v3-turbo" => Some(Self::LiteWhisperLargeV3Turbo),
129+
"lite-whisper-large-v3-turbo-acc" => Some(Self::LiteWhisperLargeV3TurboAcc),
130+
"lite-whisper-large-v3-turbo-fast" => Some(Self::LiteWhisperLargeV3TurboFast),
131+
_ => None,
132+
}
133+
}
134+
79135
pub const fn model_and_revision(self) -> (&'static str, &'static str) {
80136
match self {
81137
Self::Tiny => ("openai/whisper-tiny", "main"),

cspell.config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@ words:
2323
- probs
2424
- Resampler
2525
- rngs
26+
- rustfft
2627
- safetensors
2728
- Seedable
2829
- serde
2930
- silero
3031
- snac
3132
- softmax
33+
- stft
3234
- unsqueeze
3335
ignoreWords: []
3436
import: []

0 commit comments

Comments
 (0)