Skip to content

Commit f7a26b8

Browse files
committed
chore: update
1 parent ad0f511 commit f7a26b8

File tree

8 files changed

+165
-79
lines changed

8 files changed

+165
-79
lines changed

apps/silero-vad-whisper-realtime-ort/src/main.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,5 +131,3 @@ async fn transcribe_audio(
131131
println!("📝 Transcript ({:.2}s): \"{}\"", duration.as_secs_f32(), transcript.trim());
132132
Ok(())
133133
}
134-
135-
// ... (Your save_audio_chunk_to_file and write_wav_file functions remain the same)

apps/silero-vad-whisper-realtime-ort/src/whisper_processor.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
// src/whisper_processor.rs
2-
31
use anyhow::{Result, anyhow};
42
use byteorder::{ByteOrder, LittleEndian};
53
use ndarray::{Array1, Array2, s};
@@ -60,12 +58,9 @@ impl WhisperProcessor {
6058

6159
// 2. Compute the Short-Time Fourier Transform (STFT)
6260
let stft = self.stft(&pcm_data);
63-
6461
// 3. Apply the mel filter bank
6562
let mel_spectrogram = self.mel_filters.dot(&stft);
66-
6763
// 4. Apply logarithmic scaling
68-
6964
self.log_mel_spectrogram(&mel_spectrogram)
7065
}
7166

@@ -80,19 +75,16 @@ impl WhisperProcessor {
8075
// Pad the input data
8176
let mut padded_data = Array1::zeros(pcm_data.len() + N_FFT);
8277

83-
// --- FIX 1: Correct slicing for unsigned types ---
8478
let end = padded_data.len() - N_FFT / 2;
8579
padded_data
8680
.slice_mut(s![N_FFT / 2..end])
8781
.assign(pcm_data);
8882

89-
// --- FIX 2: Call .into_iter() before .step_by() ---
9083
let frames = padded_data
9184
.windows(N_FFT)
9285
.into_iter()
9386
.step_by(HOP_LENGTH);
9487

95-
// Initialize FFT planner
9688
let mut planner = FftPlanner::<f32>::new();
9789
let fft = planner.plan_fft_forward(N_FFT);
9890

apps/whisper-api/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ hf-hub = "0.4.2"
1616
rand = "0.9.1"
1717
rubato = "0.16.2"
1818
serde_json = "1.0.140"
19-
symphonia = "0.5.4"
19+
symphonia = { version = "0.5.4", features = ["mkv"] }
2020
tokenizers = "0.21.1"
2121
tracing-chrome = "0.7.2"
2222
tracing-subscriber = "0.3.19"

apps/whisper-api/src/main.rs

Lines changed: 4 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use tower::ServiceBuilder;
1313
use tower_http::cors::CorsLayer;
1414

1515
use crate::{
16-
router::transcribe_audio,
16+
router::{list_models, transcribe_audio},
1717
vad::VADProcessor,
1818
whisper::{WhichWhisperModel, WhisperProcessor},
1919
};
@@ -102,24 +102,8 @@ impl AppState {
102102

103103
// Parse model name string to WhichWhisperModel enum
104104
fn parse_model_name(model_name: &str) -> Result<WhichWhisperModel> {
105-
match model_name.to_lowercase().as_str() {
106-
"tiny" => Ok(WhichWhisperModel::Tiny),
107-
"tiny.en" => Ok(WhichWhisperModel::TinyEn),
108-
"base" => Ok(WhichWhisperModel::Base),
109-
"base.en" => Ok(WhichWhisperModel::BaseEn),
110-
"small" => Ok(WhichWhisperModel::Small),
111-
"small.en" => Ok(WhichWhisperModel::SmallEn),
112-
"medium" => Ok(WhichWhisperModel::Medium),
113-
"medium.en" => Ok(WhichWhisperModel::MediumEn),
114-
"large" => Ok(WhichWhisperModel::Large),
115-
"large-v2" => Ok(WhichWhisperModel::LargeV2),
116-
"large-v3" => Ok(WhichWhisperModel::LargeV3),
117-
"large-v3-turbo" => Ok(WhichWhisperModel::LargeV3Turbo),
118-
"distil-medium.en" => Ok(WhichWhisperModel::DistilMediumEn),
119-
"distil-large-v2" => Ok(WhichWhisperModel::DistilLargeV2),
120-
"lite-whisper-large-v3-turbo" => Ok(WhichWhisperModel::LiteWhisperLargeV3Turbo),
121-
"lite-whisper-large-v3-turbo-acc" => Ok(WhichWhisperModel::LiteWhisperLargeV3TurboAcc),
122-
"lite-whisper-large-v3-turbo-fast" => Ok(WhichWhisperModel::LiteWhisperLargeV3TurboFast),
105+
match WhichWhisperModel::from_str(model_name) {
106+
Some(model) => Ok(model),
123107
_ => anyhow::bail!("Unsupported Whisper model: {}. Supported models: tiny, base, small, medium, large, large-v2, large-v3, large-v3-turbo, distil-medium.en, distil-large-v2, lite-whisper-large-v3-turbo, lite-whisper-large-v3-turbo-acc, lite-whisper-large-v3-turbo-fast", model_name),
124108
}
125109
}
@@ -136,6 +120,7 @@ async fn main() -> Result<()> {
136120
// Build application routes
137121
let app = Router::new()
138122
.route("/healthz", get(health_check))
123+
.route("/v1/models", get(list_models))
139124
.route("/v1/audio/transcriptions", post(transcribe_audio))
140125
.layer(
141126
ServiceBuilder::new()

apps/whisper-api/src/router.rs

Lines changed: 97 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -16,22 +16,17 @@ use axum::{
1616
},
1717
};
1818
use futures::stream::{self, Stream};
19-
use symphonia::{
20-
core::{
21-
audio::{AudioBufferRef, Signal},
22-
codecs::DecoderOptions,
23-
formats::FormatOptions,
24-
io::{MediaSourceStream, MediaSourceStreamOptions},
25-
meta::MetadataOptions,
26-
probe::Hint,
27-
},
28-
default::get_probe,
19+
use serde_json::json;
20+
use symphonia::core::{
21+
audio::{AudioBufferRef, Signal},
22+
codecs::DecoderOptions,
2923
};
3024

3125
use crate::{
3226
AppState,
3327
api::{ErrorDetail, ErrorResponse, StreamChunk, TranscriptionResponse},
3428
audio_manager::AudioBuffer,
29+
whisper::WhichWhisperModel,
3530
};
3631

3732
// Performance statistics struct
@@ -72,6 +67,46 @@ impl ProcessingStats {
7267
}
7368
}
7469

70+
pub async fn list_models() -> Result<Response, (StatusCode, Json<ErrorResponse>)> {
71+
// Return the static list of supported Whisper models in an OpenAI-compatible `/v1/models` format.
72+
let models = vec![
73+
WhichWhisperModel::Tiny,
74+
WhichWhisperModel::TinyEn,
75+
WhichWhisperModel::Base,
76+
WhichWhisperModel::BaseEn,
77+
WhichWhisperModel::Small,
78+
WhichWhisperModel::SmallEn,
79+
WhichWhisperModel::Medium,
80+
WhichWhisperModel::MediumEn,
81+
WhichWhisperModel::Large,
82+
WhichWhisperModel::LargeV2,
83+
WhichWhisperModel::LargeV3,
84+
WhichWhisperModel::LargeV3Turbo,
85+
WhichWhisperModel::DistilMediumEn,
86+
WhichWhisperModel::DistilLargeV2,
87+
WhichWhisperModel::LiteWhisperLargeV3Turbo,
88+
WhichWhisperModel::LiteWhisperLargeV3TurboAcc,
89+
WhichWhisperModel::LiteWhisperLargeV3TurboFast,
90+
]
91+
.iter()
92+
.map(|model| {
93+
json!({
94+
"id": model.to_string(),
95+
"type": "object",
96+
"owned_by": "candle-examples",
97+
})
98+
})
99+
.collect::<Vec<_>>();
100+
101+
Ok(
102+
Json(json!({
103+
"object": "list",
104+
"data": models,
105+
}))
106+
.into_response(),
107+
)
108+
}
109+
75110
#[allow(clippy::cast_precision_loss)]
76111
pub async fn transcribe_audio(
77112
State(state): State<Arc<AppState>>,
@@ -192,59 +227,77 @@ async fn extract_multipart_data(multipart: &mut Multipart) -> Result<(Vec<u8>, H
192227
Ok((audio, params))
193228
}
194229

195-
// Convert various audio formats to PCM
196-
#[allow(clippy::unused_async)]
230+
use symphonia::core::{codecs::CODEC_TYPE_NULL, conv::FromSample};
231+
232+
fn conv<T>(
233+
samples: &mut Vec<f32>,
234+
data: std::borrow::Cow<symphonia::core::audio::AudioBuffer<T>>,
235+
) where
236+
T: symphonia::core::sample::Sample,
237+
f32: symphonia::core::conv::FromSample<T>,
238+
{
239+
samples.extend(data.chan(0).iter().map(|v| f32::from_sample(*v)))
240+
}
241+
197242
async fn convert_audio_to_pcm(audio_data: &[u8]) -> Result<Vec<f32>> {
198243
let cursor = std::io::Cursor::new(audio_data.to_vec());
199-
let media_source = MediaSourceStream::new(Box::new(cursor), MediaSourceStreamOptions::default());
200244

201-
let mut hint = Hint::new();
202-
hint.mime_type("audio/wav"); // You might want to detect this automatically
245+
// Create the media source stream.
246+
let mss = symphonia::core::io::MediaSourceStream::new(Box::new(cursor), Default::default());
203247

204-
let meta_opts = MetadataOptions::default();
205-
let fmt_opts = FormatOptions::default();
248+
// Create a probe hint using the file's extension. [Optional]
249+
let hint = symphonia::core::probe::Hint::new();
206250

207-
let probed = get_probe().format(&hint, media_source, &fmt_opts, &meta_opts)?;
251+
// Use the default options for metadata and format readers.
252+
let meta_opts: symphonia::core::meta::MetadataOptions = Default::default();
253+
let fmt_opts: symphonia::core::formats::FormatOptions = Default::default();
208254

255+
// Probe the media source.
256+
let probed = symphonia::default::get_probe().format(&hint, mss, &fmt_opts, &meta_opts)?;
257+
// Get the instantiated format reader.
209258
let mut format = probed.format;
259+
260+
// Find the first audio track with a known (decodeable) codec.
210261
let track = format
211262
.tracks()
212263
.iter()
213-
.find(|t| t.codec_params.codec != symphonia::core::codecs::CODEC_TYPE_NULL)
214-
.ok_or_else(|| anyhow::anyhow!("No audio track found"))?;
264+
.find(|t| t.codec_params.codec != CODEC_TYPE_NULL)
265+
.expect("no supported audio tracks");
266+
267+
// Use the default options for the decoder.
268+
let dec_opts: DecoderOptions = Default::default();
269+
let mut dec_params = track.codec_params.clone();
270+
dec_params.max_frames_per_packet = Some(1); // Decode one frame at a time
215271

216-
let dec_opts = DecoderOptions::default();
217-
let mut decoder = symphonia::default::get_codecs().make(&track.codec_params, &dec_opts)?;
272+
// Create a decoder for the track.
273+
let mut decoder = symphonia::default::get_codecs()
274+
.make(&dec_params, &dec_opts)
275+
.expect("unsupported codec");
218276

219277
let track_id = track.id;
220278
let mut pcm_data = Vec::new();
221-
222-
// Decode the audio
279+
// The decode loop.
223280
while let Ok(packet) = format.next_packet() {
281+
// Consume any new metadata that has been read since the last packet.
282+
while !format.metadata().is_latest() {
283+
format.metadata().pop();
284+
}
285+
286+
// If the packet does not belong to the selected track, skip over it.
224287
if packet.track_id() != track_id {
225288
continue;
226289
}
227-
228290
match decoder.decode(&packet)? {
229-
AudioBufferRef::F32(buf) => {
230-
for &sample in buf.chan(0) {
231-
pcm_data.push(sample);
232-
}
233-
},
234-
AudioBufferRef::S16(buf) => {
235-
for &sample in buf.chan(0) {
236-
pcm_data.push(f32::from(sample) / f32::from(i16::MAX));
237-
}
238-
},
239-
AudioBufferRef::S32(buf) => {
240-
for &sample in buf.chan(0) {
241-
#[allow(clippy::cast_precision_loss)]
242-
pcm_data.push(sample as f32 / i32::MAX as f32);
243-
}
244-
},
245-
_ => {
246-
anyhow::bail!("Unsupported audio format");
247-
},
291+
AudioBufferRef::F32(buf) => pcm_data.extend(buf.chan(0)),
292+
AudioBufferRef::U8(data) => conv(&mut pcm_data, data),
293+
AudioBufferRef::U16(data) => conv(&mut pcm_data, data),
294+
AudioBufferRef::U24(data) => conv(&mut pcm_data, data),
295+
AudioBufferRef::U32(data) => conv(&mut pcm_data, data),
296+
AudioBufferRef::S8(data) => conv(&mut pcm_data, data),
297+
AudioBufferRef::S16(data) => conv(&mut pcm_data, data),
298+
AudioBufferRef::S24(data) => conv(&mut pcm_data, data),
299+
AudioBufferRef::S32(data) => conv(&mut pcm_data, data),
300+
AudioBufferRef::F64(data) => conv(&mut pcm_data, data),
248301
}
249302
}
250303

apps/whisper-api/src/whisper.rs

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
use std::fmt::Display;
2+
13
use anyhow::Result;
24
use byteorder::{ByteOrder, LittleEndian};
35
use candle_core::{Device, IndexOp, Tensor};
@@ -75,7 +77,61 @@ pub enum WhichWhisperModel {
7577
LiteWhisperLargeV3TurboFast,
7678
}
7779

80+
impl Display for WhichWhisperModel {
81+
fn fmt(
82+
&self,
83+
f: &mut std::fmt::Formatter<'_>,
84+
) -> std::fmt::Result {
85+
write!(
86+
f,
87+
"{}",
88+
match self {
89+
Self::Tiny => "tiny",
90+
Self::TinyEn => "tiny.en",
91+
Self::Base => "base",
92+
Self::BaseEn => "base.en",
93+
Self::Small => "small",
94+
Self::SmallEn => "small.en",
95+
Self::Medium => "medium",
96+
Self::MediumEn => "medium.en",
97+
Self::Large => "large",
98+
Self::LargeV2 => "large-v2",
99+
Self::LargeV3 => "large-v3",
100+
Self::LargeV3Turbo => "large-v3-turbo",
101+
Self::DistilMediumEn => "distil-medium.en",
102+
Self::DistilLargeV2 => "distil-large-v2",
103+
Self::LiteWhisperLargeV3Turbo => "lite-whisper-large-v3-turbo",
104+
Self::LiteWhisperLargeV3TurboAcc => "lite-whisper-large-v3-turbo-acc",
105+
Self::LiteWhisperLargeV3TurboFast => "lite-whisper-large-v3-turbo-fast",
106+
}
107+
)
108+
}
109+
}
110+
78111
impl WhichWhisperModel {
112+
pub fn from_str(s: &str) -> Option<Self> {
113+
match s.to_lowercase().as_str() {
114+
"tiny" => Some(Self::Tiny),
115+
"tiny.en" => Some(Self::TinyEn),
116+
"base" => Some(Self::Base),
117+
"base.en" => Some(Self::BaseEn),
118+
"small" => Some(Self::Small),
119+
"small.en" => Some(Self::SmallEn),
120+
"medium" => Some(Self::Medium),
121+
"medium.en" => Some(Self::MediumEn),
122+
"large" => Some(Self::Large),
123+
"large-v2" => Some(Self::LargeV2),
124+
"large-v3" => Some(Self::LargeV3),
125+
"large-v3-turbo" => Some(Self::LargeV3Turbo),
126+
"distil-medium.en" => Some(Self::DistilMediumEn),
127+
"distil-large-v2" => Some(Self::DistilLargeV2),
128+
"lite-whisper-large-v3-turbo" => Some(Self::LiteWhisperLargeV3Turbo),
129+
"lite-whisper-large-v3-turbo-acc" => Some(Self::LiteWhisperLargeV3TurboAcc),
130+
"lite-whisper-large-v3-turbo-fast" => Some(Self::LiteWhisperLargeV3TurboFast),
131+
_ => None,
132+
}
133+
}
134+
79135
pub const fn model_and_revision(self) -> (&'static str, &'static str) {
80136
match self {
81137
Self::Tiny => ("openai/whisper-tiny", "main"),

cspell.config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,14 @@ words:
2323
- probs
2424
- Resampler
2525
- rngs
26+
- rustfft
2627
- safetensors
2728
- Seedable
2829
- serde
2930
- silero
3031
- snac
3132
- softmax
33+
- stft
3234
- unsqueeze
3335
ignoreWords: []
3436
import: []

0 commit comments

Comments
 (0)