
Commit 2bd95be

feat(silero-vad-whisper-realtime-ort): new package

1 parent 6a45ff6
File tree

11 files changed (+1201, -2 lines)

Cargo.lock

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ members = [
     "apps/silero-vad-realtime-minimum",
     "apps/silero-vad-whisper-realtime",
     "apps/silero-vad-lite-whisper-realtime-ort",
+    "apps/silero-vad-whisper-realtime-ort",
     "apps/whisper-api",
     "apps/whisper-realtime",
 ]

apps/silero-vad-lite-whisper-realtime-ort/src/whisper.rs

Lines changed: 7 additions & 2 deletions
@@ -6,7 +6,12 @@ use hf_hub::api::sync::Api;
 use lazy_static::lazy_static;
 use ndarray::{Array2, ArrayView3, Axis, s};
 use ort::{
-    execution_providers::{CPUExecutionProvider, CUDAExecutionProvider, CoreMLExecutionProvider, DirectMLExecutionProvider},
+    execution_providers::{
+        CPUExecutionProvider,
+        CUDAExecutionProvider,
+        // CoreMLExecutionProvider,
+        DirectMLExecutionProvider,
+    },
     session::{Session, SessionInputValue, builder::GraphOptimizationLevel},
     value::Value,
 };
@@ -270,7 +275,7 @@ impl LiteWhisper {
         .build(),
     // 2025-07-09 14:14:44.231707 [E:onnxruntime:, sequential_executor.cc:572 ExecuteKernel] Non-zero status code returned while running 3843266348432971732_CoreML_3843266348432971732_0 node. Name:'CoreMLExecutionProvider_3843266348432971732_CoreML_3843266348432971732_0_0' Status Message: Error executing model: Unable to compute the prediction using a neural network model. It can be an invalid input data or broken/unsupported model (error code: -1).
     // Error: Non-zero status code returned while running 3360655929800718712_CoreML_3360655929800718712_0 node. Name:'CoreMLExecutionProvider_3360655929800718712_CoreML_3360655929800718712_0_0' Status Message: Error executing model: Unable to compute the prediction using a neural network model. It can be an invalid input data or broken/unsupported model (error code: -1).
-    CoreMLExecutionProvider::default().build(),
+    // CoreMLExecutionProvider::default().build(),
     DirectMLExecutionProvider::default()
         .with_device_id(0)
         .build(),
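
The change above drops CoreML from the provider list after the runtime errors quoted in the comments. For context, ort registers execution providers in the order given and by default falls back to the next one (ultimately the CPU provider) when a provider fails to initialize. A minimal sketch of that pattern, assuming ort 2.0.0-rc.10 as pinned in the manifest below; the model path is hypothetical:

use ort::{
    execution_providers::{CPUExecutionProvider, CUDAExecutionProvider, DirectMLExecutionProvider},
    session::{Session, builder::GraphOptimizationLevel},
};

fn build_session() -> ort::Result<Session> {
    Session::builder()?
        .with_optimization_level(GraphOptimizationLevel::Level3)?
        // Providers are tried in registration order; ones that fail to
        // initialize (e.g. CUDA on a machine without an NVIDIA GPU) are skipped.
        .with_execution_providers([
            CUDAExecutionProvider::default().build(),
            DirectMLExecutionProvider::default().with_device_id(0).build(),
            CPUExecutionProvider::default().build(),
        ])?
        .commit_from_file("whisper_encoder.onnx") // hypothetical path
}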
apps/silero-vad-whisper-realtime-ort/Cargo.toml

Lines changed: 36 additions & 0 deletions
[package]
name = "silero-vad-whisper-realtime-ort"
version = "0.1.0"
edition = "2024"

[dependencies]
anyhow = "1.0.98"
byteorder = "1.5.0"
clap = { version = "4.5.38", features = ["derive"] }
cpal = "0.15.3"
hf-hub = "0.4.2"
rand = "0.9.1"
rubato = "0.16.2"
serde_json = "1.0.140"
symphonia = "0.5.4"
tracing-chrome = "0.7.2"
tracing-subscriber = "0.3.19"
tracing = "0.1.41"
tokio = "1.45.1"
crossbeam-channel = "0.5.15"
tokenizers = "0.21.2"
ndarray = "0.16.1"
serde = { version = "1.0.219", features = ["derive"] }
lazy_static = "1.5.0"
rustfft = "6.4.0"
ndarray-stats = "0.6.0"
log = "0.4.27"

[target.'cfg(target_os = "macos")'.dependencies]
ort = { version = "2.0.0-rc.10", features = ["ndarray", "coreml"] }

[target.'cfg(target_os = "windows")'.dependencies]
ort = { version = "2.0.0-rc.10", features = ["ndarray", "directml", "cuda"] }

[target.'cfg(target_os = "linux")'.dependencies]
ort = { version = "2.0.0-rc.10", features = ["ndarray", "cuda"] }
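
The per-target tables give each OS its own hardware backend: CoreML on macOS, DirectML and CUDA on Windows, CUDA on Linux, with the CPU provider always available as a fallback. One plausible way to pair those feature flags with runtime registration is cfg-gating; platform_providers is a hypothetical helper sketched here, not part of this commit:

use ort::execution_providers::{
    CPUExecutionProvider, CUDAExecutionProvider, CoreMLExecutionProvider,
    DirectMLExecutionProvider, ExecutionProviderDispatch,
};

// Hypothetical helper mirroring the per-OS `ort` features above.
fn platform_providers() -> Vec<ExecutionProviderDispatch> {
    let mut providers: Vec<ExecutionProviderDispatch> = Vec::new();
    #[cfg(target_os = "macos")]
    providers.push(CoreMLExecutionProvider::default().build());
    #[cfg(target_os = "windows")]
    {
        providers.push(CUDAExecutionProvider::default().build());
        providers.push(DirectMLExecutionProvider::default().build());
    }
    #[cfg(target_os = "linux")]
    providers.push(CUDAExecutionProvider::default().build());
    // CPU provider as the final fallback on every platform.
    providers.push(CPUExecutionProvider::default().build());
    providers
}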
Binary file (62.8 KB) not shown.

Binary file (101 KB) not shown.
Lines changed: 238 additions & 0 deletions
use std::time::Instant;

use anyhow::Result;
use cpal::{
    InputCallbackInfo,
    traits::{DeviceTrait, HostTrait, StreamTrait},
};
use rubato::{FastFixedIn, PolynomialDegree, Resampler};

pub struct AudioManager {
    _stream: cpal::Stream,
    audio_rx: crossbeam_channel::Receiver<Vec<f32>>,
    resampler: Option<FastFixedIn<f32>>,
    buffered_pcm: Vec<f32>,
}

impl AudioManager {
    pub fn new(
        device_name: Option<String>,
        target_sample_rate: u32,
    ) -> Result<Self> {
        let host = cpal::default_host();
        let device = match device_name {
            None => host.default_input_device(),
            Some(name) => host
                .input_devices()?
                .find(|d| d.name().map(|n| n == name).unwrap_or(false)),
        }
        .ok_or_else(|| {
            anyhow::anyhow!(
                "No input device found, current available devices: {:?}",
                host.input_devices()
                    .unwrap()
                    .map(|d| d.name().unwrap_or_else(|_| "Unnamed Device".to_string()))
                    .collect::<Vec<String>>()
                    .join(", ")
            )
        })?;

        println!("Using audio input device: {}", device.name()?);

        let config = device.default_input_config()?;
        let channel_count = config.channels() as usize;
        let device_sample_rate = config.sample_rate().0;

        println!("Device sample rate: {device_sample_rate}Hz, Target: {target_sample_rate}Hz");

        let (tx, rx) = crossbeam_channel::unbounded();

        let stream = device.build_input_stream(
            &config.into(),
            move |data: &[f32], _: &InputCallbackInfo| {
                // Extract mono audio (first channel only)
                let mono_data = data
                    .iter()
                    .step_by(channel_count)
                    .copied()
                    .collect::<Vec<f32>>();

                if !mono_data.is_empty() {
                    let _ = tx.send(mono_data);
                }
            },
            |err| eprintln!("Audio stream error: {err}"),
            None,
        )?;

        stream.play()?;

        let resampler = if device_sample_rate == target_sample_rate {
            None
        } else {
            let resample_ratio = f64::from(target_sample_rate) / f64::from(device_sample_rate);

            Some(FastFixedIn::new(
                resample_ratio,
                10.0, // max_resample_ratio_relative
                PolynomialDegree::Septic,
                1024, // chunk_size
                1,    // channels
            )?)
        };

        Ok(Self {
            _stream: stream,
            audio_rx: rx,
            resampler,
            buffered_pcm: Vec::new(),
        })
    }

    #[allow(clippy::future_not_send, clippy::unused_async)]
    pub async fn receive_audio(&mut self) -> Result<Vec<f32>> {
        let chunk = self.audio_rx.recv()?;

        if let Some(ref mut resampler) = self.resampler {
            // Add the new raw audio to our internal buffer
            self.buffered_pcm.extend_from_slice(&chunk);

            let chunk_size = 1024; // Must match the resampler's chunk_size above
            let mut resampled_audio = Vec::new();

            // Process all full chunks that are available in the buffer
            while self.buffered_pcm.len() >= chunk_size {
                // Drain the chunk from the buffer, which removes it and returns it
                let chunk_to_process = self
                    .buffered_pcm
                    .drain(..chunk_size)
                    .collect::<Vec<_>>();

                // The resampler expects a slice of slices, e.g., &[&[f32]]
                let resampled = resampler.process(&[&chunk_to_process], None)?;
                resampled_audio.extend_from_slice(&resampled[0]);
            }

            // Any remaining samples in self.buffered_pcm will be carried over
            // and processed with the next incoming audio chunk.

            Ok(resampled_audio)
        } else {
            // No resampling needed, return the chunk directly
            Ok(chunk)
        }
    }
}

pub struct AudioBuffer {
    buffer: Vec<f32>,
    max_duration_samples: usize,
    min_speech_duration_samples: usize,
    min_silence_duration_samples: usize,
    is_recording: bool,
    silence_start: Option<Instant>,
    speech_start: Option<Instant>,
    samples_since_speech_start: usize,
    samples_since_silence_start: usize,
    sample_rate: usize,
}

impl AudioBuffer {
    pub const fn new(
        max_duration_ms: u64,
        min_speech_duration_ms: u64,
        min_silence_duration_ms: u64,
        sample_rate: u32,
    ) -> Self {
        let sample_rate = sample_rate as usize;
        Self {
            buffer: Vec::new(),
            max_duration_samples: (max_duration_ms * sample_rate as u64 / 1000) as usize,
            min_speech_duration_samples: (min_speech_duration_ms * sample_rate as u64 / 1000) as usize,
            min_silence_duration_samples: (min_silence_duration_ms * sample_rate as u64 / 1000) as usize,
            is_recording: false,
            silence_start: None,
            speech_start: None,
            samples_since_speech_start: 0,
            samples_since_silence_start: 0,
            sample_rate,
        }
    }

    pub fn add_chunk(
        &mut self,
        chunk: &[f32],
        is_speech: bool,
    ) -> Option<Vec<f32>> {
        if is_speech {
            #[allow(clippy::if_not_else)]
            if !self.is_recording {
                if self.speech_start.is_none() {
                    self.speech_start = Some(Instant::now());
                    self.samples_since_speech_start = 0;
                }

                self.samples_since_speech_start += chunk.len();

                if self.samples_since_speech_start >= self.min_speech_duration_samples {
                    self.is_recording = true;
                    self.silence_start = None;
                    self.samples_since_silence_start = 0;
                    println!("🚀 Started recording");
                }
            } else {
                // Reset silence tracking
                self.silence_start = None;
                self.samples_since_silence_start = 0;
            }
        } else {
            // Reset speech tracking
            self.speech_start = None;
            self.samples_since_speech_start = 0;

            if self.is_recording {
                if self.silence_start.is_none() {
                    self.silence_start = Some(Instant::now());
                    self.samples_since_silence_start = 0;
                }

                self.samples_since_silence_start += chunk.len();

                if self.samples_since_silence_start >= self.min_silence_duration_samples {
                    // End of speech detected
                    if !self.buffer.is_empty() {
                        let result = self.buffer.clone();
                        self.reset();
                        #[allow(clippy::cast_precision_loss)]
                        let duration_secs = result.len() as f32 / self.sample_rate as f32;
                        println!("🔇 Stopped recording, {duration_secs:.2}s");
                        return Some(result);
                    }

                    self.reset();
                }
            }
        }

        if self.is_recording {
            self.buffer.extend_from_slice(chunk);

            // Check if buffer exceeds max duration
            if self.buffer.len() >= self.max_duration_samples {
                let result = self.buffer.clone();
                self.reset();
                println!("⏰ Max duration reached, {} samples", result.len());
                return Some(result);
            }
        }

        None
    }

    fn reset(&mut self) {
        self.buffer.clear();
        self.is_recording = false;
        self.silence_start = None;
        self.speech_start = None;
        self.samples_since_speech_start = 0;
        self.samples_since_silence_start = 0;
    }
}
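
Taken together, AudioManager produces mono chunks at the target rate and AudioBuffer turns per-chunk VAD decisions into complete utterances, starting after a minimum run of speech and stopping after a minimum run of silence. A hedged sketch of how the two might be wired up in this package's main loop; vad_predict and transcribe are stand-ins for the Silero VAD and Whisper ONNX sessions, and the threshold values are illustrative, not taken from the commit (assumes tokio with the "macros" and "rt-multi-thread" features):

const SAMPLE_RATE: u32 = 16_000; // Silero VAD and Whisper both expect 16 kHz

// Hypothetical stand-ins for the Silero VAD and Whisper sessions.
fn vad_predict(_chunk: &[f32]) -> anyhow::Result<bool> { Ok(true) }
fn transcribe(_samples: &[f32]) -> anyhow::Result<String> { Ok(String::new()) }

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut audio = AudioManager::new(None, SAMPLE_RATE)?;
    // Illustrative thresholds: 30 s max utterance, 250 ms of speech to start,
    // 1 s of silence to stop. At 16 kHz those become 480_000, 4_000 and
    // 16_000 samples respectively.
    let mut buffer = AudioBuffer::new(30_000, 250, 1_000, SAMPLE_RATE);

    loop {
        let chunk = audio.receive_audio().await?;
        if chunk.is_empty() {
            continue; // resampler may still be accumulating a full 1024-sample chunk
        }
        let is_speech = vad_predict(&chunk)?;
        if let Some(utterance) = buffer.add_chunk(&chunk, is_speech) {
            let text = transcribe(&utterance)?;
            println!("📝 {text}");
        }
    }
}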
