
Commit 2bd95be

feat(silero-vad-whisper-realtime-ort): new package

1 parent 6a45ff6
File tree

11 files changed (+1201, -2 lines)

Cargo.lock

Lines changed: 28 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ members = [
     "apps/silero-vad-realtime-minimum",
     "apps/silero-vad-whisper-realtime",
     "apps/silero-vad-lite-whisper-realtime-ort",
+    "apps/silero-vad-whisper-realtime-ort",
     "apps/whisper-api",
     "apps/whisper-realtime",
 ]

apps/silero-vad-lite-whisper-realtime-ort/src/whisper.rs

Lines changed: 7 additions & 2 deletions
@@ -6,7 +6,12 @@ use hf_hub::api::sync::Api;
 use lazy_static::lazy_static;
 use ndarray::{Array2, ArrayView3, Axis, s};
 use ort::{
-    execution_providers::{CPUExecutionProvider, CUDAExecutionProvider, CoreMLExecutionProvider, DirectMLExecutionProvider},
+    execution_providers::{
+        CPUExecutionProvider,
+        CUDAExecutionProvider,
+        // CoreMLExecutionProvider,
+        DirectMLExecutionProvider,
+    },
     session::{Session, SessionInputValue, builder::GraphOptimizationLevel},
     value::Value,
 };
@@ -270,7 +275,7 @@ impl LiteWhisper {
         .build(),
     // 2025-07-09 14:14:44.231707 [E:onnxruntime:, sequential_executor.cc:572 ExecuteKernel] Non-zero status code returned while running 3843266348432971732_CoreML_3843266348432971732_0 node. Name:'CoreMLExecutionProvider_3843266348432971732_CoreML_3843266348432971732_0_0' Status Message: Error executing model: Unable to compute the prediction using a neural network model. It can be an invalid input data or broken/unsupported model (error code: -1).
     // Error: Non-zero status code returned while running 3360655929800718712_CoreML_3360655929800718712_0 node. Name:'CoreMLExecutionProvider_3360655929800718712_CoreML_3360655929800718712_0_0' Status Message: Error executing model: Unable to compute the prediction using a neural network model. It can be an invalid input data or broken/unsupported model (error code: -1).
-    CoreMLExecutionProvider::default().build(),
+    // CoreMLExecutionProvider::default().build(),
     DirectMLExecutionProvider::default()
         .with_device_id(0)
         .build(),
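
The change above drops CoreML from the provider list after the runtime errors quoted in the comments. For context, ort registers execution providers in the order given and by default falls back to the next one (ultimately the CPU provider) when a provider fails to initialize. A minimal sketch of that pattern, assuming ort 2.0.0-rc.10 as pinned in the manifest below; the model path is hypothetical:

use ort::{
    execution_providers::{CPUExecutionProvider, CUDAExecutionProvider, DirectMLExecutionProvider},
    session::{Session, builder::GraphOptimizationLevel},
};

fn build_session() -> ort::Result<Session> {
    Session::builder()?
        .with_optimization_level(GraphOptimizationLevel::Level3)?
        // Providers are tried in registration order; ones that fail to
        // initialize (e.g. CUDA on a machine without an NVIDIA GPU) are skipped.
        .with_execution_providers([
            CUDAExecutionProvider::default().build(),
            DirectMLExecutionProvider::default().with_device_id(0).build(),
            CPUExecutionProvider::default().build(),
        ])?
        .commit_from_file("whisper_encoder.onnx") // hypothetical path
}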
apps/silero-vad-whisper-realtime-ort/Cargo.toml

Lines changed: 36 additions & 0 deletions
[package]
name = "silero-vad-whisper-realtime-ort"
version = "0.1.0"
edition = "2024"

[dependencies]
anyhow = "1.0.98"
byteorder = "1.5.0"
clap = { version = "4.5.38", features = ["derive"] }
cpal = "0.15.3"
hf-hub = "0.4.2"
rand = "0.9.1"
rubato = "0.16.2"
serde_json = "1.0.140"
symphonia = "0.5.4"
tracing-chrome = "0.7.2"
tracing-subscriber = "0.3.19"
tracing = "0.1.41"
tokio = "1.45.1"
crossbeam-channel = "0.5.15"
tokenizers = "0.21.2"
ndarray = "0.16.1"
serde = { version = "1.0.219", features = ["derive"] }
lazy_static = "1.5.0"
rustfft = "6.4.0"
ndarray-stats = "0.6.0"
log = "0.4.27"

[target.'cfg(target_os = "macos")'.dependencies]
ort = { version = "2.0.0-rc.10", features = ["ndarray", "coreml"] }

[target.'cfg(target_os = "windows")'.dependencies]
ort = { version = "2.0.0-rc.10", features = ["ndarray", "directml", "cuda"] }

[target.'cfg(target_os = "linux")'.dependencies]
ort = { version = "2.0.0-rc.10", features = ["ndarray", "cuda"] }
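
The per-target tables give each OS its own hardware backend: CoreML on macOS, DirectML and CUDA on Windows, CUDA on Linux, with the CPU provider always available as a fallback. One plausible way to pair those feature flags with runtime registration is cfg-gating; platform_providers is a hypothetical helper sketched here, not part of this commit:

use ort::execution_providers::{
    CPUExecutionProvider, CUDAExecutionProvider, CoreMLExecutionProvider,
    DirectMLExecutionProvider, ExecutionProviderDispatch,
};

// Hypothetical helper mirroring the per-OS `ort` features above.
fn platform_providers() -> Vec<ExecutionProviderDispatch> {
    let mut providers: Vec<ExecutionProviderDispatch> = Vec::new();
    #[cfg(target_os = "macos")]
    providers.push(CoreMLExecutionProvider::default().build());
    #[cfg(target_os = "windows")]
    {
        providers.push(CUDAExecutionProvider::default().build());
        providers.push(DirectMLExecutionProvider::default().build());
    }
    #[cfg(target_os = "linux")]
    providers.push(CUDAExecutionProvider::default().build());
    // CPU provider as the final fallback on every platform.
    providers.push(CPUExecutionProvider::default().build());
    providers
}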
Binary file (62.8 KB) not shown.

Binary file (101 KB) not shown.
Lines changed: 238 additions & 0 deletions
use std::time::Instant;

use anyhow::Result;
use cpal::{
    InputCallbackInfo,
    traits::{DeviceTrait, HostTrait, StreamTrait},
};
use rubato::{FastFixedIn, PolynomialDegree, Resampler};

pub struct AudioManager {
    _stream: cpal::Stream,
    audio_rx: crossbeam_channel::Receiver<Vec<f32>>,
    resampler: Option<FastFixedIn<f32>>,
    buffered_pcm: Vec<f32>,
}

impl AudioManager {
    pub fn new(
        device_name: Option<String>,
        target_sample_rate: u32,
    ) -> Result<Self> {
        let host = cpal::default_host();
        let device = match device_name {
            None => host.default_input_device(),
            Some(name) => host
                .input_devices()?
                .find(|d| d.name().map(|n| n == name).unwrap_or(false)),
        }
        .ok_or_else(|| {
            anyhow::anyhow!(
                "No input device found, current available devices: {:?}",
                host.input_devices()
                    .unwrap()
                    .map(|d| d.name().unwrap_or_else(|_| "Unnamed Device".to_string()))
                    .collect::<Vec<String>>()
                    .join(", ")
            )
        })?;

        println!("Using audio input device: {}", device.name()?);

        let config = device.default_input_config()?;
        let channel_count = config.channels() as usize;
        let device_sample_rate = config.sample_rate().0;

        println!("Device sample rate: {device_sample_rate}Hz, Target: {target_sample_rate}Hz");

        let (tx, rx) = crossbeam_channel::unbounded();

        let stream = device.build_input_stream(
            &config.into(),
            move |data: &[f32], _: &InputCallbackInfo| {
                // Extract mono audio (first channel only)
                let mono_data = data
                    .iter()
                    .step_by(channel_count)
                    .copied()
                    .collect::<Vec<f32>>();

                if !mono_data.is_empty() {
                    let _ = tx.send(mono_data);
                }
            },
            |err| eprintln!("Audio stream error: {err}"),
            None,
        )?;

        stream.play()?;

        let resampler = if device_sample_rate == target_sample_rate {
            None
        } else {
            let resample_ratio = f64::from(target_sample_rate) / f64::from(device_sample_rate);

            Some(FastFixedIn::new(
                resample_ratio,
                10.0, // max_resample_ratio_relative
                PolynomialDegree::Septic,
                1024, // chunk_size
                1,    // channels
            )?)
        };

        Ok(Self {
            _stream: stream,
            audio_rx: rx,
            resampler,
            buffered_pcm: Vec::new(),
        })
    }

    #[allow(clippy::future_not_send, clippy::unused_async)]
    pub async fn receive_audio(&mut self) -> Result<Vec<f32>> {
        let chunk = self.audio_rx.recv()?;

        if let Some(ref mut resampler) = self.resampler {
            // Add the new raw audio to our internal buffer
            self.buffered_pcm.extend_from_slice(&chunk);

            let chunk_size = 1024; // Must match the resampler's chunk_size above
            let mut resampled_audio = Vec::new();

            // Process all full chunks that are available in the buffer
            while self.buffered_pcm.len() >= chunk_size {
                // Drain the chunk from the buffer, which removes it and returns it
                let chunk_to_process = self
                    .buffered_pcm
                    .drain(..chunk_size)
                    .collect::<Vec<_>>();

                // The resampler expects a slice of slices, e.g., &[&[f32]]
                let resampled = resampler.process(&[&chunk_to_process], None)?;
                resampled_audio.extend_from_slice(&resampled[0]);
            }

            // Any remaining samples in self.buffered_pcm will be carried over
            // and processed with the next incoming audio chunk.

            Ok(resampled_audio)
        } else {
            // No resampling needed, return the chunk directly
            Ok(chunk)
        }
    }
}

pub struct AudioBuffer {
    buffer: Vec<f32>,
    max_duration_samples: usize,
    min_speech_duration_samples: usize,
    min_silence_duration_samples: usize,
    is_recording: bool,
    silence_start: Option<Instant>,
    speech_start: Option<Instant>,
    samples_since_speech_start: usize,
    samples_since_silence_start: usize,
    sample_rate: usize,
}

impl AudioBuffer {
    pub const fn new(
        max_duration_ms: u64,
        min_speech_duration_ms: u64,
        min_silence_duration_ms: u64,
        sample_rate: u32,
    ) -> Self {
        let sample_rate = sample_rate as usize;
        Self {
            buffer: Vec::new(),
            max_duration_samples: (max_duration_ms * sample_rate as u64 / 1000) as usize,
            min_speech_duration_samples: (min_speech_duration_ms * sample_rate as u64 / 1000) as usize,
            min_silence_duration_samples: (min_silence_duration_ms * sample_rate as u64 / 1000) as usize,
            is_recording: false,
            silence_start: None,
            speech_start: None,
            samples_since_speech_start: 0,
            samples_since_silence_start: 0,
            sample_rate,
        }
    }

    pub fn add_chunk(
        &mut self,
        chunk: &[f32],
        is_speech: bool,
    ) -> Option<Vec<f32>> {
        if is_speech {
            #[allow(clippy::if_not_else)]
            if !self.is_recording {
                if self.speech_start.is_none() {
                    self.speech_start = Some(Instant::now());
                    self.samples_since_speech_start = 0;
                }

                self.samples_since_speech_start += chunk.len();

                if self.samples_since_speech_start >= self.min_speech_duration_samples {
                    self.is_recording = true;
                    self.silence_start = None;
                    self.samples_since_silence_start = 0;
                    println!("🚀 Started recording");
                }
            } else {
                // Reset silence tracking
                self.silence_start = None;
                self.samples_since_silence_start = 0;
            }
        } else {
            // Reset speech tracking
            self.speech_start = None;
            self.samples_since_speech_start = 0;

            if self.is_recording {
                if self.silence_start.is_none() {
                    self.silence_start = Some(Instant::now());
                    self.samples_since_silence_start = 0;
                }

                self.samples_since_silence_start += chunk.len();

                if self.samples_since_silence_start >= self.min_silence_duration_samples {
                    // End of speech detected
                    if !self.buffer.is_empty() {
                        let result = self.buffer.clone();
                        self.reset();
                        #[allow(clippy::cast_precision_loss)]
                        let duration_secs = result.len() as f32 / self.sample_rate as f32;
                        println!("🔇 Stopped recording, {duration_secs:.2}s");
                        return Some(result);
                    }

                    self.reset();
                }
            }
        }

        if self.is_recording {
            self.buffer.extend_from_slice(chunk);

            // Check if buffer exceeds max duration
            if self.buffer.len() >= self.max_duration_samples {
                let result = self.buffer.clone();
                self.reset();
                println!("⏰ Max duration reached, {} samples", result.len());
                return Some(result);
            }
        }

        None
    }

    fn reset(&mut self) {
        self.buffer.clear();
        self.is_recording = false;
        self.silence_start = None;
        self.speech_start = None;
        self.samples_since_speech_start = 0;
        self.samples_since_silence_start = 0;
    }
}
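
Taken together, AudioManager produces mono chunks at the target rate and AudioBuffer turns per-chunk VAD decisions into complete utterances, starting after a minimum run of speech and stopping after a minimum run of silence. A hedged sketch of how the two might be wired up in this package's main loop; vad_predict and transcribe are stand-ins for the Silero VAD and Whisper ONNX sessions, and the threshold values are illustrative, not taken from the commit (assumes tokio with the "macros" and "rt-multi-thread" features):

const SAMPLE_RATE: u32 = 16_000; // Silero VAD and Whisper both expect 16 kHz

// Hypothetical stand-ins for the Silero VAD and Whisper sessions.
fn vad_predict(_chunk: &[f32]) -> anyhow::Result<bool> { Ok(true) }
fn transcribe(_samples: &[f32]) -> anyhow::Result<String> { Ok(String::new()) }

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let mut audio = AudioManager::new(None, SAMPLE_RATE)?;
    // Illustrative thresholds: 30 s max utterance, 250 ms of speech to start,
    // 1 s of silence to stop. At 16 kHz those become 480_000, 4_000 and
    // 16_000 samples respectively.
    let mut buffer = AudioBuffer::new(30_000, 250, 1_000, SAMPLE_RATE);

    loop {
        let chunk = audio.receive_audio().await?;
        if chunk.is_empty() {
            continue; // resampler may still be accumulating a full 1024-sample chunk
        }
        let is_speech = vad_predict(&chunk)?;
        if let Some(utterance) = buffer.add_chunk(&chunk, is_speech) {
            let text = transcribe(&utterance)?;
            println!("📝 {text}");
        }
    }
}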
