Merged
9 changes: 5 additions & 4 deletions config/default.toml
@@ -94,10 +94,11 @@ on_demand_loading = false
# Enable on laptops with hybrid graphics to let dGPU sleep between transcriptions.
# gpu_isolation = true

# Context window optimization for short recordings (enabled by default)
# Speeds up transcription for clips under 22.5 seconds. Disable only if you
# experience accuracy issues with short recordings (rare).
# context_window_optimization = false
# Context window optimization for short recordings (disabled by default)
# Speeds up transcription for clips under 22.5 seconds, but some models
# (especially large-v3/turbo) can fall into repetition loops with it.
# Enable if you want faster transcription and don't experience issues.
# context_window_optimization = true

# --- Remote mode settings (used when mode = "remote") ---
# remote_endpoint = "http://192.168.1.100:8080" # Required
27 changes: 15 additions & 12 deletions docs/CONFIGURATION.md
@@ -539,14 +539,14 @@ gpu_isolation = true # Release GPU memory between transcriptions
### context_window_optimization

**Type:** Boolean
**Default:** `true`
**Default:** `false`
**Required:** No

Optimizes Whisper's context window size for short recordings. When enabled, clips under 22.5 seconds use a smaller context window proportional to their length, significantly speeding up transcription.
Optimizes Whisper's context window size for short recordings. When enabled, clips under 22.5 seconds use a smaller context window proportional to their length, speeding up transcription. Also sets `no_context=true` to prevent phrase repetition.

**Values:**
- `true` (default) - Use optimized context window for short clips. Faster transcription.
- `false` - Always use Whisper's full 30-second context window (1500 tokens).
- `false` (default) - Use Whisper's full 30-second context window (1500 tokens). Most compatible.
- `true` - Use optimized context window for short clips. Faster but may cause issues with some models.

**Performance impact:**

@@ -557,23 +557,26 @@

The optimization provides roughly 1.6-1.9x speedup for short recordings on both CPU and GPU.
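The sizing rule behind that speedup is the one this PR adds to `src/transcribe/whisper.rs`. A standalone sketch of it (a mirror of `calculate_audio_ctx` for illustration, not the shipped code):

```rust
/// Sketch of the audio_ctx sizing rule introduced by this PR:
/// max(duration_secs * 50 + 128, 384), rounded up to a multiple of 8,
/// applied only to clips of 22.5 seconds or less.
fn calculate_audio_ctx(duration_secs: f32) -> Option<i32> {
    const MIN_AUDIO_CTX: i32 = 384; // ~7.7s minimum context for stability
    if duration_secs <= 22.5 {
        let raw = (duration_secs * 50.0) as i32 + 128;
        let bounded = raw.max(MIN_AUDIO_CTX);
        // Round up to the next multiple of 8 for GPU backend alignment
        Some((bounded + 7) / 8 * 8)
    } else {
        None // longer clips keep Whisper's full 1500-token window
    }
}

fn main() {
    // 1s hits the 384 floor; 10s scales to 632; >22.5s is left untouched
    println!("{:?}", calculate_audio_ctx(1.0));  // Some(384)
    println!("{:?}", calculate_audio_ctx(10.0)); // Some(632)
    println!("{:?}", calculate_audio_ctx(30.0)); // None
}
```

Worked through by hand: a 10-second clip gives 10 × 50 + 128 = 628, above the 384 floor, and rounds up to 632, well under the 1500-token default.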

**When to disable (`false`):**
- You experience transcription quality issues with short clips (rare)
- Debugging transcription problems and want to rule out this optimization
- Testing or benchmarking against default Whisper behavior
**When to enable (`true`):**
- You want faster transcription for short clips
- Your model doesn't exhibit repetition issues (test before enabling)
- You're using smaller models (tiny, base, small), which are more stable

Most users should leave this enabled. The optimization has been tested extensively and should not affect transcription quality.
**When to keep disabled (`false`):**
- You use large-v3 or large-v3-turbo models (known repetition issues)
- You experience phrase repetition like "word word word"
- You want maximum compatibility across all models

**Example:**
```toml
[whisper]
model = "large-v3-turbo"
context_window_optimization = false # Use full context window (not recommended)
model = "base.en"
context_window_optimization = true # Enable for faster transcription
```

**CLI override:**
```bash
voxtype --no-whisper-context-optimization daemon
voxtype --whisper-context-optimization daemon
```

**Note:** This setting only applies when using the local whisper backend (`backend = "local"`). It has no effect with remote transcription.
27 changes: 22 additions & 5 deletions docs/TROUBLESHOOTING.md
@@ -251,21 +251,23 @@ model = "base.en" # For English
language = "en"
```

#### 4. Context window optimization (rare)
#### 4. Context window optimization

If you experience accuracy issues specifically with short recordings, try disabling context window optimization:
Context window optimization is disabled by default because it can cause phrase repetition with some models (especially large-v3 and large-v3-turbo).

If you want faster transcription and your model works well with it, you can enable it:

```toml
[whisper]
context_window_optimization = false
context_window_optimization = true
```

Or via command line:
```bash
voxtype --no-whisper-context-optimization daemon
voxtype --whisper-context-optimization daemon
```

This is rarely needed. The optimization speeds up transcription for short clips and should not affect quality. Only try this if other solutions don't help and the issue is specific to short recordings.
If you experience phrase repetition (e.g., "word word word"), make sure this setting is disabled (the default).
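What the flag actually changes can be sketched as follows, mirroring the gating logic in `src/transcribe/whisper.rs` from this PR. The helper name `effective_audio_ctx` is hypothetical, for illustration only:

```rust
// Mirror of the PR's sizing formula (see src/transcribe/whisper.rs)
fn calculate_audio_ctx(duration_secs: f32) -> Option<i32> {
    const MIN_AUDIO_CTX: i32 = 384;
    if duration_secs <= 22.5 {
        let raw = (duration_secs * 50.0) as i32 + 128;
        Some((raw.max(MIN_AUDIO_CTX) + 7) / 8 * 8)
    } else {
        None
    }
}

// Hypothetical helper: the context size Whisper ends up with in each case.
fn effective_audio_ctx(optimization_enabled: bool, duration_secs: f32) -> i32 {
    const WHISPER_DEFAULT_AUDIO_CTX: i32 = 1500; // full 30-second window
    if optimization_enabled {
        // With the flag on, long clips still fall back to the default
        calculate_audio_ctx(duration_secs).unwrap_or(WHISPER_DEFAULT_AUDIO_CTX)
    } else {
        // With the flag off (the default), nothing is set and Whisper
        // uses its full window for every clip
        WHISPER_DEFAULT_AUDIO_CTX
    }
}

fn main() {
    println!("{}", effective_audio_ctx(false, 1.0)); // 1500
    println!("{}", effective_audio_ctx(true, 1.0));  // 384
    println!("{}", effective_audio_ctx(true, 30.0)); // 1500
}
```

In other words, the disabled default is the safe path: every clip gets the full 1500-token window, which is why repetition-prone models behave.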

### Transcription includes "[BLANK_AUDIO]" or similar

@@ -285,6 +285,21 @@ This is rarely needed. The optimization speeds up transcription for short clips
2. Avoid recording ambient noise
3. Keep recordings short and speech-focused

### Phrase repetition (same words repeated multiple times)

**Cause:** Known issue with Whisper large-v3 models, especially when context window optimization is enabled.

**Example:** Saying "increase the limit" produces "increase the limit increase the limit increase the limit"

**Solutions:**
1. Ensure `context_window_optimization` is disabled (the default):
```toml
[whisper]
context_window_optimization = false
```
2. Try a different model; large-v3 and large-v3-turbo are the most affected

---

## Output Problems
20 changes: 11 additions & 9 deletions src/config.rs
@@ -378,7 +378,7 @@ fn default_on_demand_loading() -> bool {
}

fn default_context_window_optimization() -> bool {
true
false
}

fn default_max_loaded_models() -> usize {
@@ -1750,14 +1750,15 @@ mod tests {
}

#[test]
fn test_context_window_optimization_default_true() {
// Default config should have context_window_optimization enabled
fn test_context_window_optimization_default_false() {
// Default config should have context_window_optimization disabled
// (disabled by default due to repetition issues with some models)
let config = Config::default();
assert!(config.whisper.context_window_optimization);
assert!(!config.whisper.context_window_optimization);
}

#[test]
fn test_context_window_optimization_can_be_disabled() {
fn test_context_window_optimization_can_be_enabled() {
let toml_str = r#"
[hotkey]
key = "SCROLLLOCK"
@@ -1770,19 +1771,20 @@
[whisper]
model = "base.en"
language = "en"
context_window_optimization = false
context_window_optimization = true

[output]
mode = "type"
"#;

let config: Config = toml::from_str(toml_str).unwrap();
assert!(!config.whisper.context_window_optimization);
assert!(config.whisper.context_window_optimization);
}

#[test]
fn test_context_window_optimization_defaults_when_omitted() {
// When not specified in config, should default to true
// When not specified in config, should default to false
// (disabled by default due to repetition issues with some models)
let toml_str = r#"
[hotkey]
key = "SCROLLLOCK"
@@ -1801,7 +1803,7 @@
"#;

let config: Config = toml::from_str(toml_str).unwrap();
assert!(config.whisper.context_window_optimization);
assert!(!config.whisper.context_window_optimization);
}

#[test]
67 changes: 49 additions & 18 deletions src/transcribe/whisper.rs
@@ -195,12 +195,15 @@ impl Transcriber for WhisperTranscriber {

// Optimize context window for short clips
if self.context_window_optimization {
// Prevent hallucination/looping by not conditioning on previous text
// This is especially important for short clips where Whisper can repeat itself
params.set_no_context(true);

if let Some(audio_ctx) = calculate_audio_ctx(duration_secs) {
params.set_audio_ctx(audio_ctx);
tracing::info!(
"Audio context optimization: using audio_ctx={} for {:.2}s clip (formula: {:.2}s * 50 + 64)",
"Audio context optimization: using audio_ctx={} for {:.2}s clip",
audio_ctx,
duration_secs,
duration_secs
);
}
@@ -300,14 +303,25 @@ fn resolve_model_path(model: &str) -> Result<PathBuf, TranscribeError> {
}

/// Calculate audio_ctx parameter for short clips (≤22.5s).
/// Formula: duration_seconds * 50 + 64
/// Formula: max(duration_seconds * 50 + 128, 384), rounded up to multiple of 8
///
/// This optimization reduces transcription time for short recordings by
/// telling Whisper to use a smaller context window proportional to the
/// actual audio length, rather than the full 30-second batch window.
///
/// The conservative formula includes:
/// - Increased padding (128 instead of 64) for stability
/// - Minimum threshold of 384 (~7.7s context) to avoid instability with very short clips
/// - Alignment to multiple of 8 for GPU backend compatibility (Metal, Vulkan)
fn calculate_audio_ctx(duration_secs: f32) -> Option<i32> {
const MIN_AUDIO_CTX: i32 = 384; // ~7.7s minimum context

if duration_secs <= 22.5 {
Some((duration_secs * 50.0) as i32 + 64)
let raw_ctx = (duration_secs * 50.0) as i32 + 128;
let bounded_ctx = raw_ctx.max(MIN_AUDIO_CTX);
// Round up to next multiple of 8 for GPU backend alignment
let aligned_ctx = (bounded_ctx + 7) / 8 * 8;
Some(aligned_ctx)
} else {
None
}
@@ -354,17 +368,18 @@ mod tests {

#[test]
fn test_calculate_audio_ctx_short_clips() {
// Very short clip: 1s -> 1 * 50 + 64 = 114
assert_eq!(calculate_audio_ctx(1.0), Some(114));
// Very short clips use minimum threshold (384), aligned to 8
// 1s: max(50 + 128, 384) = 384, already aligned
assert_eq!(calculate_audio_ctx(1.0), Some(384));

// 5 second clip: 5 * 50 + 64 = 314
assert_eq!(calculate_audio_ctx(5.0), Some(314));
// 5s: max(250 + 128, 384) = 384, already aligned
assert_eq!(calculate_audio_ctx(5.0), Some(384));

// 10 second clip: 10 * 50 + 64 = 564
assert_eq!(calculate_audio_ctx(10.0), Some(564));
// 10s: max(500 + 128, 384) = 628, aligned to 632
assert_eq!(calculate_audio_ctx(10.0), Some(632));

// At threshold: 22.5 * 50 + 64 = 1189
assert_eq!(calculate_audio_ctx(22.5), Some(1189));
// At threshold: max(1125 + 128, 384) = 1253, aligned to 1256
assert_eq!(calculate_audio_ctx(22.5), Some(1256));
}

#[test]
@@ -386,25 +401,41 @@
// (the full 30-second context window).
//
// This test verifies the optimization logic by demonstrating:
// 1. When enabled: short clips get optimized audio_ctx (e.g., 114 for 1s)
// 1. When enabled: short clips get optimized audio_ctx (e.g., the 384 minimum for a 1s clip)
// 2. When disabled: Whisper's default 1500 is used (not set explicitly)

const WHISPER_DEFAULT_AUDIO_CTX: i32 = 1500;

// With optimization enabled, 1s clip would use audio_ctx=114
// With optimization enabled, 1s clip uses minimum threshold (384)
let optimized_ctx = calculate_audio_ctx(1.0);
assert_eq!(optimized_ctx, Some(114));
assert_eq!(optimized_ctx, Some(384));
assert!(optimized_ctx.unwrap() < WHISPER_DEFAULT_AUDIO_CTX);

// With optimization disabled, we don't call calculate_audio_ctx,
// so Whisper uses its default of 1500. This is handled in transcribe()
// by checking self.context_window_optimization before applying.

// Verify the optimization provides significant reduction
// Verify the optimization provides reduction (conservative formula still saves ~75%)
let ratio = WHISPER_DEFAULT_AUDIO_CTX as f32 / optimized_ctx.unwrap() as f32;
assert!(
ratio > 10.0,
"Optimization should reduce context by >10x for 1s clips"
ratio > 3.0,
"Optimization should reduce context by >3x for 1s clips"
);
}

#[test]
fn test_audio_ctx_alignment() {
// Verify all results are aligned to multiple of 8 for GPU compatibility
for duration in [1.0, 3.0, 5.0, 7.0, 10.0, 15.0, 20.0, 22.5] {
if let Some(ctx) = calculate_audio_ctx(duration) {
assert_eq!(
ctx % 8,
0,
"audio_ctx {} for {}s should be aligned to 8",
ctx,
duration
);
}
}
}
}