From 2aafd81de3a4101044fd95a5e9de9fd37b65988b Mon Sep 17 00:00:00 2001
From: Ruhollah Majdoddin <r.majdodin@gmail.com>
Date: Mon, 14 Oct 2024 19:35:29 +0200
Subject: [PATCH] Repeat the audio_features tensor, like the tokens tensor, by
 the group size for beam search or best-of-n sampling, to avoid a size
 mismatch error in attention during batch inference

---
 whisper/decoding.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/whisper/decoding.py b/whisper/decoding.py
index 49485d009..c2d07fa75 100644
--- a/whisper/decoding.py
+++ b/whisper/decoding.py
@@ -732,6 +732,7 @@ def run(self, mel: Tensor) -> List[DecodingResult]:
 
         # repeat text tensors by the group size, for beam search or best-of-n sampling
         tokens = tokens.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
+        audio_features = audio_features.repeat_interleave(self.n_group, dim=0).to(audio_features.device)
 
         # call the main sampling loop
         tokens, sum_logprobs, no_speech_probs = self._main_loop(audio_features, tokens)