From 21999e17025800053e3310ac9c1bb239d6c14f2b Mon Sep 17 00:00:00 2001
From: Ryan Heise
Date: Sun, 31 Mar 2024 13:11:45 +1100
Subject: [PATCH] Omit space prefix in initial_prompt for spaceless languages.

---
 whisper/transcribe.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/whisper/transcribe.py b/whisper/transcribe.py
index 1c075a201..70d70aeb5 100644
--- a/whisper/transcribe.py
+++ b/whisper/transcribe.py
@@ -228,7 +228,8 @@ def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
     prompt_reset_since = 0
 
     if initial_prompt is not None:
-        initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
+        space = "" if language in {"zh", "ja", "th", "lo", "my", "yue"} else " "
+        initial_prompt_tokens = tokenizer.encode(space + initial_prompt.strip())
         all_tokens.extend(initial_prompt_tokens)
     else:
         initial_prompt_tokens = []