From 13497f0dc3af7ef9c4bfc165663ecf6fac1be09f Mon Sep 17 00:00:00 2001
From: kadirnar
Date: Fri, 3 May 2024 00:59:27 +0300
Subject: [PATCH] =?UTF-8?q?=F0=9F=8C=9E=20Add=20more=20parameter=20support?=
 =?UTF-8?q?=20for=20Whisper=20pipeline?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt                 |  2 ++
 whisperplus/pipelines/whisper.py | 15 +++++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index fafb943..e4379f1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,5 @@ accelerate
 pre-commit==3.4.0
 autollm==0.1.9
 speechbrain==0.5.16
+bitsandbytes
+flash-attn --no-build-isolation
diff --git a/whisperplus/pipelines/whisper.py b/whisperplus/pipelines/whisper.py
index 6f5449b..c197704 100644
--- a/whisperplus/pipelines/whisper.py
+++ b/whisperplus/pipelines/whisper.py
@@ -48,7 +48,14 @@ def load_model(self, model_id: str = "openai/whisper-large-v3"):
         self.processor = processor
         self.model = model
 
-    def __call__(self, audio_path: str, language: str = "turkish"):
+    def __call__(
+            self,
+            chunk_length_s: int = 30,
+            stride_length_s: int = 5,
+            audio_path: str = "test.mp3",
+            max_new_tokens: int = 128,
+            batch_size: int = 100,
+            language: str = "turkish"):
         """
         Converts audio to text using the pre-trained speech recognition model.
 
@@ -61,9 +68,9 @@ def __call__(self, audio_path: str, language: str = "turkish"):
         pipe = pipeline(
             "automatic-speech-recognition",
             model=self.model,
-            chunk_length_s=30,
-            stride_length_s=5,
-            max_new_tokens=128,
+            chunk_length_s=chunk_length_s,
+            stride_length_s=stride_length_s,
+            max_new_tokens=max_new_tokens,
             batch_size=100,
             device_map="auto",
             return_timestamps=True,