Skip to content

Commit

Permalink
Merge pull request #111 from kadirnar/update-demo
Browse files Browse the repository at this point in the history
🦸 Update notebook and gradio code
  • Loading branch information
kadirnar authored Jun 8, 2024
2 parents ca9ff26 + 1bd9f43 commit 39eb886
Show file tree
Hide file tree
Showing 2 changed files with 308 additions and 182 deletions.
177 changes: 154 additions & 23 deletions notebook/demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,17 @@
"nest_asyncio.apply()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9b6cf901",
"metadata": {},
"outputs": [],
"source": [
"!pip install whisperplus git+https://github.com/huggingface/transformers\n",
"!pip install flash-attn --no-build-isolation"
]
},
{
"cell_type": "markdown",
"id": "1d0b2e40",
Expand All @@ -50,16 +61,100 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import SpeechToTextPipeline, download_and_convert_to_mp3\n",
"from whisperplus import SpeechToTextPipeline, download_youtube_to_mp3\n",
"from transformers import BitsAndBytesConfig, HqqConfig\n",
"import torch\n",
"\n",
"url = \"https://www.youtube.com/watch?v=di3rHkEZuUw\"\n",
"audio_path = download_and_convert_to_mp3(url)\n",
"pipeline = SpeechToTextPipeline(model_id=\"openai/whisper-large-v3\")\n",
"transcript = pipeline(audio_path, \"openai/whisper-large-v3\", \"english\")\n",
"audio_path = download_youtube_to_mp3(url, output_dir=\"downloads\", filename=\"test\")\n",
"\n",
"hqq_config = HqqConfig(\n",
" nbits=4,\n",
" group_size=64,\n",
" quant_zero=False,\n",
" quant_scale=False,\n",
" axis=0,\n",
" offload_meta=False,\n",
") # axis=0 is used by default\n",
"\n",
"bnb_config = BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.bfloat16,\n",
" bnb_4bit_use_double_quant=True,\n",
")\n",
"\n",
"pipeline = SpeechToTextPipeline(\n",
" model_id=\"distil-whisper/distil-large-v3\",\n",
" quant_config=hqq_config,\n",
" flash_attention_2=True,\n",
")\n",
"\n",
"transcript = pipeline(\n",
" audio_path=audio_path,\n",
" chunk_length_s=30,\n",
" stride_length_s=5,\n",
" max_new_tokens=128,\n",
" batch_size=100,\n",
" language=\"english\",\n",
" return_timestamps=False,\n",
")\n",
"\n",
"print(transcript)"
]
},
{
"cell_type": "markdown",
"id": "8d6282f7",
"metadata": {},
"source": [
"### 🍎 Apple MLX"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "97b42087",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus.pipelines import mlx_whisper\n",
"from whisperplus import download_youtube_to_mp3\n",
"\n",
"url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
"audio_path = download_youtube_to_mp3(url)\n",
"\n",
"text = mlx_whisper.transcribe(\n",
" audio_path, path_or_hf_repo=\"mlx-community/whisper-large-v3-mlx\"\n",
")[\"text\"]\n",
"print(text)"
]
},
{
"cell_type": "markdown",
"id": "ca528ba7",
"metadata": {},
"source": [
"### 🍏 Lightning Mlx Whisper"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d99b8bb",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus.pipelines.lightning_whisper_mlx import LightningWhisperMLX\n",
"from whisperplus import download_youtube_to_mp3\n",
"\n",
"url = \"https://www.youtube.com/watch?v=1__CAdTJ5JU\"\n",
"audio_path = download_youtube_to_mp3(url)\n",
"\n",
"whisper = LightningWhisperMLX(model=\"distil-large-v3\", batch_size=12, quant=None)\n",
"output = whisper.transcribe(audio_path=audio_path)[\"text\"]"
]
},
{
"cell_type": "markdown",
"id": "1a99131f",
Expand All @@ -77,7 +172,7 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import TextSummarizationPipeline\n",
"from whisperplus.pipelines.summarization import TextSummarizationPipeline\n",
"\n",
"summarizer = TextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
"summary = summarizer.summarize(transcript)\n",
Expand All @@ -101,7 +196,7 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import LongTextSummarizationPipeline\n",
"from whisperplus.pipelines.long_text_summarization import LongTextSummarizationPipeline\n",
"\n",
"summarizer = LongTextSummarizationPipeline(model_id=\"facebook/bart-large-cnn\")\n",
"summary_text = summarizer.summarize(transcript)\n",
Expand All @@ -115,7 +210,25 @@
"source": [
"### 💬 Speaker Diarization\n",
"\n",
"In this section, we demonstrate the use of Speaker Diarization. This feature helps in distinguishing between different speakers in an audio clip."
"You must confirm the licensing permissions of these two models.\n",
"\n",
"- https://huggingface.co/pyannote/speaker-diarization-3.1\n",
"- https://huggingface.co/pyannote/segmentation-3.0"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0066de25",
"metadata": {},
"outputs": [],
"source": [
    "from huggingface_hub import notebook_login\n",
"\n",
"!pip install -r speaker_diarization.txt\n",
"!pip install -U \"huggingface_hub[cli]\"\n",
"\n",
"notebook_login()"
]
},
{
Expand All @@ -125,18 +238,15 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import (\n",
" ASRDiarizationPipeline,\n",
" download_and_convert_to_mp3,\n",
" format_speech_to_dialogue,\n",
")\n",
"from whisperplus.pipelines.whisper_diarize import ASRDiarizationPipeline\n",
"from whisperplus import download_youtube_to_mp3, format_speech_to_dialogue\n",
"\n",
"audio_path = download_and_convert_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
"audio_path = download_youtube_to_mp3(\"https://www.youtube.com/watch?v=mRB14sFHw2E\")\n",
"\n",
"device = \"cuda\" # cpu or mps\n",
"pipeline = ASRDiarizationPipeline.from_pretrained(\n",
" asr_model=\"openai/whisper-large-v3\",\n",
" diarizer_model=\"pyannote/speaker-diarization\",\n",
" diarizer_model=\"pyannote/speaker-diarization-3.1\",\n",
" use_auth_token=False,\n",
" chunk_length_s=30,\n",
" device=device,\n",
Expand All @@ -157,14 +267,24 @@
"This part covers the 'Chat with Video' feature using LanceDB. It demonstrates how to interact with a video transcript using a chat interface."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0714c378",
"metadata": {},
"outputs": [],
"source": [
"!pip install sentence-transformers ctransformers langchain"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c108aee",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import ChatWithVideo\n",
"from whisperplus.pipelines.chatbot import ChatWithVideo\n",
"\n",
"chat = ChatWithVideo(\n",
    "    input_file=\"transcript.txt\",\n",
Expand All @@ -175,7 +295,8 @@
")\n",
"\n",
"query = \"what is this video about ?\"\n",
"response = chat.run_query(query)"
"response = chat.run_query(query)\n",
"print(response)"
]
},
{
Expand All @@ -188,14 +309,24 @@
"This section demonstrates the 'Chat with Video' feature using AutoLLM. It enables querying a video's content through a chat interface, utilizing advanced language models."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38be611e",
"metadata": {},
"outputs": [],
"source": [
    "!pip install \"autollm>=0.1.9\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddfd23fd",
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import AutoLLMChatWithVideo\n",
"from whisperplus.pipelines.autollm_chatbot import AutoLLMChatWithVideo\n",
"\n",
"# service_context_params\n",
"system_prompt = \"\"\"\n",
Expand All @@ -215,15 +346,15 @@
"\"\"\"\n",
"\n",
"chat = AutoLLMChatWithVideo(\n",
" input_file=\"audio.mp3\",\n",
" openai_key=\"YOUR_OPENAI_KEY\",\n",
" huggingface_key=\"YOUR_HUGGINGFACE_KEY\",\n",
" input_file=\"input_dir\", # path of mp3 file\n",
" openai_key=\"YOUR_OPENAI_KEY\", # optional\n",
" huggingface_key=\"YOUR_HUGGINGFACE_KEY\", # optional\n",
" llm_model=\"gpt-3.5-turbo\",\n",
" llm_max_tokens=\"256\",\n",
" llm_temperature=\"0.1\",\n",
" system_prompt=system_prompt,\n",
" query_wrapper_prompt=query_wrapper_prompt,\n",
" embed_model=\"huggingface/BAAI/bge-large-zh\",\n",
" embed_model=\"huggingface/BAAI/bge-large-zh\", # \"text-embedding-ada-002\"\n",
")\n",
"\n",
"query = \"what is this video about ?\"\n",
Expand All @@ -236,7 +367,7 @@
"id": "223ed48e",
"metadata": {},
"source": [
"### 🎙️ Speech to Text\n",
"### 🎙️ Text to Speech\n",
"\n",
"Finally, this section covers converting text to speech using WhisperPlus, demonstrating how to generate spoken audio from text."
]
Expand All @@ -248,7 +379,7 @@
"metadata": {},
"outputs": [],
"source": [
"from whisperplus import TextToSpeechPipeline\n",
"from whisperplus.pipelines.text2speech import TextToSpeechPipeline\n",
"\n",
"tts = TextToSpeechPipeline(model_id=\"suno/bark\")\n",
"audio = tts(text=\"Hello World\", voice_preset=\"v2/en_speaker_6\")"
Expand Down
Loading

0 comments on commit 39eb886

Please sign in to comment.