diff --git a/.gitignore b/.gitignore
index ee9851f..8a00b44 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,7 @@ dist
 .DS_Store
 *.egg-info
 yt_whisper/__pycache__
+*.vtt
+.venv
+models
+build
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..1d8e6ad
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "yt_whisper"
+version = "1.0.0"
+description = "Generate subtitles for YouTube videos using Whisper"
+authors = [
+    { name = "Miguel Piedrafita", email = "your.email@example.com" }
+]
+dependencies = [
+    "yt-dlp",
+    "openai-whisper @ git+https://github.com/openai/whisper.git@main",
+    "openvino>=2024.1.0",
+    "nncf>=2.10.0",
+    "python-ffmpeg<=1.0.16",
+    "moviepy",
+    "transformers",
+    "onnx",
+    "optimum-intel @ git+https://github.com/huggingface/optimum-intel.git",
+    "peft==0.6.2",
+    "torch>=2.1,<2.4",
+    "torchvision<0.19.0",
+    "soundfile",
+    "librosa",
+    "jiwer",
+    "pytube @ git+https://github.com/garywu007/pytube.git"
+]
+
+[project.scripts]
+yt_whisper = "yt_whisper.cli:main"
diff --git a/setup.py b/setup.py
deleted file mode 100644
index be897ae..0000000
--- a/setup.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import os
-
-import pkg_resources
-from setuptools import setup, find_packages
-
-setup(
-    version="1.0",
-    name="yt_whisper",
-    packages=find_packages(),
-    py_modules=["yt_whisper"],
-    author="Miguel Piedrafita",
-    install_requires=[
-        'yt-dlp',
-        'whisper @ git+https://github.com/openai/whisper.git@main#egg=whisper'
-    ],
-    description="Generate subtitles for YouTube videos using Whisper",
-    entry_points={
-        'console_scripts': ['yt_whisper=yt_whisper.cli:main'],
-    },
-    include_package_data=True,
-)
diff --git a/yt_whisper/cli.py b/yt_whisper/cli.py
index 77763e9..419322b 100644
--- a/yt_whisper/cli.py
+++ b/yt_whisper/cli.py
@@ -4,8 +4,11 @@
 import argparse
 import warnings
 import yt_dlp
-from .utils import slugify, str2bool, write_srt, write_vtt
+from .utils import slugify, str2bool, write_srt, write_vtt, write_srt_openvino, write_vtt_openvino
 import tempfile
+from optimum.intel.openvino import OVModelForSpeechSeq2Seq
+from transformers import AutoProcessor, pipeline
+import subprocess


 def main():
@@ -25,40 +28,73 @@
                         "transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
     parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
                         help="language spoken in the audio, skip to perform language detection")
-
+    parser.add_argument("--openvino", action='store_true',
+                        help="Whether to use the OpenVINO pipeline for inference.")
     parser.add_argument("--break-lines", type=int, default=0,
                         help="Whether to break lines into a bottom-heavy pyramid shape if line length exceeds N characters. 0 disables line breaking.")

     args = parser.parse_args().__dict__
+    is_openvino = args.pop("openvino")
     model_name: str = args.pop("model")
     output_dir: str = args.pop("output_dir")
     subtitles_format: str = args.pop("format")
     os.makedirs(output_dir, exist_ok=True)
+    os.makedirs("models", exist_ok=True)
+
+    model_dir = os.path.join("models", model_name)

     if model_name.endswith(".en"):
         warnings.warn(
             f"{model_name} is an English-only model, forcing English detection.")
         args["language"] = "en"

-    model = whisper.load_model(model_name)
+    if is_openvino:
+        if not os.path.exists(model_dir):
+            bash_command = ["optimum-cli", "export", "openvino", "-m", f"openai/whisper-{model_name}", model_dir, "--weight-format", "fp16"]
+            subprocess.run(bash_command, check=True)
+            print(f"Model downloaded and converted to OpenVINO Intermediate Representation (IR) successfully.")
+        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device="cpu")
+        processor = AutoProcessor.from_pretrained(model_dir)
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model=ov_model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            generate_kwargs={"task": "transcribe"},
+            return_timestamps=True
+        )
+    else:
+        model = whisper.load_model(model_name)
+
     audios = get_audio(args.pop("video"))
     break_lines = args.pop("break_lines")

     for title, audio_path in audios.items():
         warnings.filterwarnings("ignore")
-        result = model.transcribe(audio_path, **args)
-        warnings.filterwarnings("default")
+        if is_openvino:
+            result = pipe(audio_path)
+            transcript = result["chunks"]
+        else:
+            result = model.transcribe(audio_path, **args)
+            warnings.filterwarnings("default")
+            transcript = result["segments"]

         if (subtitles_format == 'vtt'):
             vtt_path = os.path.join(output_dir, f"{slugify(title)}.vtt")
             with open(vtt_path, 'w', encoding="utf-8") as vtt:
-                write_vtt(result["segments"], file=vtt, line_length=break_lines)
+                if is_openvino:
+                    write_vtt_openvino(transcript, file=vtt, line_length=break_lines)
+                else:
+                    write_vtt(transcript, file=vtt, line_length=break_lines)

             print("Saved VTT to", os.path.abspath(vtt_path))
         else:
             srt_path = os.path.join(output_dir, f"{slugify(title)}.srt")
             with open(srt_path, 'w', encoding="utf-8") as srt:
-                write_srt(result["segments"], file=srt, line_length=break_lines)
+                if is_openvino:
+                    write_srt_openvino(transcript, file=srt, line_length=break_lines)
+                else:
+                    write_srt(transcript, file=srt, line_length=break_lines)

             print("Saved SRT to", os.path.abspath(srt_path))

diff --git a/yt_whisper/utils.py b/yt_whisper/utils.py
index 77211d9..32c71c7 100644
--- a/yt_whisper/utils.py
+++ b/yt_whisper/utils.py
@@ -1,4 +1,5 @@
 from typing import Iterator, TextIO
+import math


 def str2bool(string):
@@ -79,3 +80,27 @@ def write_srt(transcript: Iterator[dict], file: TextIO, line_length: int = 0):

 def slugify(title):
     return "".join(c if c.isalnum() else "_" for c in title).rstrip("_")
+def write_vtt_openvino(transcript: Iterator[dict], file: TextIO, line_length: int = 0):
+    print("WEBVTT\n", file=file)
+    for segment in transcript:
+        segment = process_segment(segment, line_length=line_length)
+
+        print(
+            f"{format_timestamp(segment['timestamp'][0])} --> {format_timestamp(segment['timestamp'][1])}\n"
+            f"{segment['text'].strip().replace('-->', '->')}\n",
+            file=file,
+            flush=True,
+        )
+
+def write_srt_openvino(transcript: Iterator[dict], file: TextIO, line_length: int = 0):
+    for i, segment in enumerate(transcript, start=1):
+        segment = process_segment(segment, line_length=line_length)
+
+        print(
+            f"{i}\n"
+            f"{format_timestamp(segment['timestamp'][0], always_include_hours=True, decimal_marker=',')} --> "
+            f"{format_timestamp(segment['timestamp'][1], always_include_hours=True, decimal_marker=',')}\n"
+            f"{segment['text'].strip().replace('-->', '->')}\n",
+            file=file,
+            flush=True,
+        )