diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 000000000..d625a97d1
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,57 @@
+FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04
+
+ARG DEBIAN_FRONTEND=noninteractive
+
+# install python via pyenv
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ make \
+ build-essential \
+ libssl-dev \
+ zlib1g-dev \
+ libbz2-dev \
+ libreadline-dev \
+ libsqlite3-dev \
+ wget \
+ curl \
+ llvm \
+ libncurses5-dev \
+ libncursesw5-dev \
+ xz-utils \
+ tk-dev \
+ libffi-dev \
+ liblzma-dev \
+ git \
+ ca-certificates \
+ libgl1 \
+ && rm -rf /var/lib/apt/lists/*
+ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH"
+ARG PYTHON_VERSION=3.8
+RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
+ pyenv install $PYTHON_VERSION && \
+ pyenv global $PYTHON_VERSION
+
+# install cog
+RUN pip install cog
+
+# install deps
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ ffmpeg libsndfile1 \
+ && rm -rf /var/lib/apt/lists/*
+
+# copy to /src
+ENV WORKDIR /src
+RUN mkdir -p $WORKDIR
+WORKDIR $WORKDIR
+
+# install requirements
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+RUN pip install git+https://github.com/elliottzheng/batch-face.git@master
+
+# copy sources
+COPY . .
+
+ENV PYTHONUNBUFFERED=1
+
+# run cog
+CMD python3 -m cog.server.http
diff --git a/README.md b/README.md
index 76e66198e..f0ac474a0 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,7 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to
|📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook
|:-:|:-:|:-:|:-:|:-:|
-[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) /[Updated Collab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH)
+[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/github/justinjohn0306/Wav2Lip/blob/master/Wav2Lip_simplified_v5.ipynb)
@@ -27,14 +27,15 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to
--------
**Disclaimer**
--------
-All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should only be used for research/academic/personal purposes only. As the models are trained on the LRS2 dataset , any form of commercial use is strictly prohibited. For commercial requests please contact us directly!
+All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the LRS2 dataset, any form of commercial use is strictly prohibited. For commercial requests, please contact us directly!
Prerequisites
-------------
-- `Python 3.6`
+- `Python 3.10.15`
- ffmpeg: `sudo apt-get install ffmpeg`
- Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a docker image is provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues.
- Face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work.
+- Add [mobilenet.pth](https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth) to the `checkpoints` folder, along with one of the weight files below.
Getting the weights
----------
@@ -55,8 +56,8 @@ The result is saved (by default) in `results/result_voice.mp4`. You can specify
##### Tips for better results:
- Experiment with the `--pads` argument to adjust the detected face bounding box. Often leads to improved results. You might need to increase the bottom padding to include the chin region. E.g. `--pads 0 20 0 0`.
-- If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try.
-- Experiment with the `--resize_factor` argument, to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too).
+- If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try.
+- Experiment with the `--resize_factor` argument to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too).
- The Wav2Lip model without GAN usually needs more experimenting with the above two to get the most ideal results, and sometimes, can give you a better result as well.
Preparing LRS2 for training
@@ -78,7 +79,7 @@ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` fol
```bash
python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/
```
-Additional options like `batch_size` and the number of GPUs to use in parallel to use can also be set.
+Additional options like `batch_size` and the number of GPUs to use in parallel can also be set.
##### Preprocessed LRS2 folder structure
```
@@ -99,12 +100,12 @@ You can download [the pre-trained weights](#getting-the-weights) if you want to
python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir
```
##### Training the Wav2Lip models
-You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run:
+You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run:
```bash
python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir --syncnet_checkpoint_path
```
-To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file.
+To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file.
Training on datasets other than LRS2
------------------------------------
@@ -126,7 +127,7 @@ Please check the `evaluation/` folder for the instructions.
License and Citation
----------
-This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at radrabha.m@research.iiit.ac.in or prajwal.k@research.iiit.ac.in. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository:
+This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at radrabha.m@research.iiit.ac.in or prajwal.k@research.iiit.ac.in. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository:
```
@inproceedings{10.1145/3394171.3413532,
author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.},
@@ -147,6 +148,6 @@ series = {MM '20}
```
-Acknowledgments
+Acknowledgements
----------
-Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial collab notebook.
+Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial Colab notebook.
diff --git a/Wav2Lip_simplified_V5(offline).ipynb b/Wav2Lip_simplified_V5(offline).ipynb
new file mode 100644
index 000000000..8eb6d2f2e
--- /dev/null
+++ b/Wav2Lip_simplified_V5(offline).ipynb
@@ -0,0 +1,225 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f1e90f25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Step 1: Install dependency\n",
+ "!pip install ffmpeg-python\n",
+ "\n",
+ "# Step 2: Clone the Wav2Lip repository\n",
+ "!git clone https://github.com/justinjohn0306/Wav2Lip\n",
+ "\n",
+ "# Step 3: Download pretrained model\n",
+ "import requests\n",
+ "url = \"https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA\"\n",
+ "response = requests.get(url)\n",
+ "\n",
+ "with open(\"Wav2Lip/checkpoints/wav2lip_gan.pth\", \"wb\") as f:\n",
+ " f.write(response.content)\n",
+ " \n",
+ "# Step 4: Install the required dependencies for Wav2Lip\n",
+ "!cd Wav2Lip && pip install -r requirements.txt\n",
+ "!pip install pyaudio\n",
+ "\n",
+ "\n",
+ "# Step 5: Download pretrained model for face detection\n",
+ "url = \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\"\n",
+ "response = requests.get(url)\n",
+ "\n",
+ "with open(\"Wav2Lip/face_detection/detection/sfd/s3fd.pth\", \"wb\") as f:\n",
+ " f.write(response.content)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e86c988",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import subprocess\n",
+ "from urllib import parse as urlparse\n",
+ "\n",
+ "# Step 1: Install yt-dlp\n",
+ "subprocess.run(['pip', 'install', 'yt-dlp'])\n",
+ "\n",
+ "# Step 2: Define YouTube URL and Video ID\n",
+ "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY'\n",
+ "url_data = urlparse.urlparse(YOUTUBE_URL)\n",
+ "query = urlparse.parse_qs(url_data.query)\n",
+ "YOUTUBE_ID = query[\"v\"][0]\n",
+ "\n",
+ "# Remove previous input video\n",
+ "if os.path.isfile('input_vid.mp4'):\n",
+ " os.remove('input_vid.mp4')\n",
+ "\n",
+ "# Trim video (start, end) seconds\n",
+ "start = 35\n",
+ "end = 62\n",
+ "interval = end - start\n",
+ "\n",
+ "# Step 3: Download and trim the YouTube video\n",
+ "subprocess.run(['yt-dlp', '-f', 'bestvideo[ext=mp4]', '--output', \"youtube.%(ext)s\", f'https://www.youtube.com/watch?v={YOUTUBE_ID}'])\n",
+ "\n",
+ "# Cut the video using FFmpeg\n",
+ "subprocess.run(['ffmpeg', '-y', '-i', 'youtube.mp4', '-ss', str(start), '-t', str(interval), '-async', '1', 'input_vid.mp4'])\n",
+ "\n",
+ "# Display video.\n",
+ "from IPython.display import HTML\n",
+ "from base64 import b64encode\n",
+ "\n",
+ "def show_video(path):\n",
+ " mp4 = open(path, 'rb').read()\n",
+ " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ " return HTML(f\"\"\" \"\"\")\n",
+ "\n",
+ "# Preview the trimmed video\n",
+ "show_video('input_vid.mp4')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7da8e818",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from IPython.display import Audio\n",
+ "from IPython.core.display import display\n",
+ "\n",
+ "upload_method = 'Path' # Change this to 'Record' or 'Path'\n",
+ "\n",
+ "# Remove previous input audio\n",
+ "if os.path.isfile('input_audio.wav'):\n",
+ " os.remove('input_audio.wav')\n",
+ "\n",
+ "def display_audio():\n",
+ " display(Audio('input_audio.wav'))\n",
+ "\n",
+ "if upload_method == 'Record':\n",
+ " import pyaudio\n",
+ " import wave\n",
+ "\n",
+ " CHUNK = 1024\n",
+ " FORMAT = pyaudio.paInt16\n",
+ " CHANNELS = 1\n",
+ " RATE = 16000\n",
+ " RECORD_SECONDS = 5\n",
+ " WAVE_OUTPUT_FILENAME = \"input_audio.wav\"\n",
+ "\n",
+ " p = pyaudio.PyAudio()\n",
+ "\n",
+ " stream = p.open(format=FORMAT,\n",
+ " channels=CHANNELS,\n",
+ " rate=RATE,\n",
+ " input=True,\n",
+ " frames_per_buffer=CHUNK)\n",
+ "\n",
+ " print(\"Recording...\")\n",
+ "\n",
+ " frames = []\n",
+ "\n",
+ " for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n",
+ " data = stream.read(CHUNK)\n",
+ " frames.append(data)\n",
+ "\n",
+ " print(\"Finished recording.\")\n",
+ "\n",
+ " stream.stop_stream()\n",
+ " stream.close()\n",
+ " p.terminate()\n",
+ "\n",
+ " wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')\n",
+ " wf.setnchannels(CHANNELS)\n",
+ " wf.setsampwidth(p.get_sample_size(FORMAT))\n",
+ " wf.setframerate(RATE)\n",
+ " wf.writeframes(b''.join(frames))\n",
+ " wf.close()\n",
+ "\n",
+ " display_audio()\n",
+ "\n",
+ "elif upload_method == 'Path':\n",
+ " # Add the full path to your audio\n",
+ " PATH_TO_YOUR_AUDIO = 'C:/Users/justi/OneDrive/Desktop/wav2lip/Wav2Lip/input_audio.wav'\n",
+ "\n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
+ "\n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('input_audio.wav', audio, sr, format='wav')\n",
+ "\n",
+ " display_audio()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63289945",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# Define the parameters for the Wav2Lip model\n",
+ "pad_top = 0\n",
+ "pad_bottom = 10\n",
+ "pad_left = 0\n",
+ "pad_right = 0\n",
+ "rescaleFactor = 1\n",
+ "nosmooth = False\n",
+ "\n",
+ "# Set the path to the Wav2Lip model and input files\n",
+ "checkpoint_path = \"checkpoints/wav2lip_gan.pth\"\n",
+ "input_face = \"input_vid.mp4\"\n",
+ "input_audio = \"input_audio.wav\"\n",
+ "\n",
+ "# Run the Wav2Lip model\n",
+ "!cd Wav2Lip && python inference.py --checkpoint_path {checkpoint_path} --face {input_face} --audio {input_audio} --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} {\"--nosmooth\" if nosmooth else \"\"}\n",
+ "\n",
+ "# Preview the output video\n",
+ "print(\"Final Video Preview\")\n",
+ "print(\"Find the output video at\", 'Wav2Lip/results/result_voice.mp4')\n",
+ "show_video('Wav2Lip/results/result_voice.mp4')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3fbafa56",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Wav2Lip_simplified_v4.ipynb b/Wav2Lip_simplified_v4.ipynb
new file mode 100644
index 000000000..5cc33bf16
--- /dev/null
+++ b/Wav2Lip_simplified_v4.ipynb
@@ -0,0 +1,482 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Fixes by: [justinjohn-03](https://github.com/justinjohn0306)**"
+ ],
+ "metadata": {
+ "id": "9Uyk6DCBGHuW"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U1xFNFU58_2j"
+ },
+ "source": [
+ "## Goal: Make anyone speak anything (LipSync)\n",
+ "\n",
+ "* Github: https://github.com/Rudrabha/Wav2Lip\n",
+ "* Paper: https://arxiv.org/abs/2008.10010\n",
+ "*Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "Qgo-oaI3JU2u",
+ "cellView": "form"
+ },
+ "source": [
+ "#@title Step1: Setup Wav2Lip \n",
+ "#@markdown * Install dependency\n",
+ "#@markdown * Download pretrained model\n",
+ "!rm -rf /content/sample_data\n",
+ "!mkdir /content/sample_data\n",
+ "\n",
+ "!git clone https://github.com/zabique/Wav2Lip\n",
+ "\n",
+ "#download the pretrained model\n",
+ "!wget 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'\n",
+ "a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl\n",
+ "\n",
+ "# !pip uninstall tensorflow tensorflow-gpu\n",
+ "!cd Wav2Lip && pip install -r requirements.txt\n",
+ "\n",
+ "#download pretrained model for face detection\n",
+ "!wget \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\" -O \"/content/Wav2Lip/face_detection/detection/sfd/s3fd.pth\"\n",
+ "\n",
+ "!pip install -q youtube-dl\n",
+ "!pip install ffmpeg-python\n",
+ "!pip install librosa==0.9.1\n",
+ "\n",
+ "#this code for recording audio\n",
+ "\"\"\"\n",
+ "To write this piece of code I took inspiration/code from a lot of places.\n",
+ "It was late night, so I'm not sure how much I created or just copied o.O\n",
+ "Here are some of the possible references:\n",
+ "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n",
+ "https://stackoverflow.com/a/18650249\n",
+ "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n",
+ "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n",
+ "https://stackoverflow.com/a/49019356\n",
+ "\"\"\"\n",
+ "from IPython.display import HTML, Audio\n",
+ "from google.colab.output import eval_js\n",
+ "from base64 import b64decode\n",
+ "import numpy as np\n",
+ "from scipy.io.wavfile import read as wav_read\n",
+ "import io\n",
+ "import ffmpeg\n",
+ "\n",
+ "AUDIO_HTML = \"\"\"\n",
+ "\n",
+ "\"\"\"\n",
+ "\n",
+ "%cd /\n",
+ "from ghc.l_ghc_cf import l_ghc_cf\n",
+ "%cd content\n",
+ "\n",
+ "def get_audio():\n",
+ " display(HTML(AUDIO_HTML))\n",
+ " data = eval_js(\"data\")\n",
+ " binary = b64decode(data.split(',')[1])\n",
+ " \n",
+ " process = (ffmpeg\n",
+ " .input('pipe:0')\n",
+ " .output('pipe:1', format='wav')\n",
+ " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n",
+ " )\n",
+ " output, err = process.communicate(input=binary)\n",
+ " \n",
+ " riff_chunk_size = len(output) - 8\n",
+ " # Break up the chunk size into four bytes, held in b.\n",
+ " q = riff_chunk_size\n",
+ " b = []\n",
+ " for i in range(4):\n",
+ " q, r = divmod(q, 256)\n",
+ " b.append(r)\n",
+ "\n",
+ " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n",
+ " riff = output[:4] + bytes(b) + output[8:]\n",
+ "\n",
+ " sr, audio = wav_read(io.BytesIO(riff))\n",
+ "\n",
+ " return audio, sr\n",
+ "\n",
+ "\n",
+ "from IPython.display import HTML\n",
+ "from base64 import b64encode\n",
+ "def showVideo(path):\n",
+ " mp4 = open(str(path),'rb').read()\n",
+ " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ " return HTML(\"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
+ " \"\"\" % data_url)\n",
+ "\n",
+ "from IPython.display import clear_output"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SEdy6PWDXMRL"
+ },
+ "source": [
+ "# LipSync Youtube Video"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "QI4kcm8QEeGZ",
+ "cellView": "form"
+ },
+ "source": [
+ "#@title STEP2: Select a Youtube Video\n",
+ "# Install yt-dlp\n",
+ "!pip install yt-dlp\n",
+ "\n",
+ "#@markdown ### Find YouTube video ID from URL\n",
+ "from urllib import parse as urlparse\n",
+ "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY' #@param {type:\"string\"}\n",
+ "url_data = urlparse.urlparse(YOUTUBE_URL)\n",
+ "query = urlparse.parse_qs(url_data.query)\n",
+ "YOUTUBE_ID = query[\"v\"][0]\n",
+ "\n",
+ "#@markdown ### Trim the video (start, end) seconds\n",
+ "start = 35 #@param {type:\"integer\"}\n",
+ "end = 62 #@param {type:\"integer\"}\n",
+ "interval = end - start\n",
+ "\n",
+ "# Download the YouTube video using yt-dlp\n",
+ "!yt-dlp -f 'bestvideo[ext=mp4]' --output \"youtube.%(ext)s\" https://www.youtube.com/watch?v=$YOUTUBE_ID\n",
+ "\n",
+ "# Cut the video using FFmpeg\n",
+ "!ffmpeg -y -i youtube.mp4 -ss {start} -t {interval} -async 1 /content/sample_data/input_vid.mp4\n",
+ "\n",
+ "# Preview the trimmed video\n",
+ "from IPython.display import HTML\n",
+ "from base64 import b64encode\n",
+ "mp4 = open('/content/sample_data/input_vid.mp4','rb').read()\n",
+ "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ "HTML(f\"\"\" \"\"\")\n",
+ "\n"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "zS_RAeh-IfZy",
+ "cellView": "form"
+ },
+ "source": [
+ "#@title STEP3: Select Audio (Record or Upload)\n",
+ "from IPython.display import Audio \n",
+ "from IPython.core.display import display\n",
+ "\n",
+ "record_or_upload = 'Upload' #@param ['Record', 'Upload']\n",
+ "\n",
+ "def displayAudio():\n",
+ " display(Audio('/content/sample_data/input_audio.wav'))\n",
+ "if record_or_upload == 'Record':\n",
+ " audio, sr = get_audio()\n",
+ " import scipy\n",
+ " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n",
+ "elif record_or_upload == 'Upload':\n",
+ " from google.colab import files\n",
+ " uploaded = files.upload()\n",
+ " for fn in uploaded.keys():\n",
+ " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
+ " name=fn, length=len(uploaded[fn])))\n",
+ " \n",
+ " #concider only the first file\n",
+ " audio_file = str(list(uploaded.keys())[0])\n",
+ " \n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(audio_file, sr=None)\n",
+ " \n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n",
+ " \n",
+ " clear_output()\n",
+ " displayAudio()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "BQPLXJ8L0gms",
+ "cellView": "form"
+ },
+ "source": [
+ "#@title STEP4: Start Crunching and Preview Output\n",
+ "#@markdown Note: Only change these, if you have to \n",
+ "pad_top = 0#@param {type:\"integer\"}\n",
+ "pad_bottom = 10#@param {type:\"integer\"}\n",
+ "pad_left = 0#@param {type:\"integer\"}\n",
+ "pad_right = 0#@param {type:\"integer\"}\n",
+ "rescaleFactor = 1#@param {type:\"integer\"}\n",
+ "nosmooth = False #@param {type:\"boolean\"}\n",
+ "\n",
+ "\n",
+ "if nosmooth == False:\n",
+ " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n",
+ "else:\n",
+ " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n",
+ "#Preview output video\n",
+ "clear_output()\n",
+ "print(\"Final Video Preview\")\n",
+ "print(\"Download this video from\", '/content/Wav2Lip/results/result_voice.mp4')\n",
+ "showVideo('/content/Wav2Lip/results/result_voice.mp4')\n"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vYxpPeie1CYL"
+ },
+ "source": [
+ "# LipSync on Your Video File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "nDuM7tfZ1F0t",
+ "cellView": "form"
+ },
+ "source": [
+ "import os\n",
+ "from google.colab import files\n",
+ "from IPython.display import HTML\n",
+ "\n",
+ "def showVideo(file_path):\n",
+ " \"\"\"Function to display video in Colab\"\"\"\n",
+ " mp4 = open(file_path,'rb').read()\n",
+ " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ " display(HTML(\"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
+ " \"\"\" % data_url))\n",
+ "\n",
+ "#@markdown ### Select an uploading method\n",
+ "upload_or_path = \"Upload\" #@param [\"Upload\", \"Custom Path\"]\n",
+ "\n",
+ "if upload_or_path == \"Upload\":\n",
+ " uploaded = files.upload()\n",
+ " for filename in uploaded.keys():\n",
+ " os.rename(filename, '/content/sample_data/input_vid.mp4')\n",
+ " PATH_TO_YOUR_VIDEO = '/content/sample_data/input_vid.mp4'\n",
+ "else:\n",
+ " PATH_TO_YOUR_VIDEO = '/content/test.mp4' #@param {type:\"string\"}\n",
+ " if not os.path.isfile(PATH_TO_YOUR_VIDEO):\n",
+ " print(\"ERROR: File not found!\")\n",
+ " raise SystemExit(0)\n",
+ "\n",
+ "#@markdown ### Trim the video (start, end) seconds\n",
+ "start_time = 0 #@param {type:\"integer\"}\n",
+ "end_time = 0 #@param {type:\"integer\"}\n",
+ "\n",
+ "if start_time == 0 and end_time == 0:\n",
+ " print(\"No trimming applied\")\n",
+ "else:\n",
+ " duration = end_time - start_time\n",
+ " os.system(f\"ffmpeg -i {PATH_TO_YOUR_VIDEO} -ss {start_time} -t {duration} -async 1 /content/sample_data/trimmed_vid.mp4\")\n",
+ " PATH_TO_YOUR_VIDEO = \"/content/sample_data/input_vid.mp4\"\n",
+ " print(f\"Video trimmed from {start_time} to {end_time} seconds\")\n",
+ "\n",
+ "print(f\"PATH_TO_YOUR_VIDEO: {PATH_TO_YOUR_VIDEO}\")\n",
+ "\n",
+ "if upload_or_path == \"Upload\":\n",
+ " clear_output()\n",
+ " print(\"Input Video\")\n",
+ " showVideo(PATH_TO_YOUR_VIDEO)\n",
+ "else:\n",
+ " if os.path.isfile(PATH_TO_YOUR_VIDEO):\n",
+ " print(\"Input Video\")\n",
+ " showVideo(PATH_TO_YOUR_VIDEO)\n"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "XgF4794r7sWK",
+ "cellView": "form"
+ },
+ "source": [
+ "#@title STEP3: Select Audio (Record or Upload)\n",
+ "from IPython.display import Audio \n",
+ "from IPython.core.display import display\n",
+ "\n",
+ "record_or_upload = 'Upload' #@param ['Record', 'Upload']\n",
+ "\n",
+ "def displayAudio():\n",
+ " display(Audio('/content/sample_data/input_audio.wav'))\n",
+ "if record_or_upload == 'Record':\n",
+ " audio, sr = get_audio()\n",
+ " import scipy\n",
+ " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n",
+ "elif record_or_upload == 'Upload':\n",
+ " from google.colab import files\n",
+ " uploaded = files.upload()\n",
+ " for fn in uploaded.keys():\n",
+ " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
+ " name=fn, length=len(uploaded[fn])))\n",
+ " \n",
+ " #concider only the first file\n",
+ " audio_file = str(list(uploaded.keys())[0])\n",
+ " \n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(audio_file, sr=None)\n",
+ " \n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n",
+ " \n",
+ " clear_output()\n",
+ " displayAudio()"
+ ],
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "metadata": {
+ "id": "ZgtO08V28ANf",
+ "cellView": "form"
+ },
+ "source": [
+ "#@title STEP4: Start Crunching and Preview Output\n",
+ "#@markdown Note: Only change these, if you have to \n",
+ "pad_top = 0#@param {type:\"integer\"}\n",
+ "pad_bottom = 10#@param {type:\"integer\"}\n",
+ "pad_left = 0#@param {type:\"integer\"}\n",
+ "pad_right = 0#@param {type:\"integer\"}\n",
+ "rescaleFactor = 1#@param {type:\"integer\"}\n",
+ "nosmooth = False #@param {type:\"boolean\"}\n",
+ "\n",
+ "if nosmooth == False:\n",
+ " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n",
+ "else:\n",
+ " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n",
+ "\n",
+ "#Preview output video\n",
+ "clear_output()\n",
+ "print(\"Final Video Preview\")\n",
+ "print(\"Dowload this video from\", '/content/Wav2Lip/results/result_voice.mp4')\n",
+ "showVideo('/content/Wav2Lip/results/result_voice.mp4')\n"
+ ],
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Wav2Lip_simplified_v5.ipynb b/Wav2Lip_simplified_v5.ipynb
new file mode 100644
index 000000000..308f3bd76
--- /dev/null
+++ b/Wav2Lip_simplified_v5.ipynb
@@ -0,0 +1,645 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U1xFNFU58_2j"
+ },
+ "source": [
+ "## Goal: Make anyone speak anything (LipSync)\n",
+ "\n",
+ "* Github: https://github.com/Rudrabha/Wav2Lip\n",
+ "* Paper: https://arxiv.org/abs/2008.10010\n",
+ "*Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "**Modded by: [justinjohn-03](https://github.com/justinjohn0306)**\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "Qgo-oaI3JU2u"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Step1: Setup Wav2Lip \n",
+ "#@markdown * Install dependency\n",
+ "#@markdown * Download pretrained model\n",
+ "from IPython.display import HTML, clear_output\n",
+ "!rm -rf /content/sample_data\n",
+ "!mkdir /content/sample_data\n",
+ "\n",
+ "!git clone https://github.com/justinjohn0306/Wav2Lip\n",
+ "\n",
+ "%cd /content/Wav2Lip\n",
+ "\n",
+ "#download the pretrained model\n",
+ "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O 'checkpoints/wav2lip.pth'\n",
+ "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O 'checkpoints/wav2lip_gan.pth'\n",
+ "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O 'checkpoints/resnet50.pth'\n",
+ "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O 'checkpoints/mobilenet.pth'\n",
+ "a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl\n",
+ "!pip install git+https://github.com/elliottzheng/batch-face.git@master\n",
+ "\n",
+ "!pip install ffmpeg-python mediapipe==0.10.18\n",
+ "\n",
+ "#this code for recording audio\n",
+ "\"\"\"\n",
+ "To write this piece of code I took inspiration/code from a lot of places.\n",
+ "It was late night, so I'm not sure how much I created or just copied o.O\n",
+ "Here are some of the possible references:\n",
+ "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n",
+ "https://stackoverflow.com/a/18650249\n",
+ "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n",
+ "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n",
+ "https://stackoverflow.com/a/49019356\n",
+ "\"\"\"\n",
+ "from IPython.display import HTML, Audio\n",
+ "from google.colab.output import eval_js\n",
+ "from base64 import b64decode\n",
+ "import numpy as np\n",
+ "from scipy.io.wavfile import read as wav_read\n",
+ "import io\n",
+ "import ffmpeg\n",
+ "\n",
+ "AUDIO_HTML = \"\"\"\n",
+ "\n",
+ "\"\"\"\n",
+ "\n",
+ "%cd /\n",
+ "from ghc.l_ghc_cf import l_ghc_cf\n",
+ "%cd content\n",
+ "\n",
+ "def get_audio():\n",
+ " display(HTML(AUDIO_HTML))\n",
+ " data = eval_js(\"data\")\n",
+ " binary = b64decode(data.split(',')[1])\n",
+ "\n",
+ " process = (ffmpeg\n",
+ " .input('pipe:0')\n",
+ " .output('pipe:1', format='wav')\n",
+ " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n",
+ " )\n",
+ " output, err = process.communicate(input=binary)\n",
+ "\n",
+ " riff_chunk_size = len(output) - 8\n",
+ " # Break up the chunk size into four bytes, held in b.\n",
+ " q = riff_chunk_size\n",
+ " b = []\n",
+ " for i in range(4):\n",
+ " q, r = divmod(q, 256)\n",
+ " b.append(r)\n",
+ "\n",
+ " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n",
+ " riff = output[:4] + bytes(b) + output[8:]\n",
+ "\n",
+ " sr, audio = wav_read(io.BytesIO(riff))\n",
+ "\n",
+ " return audio, sr\n",
+ "\n",
+ "\n",
+ "from IPython.display import HTML\n",
+ "from base64 import b64encode\n",
+ "def showVideo(path):\n",
+ " mp4 = open(str(path),'rb').read()\n",
+ " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ " return HTML(\"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
+ " \"\"\" % data_url)\n",
+ "\n",
+ "from IPython.display import clear_output\n",
+ "\n",
+ "clear_output()\n",
+ "print(\"All set and ready!\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SEdy6PWDXMRL"
+ },
+ "source": [
+ "# LipSync Youtube Video"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "QI4kcm8QEeGZ"
+ },
+ "outputs": [],
+ "source": [
+ "#@title STEP2: Select a Youtube Video\n",
+ "# Install yt-dlp\n",
+ "\n",
+ "import os\n",
+ "!pip install yt-dlp\n",
+ "\n",
+ "#@markdown ## Find YouTube video ID from URL\n",
+ "\n",
+ "#@markdown ___\n",
+ "\n",
+ "#@markdown Link format:\n",
+ "\n",
+ "#@markdown ``https://youtu.be/vAnWYLTdvfY`` ❌\n",
+ "\n",
+ "#@markdown ``https://www.youtube.com/watch?v=vAnWYLTdvfY`` ✔️\n",
+ "\n",
+ "!rm -df youtube.mp4\n",
+ "\n",
+ "#@markdown ___\n",
+ "from urllib import parse as urlparse\n",
+ "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY' #@param {type:\"string\"}\n",
+ "url_data = urlparse.urlparse(YOUTUBE_URL)\n",
+ "query = urlparse.parse_qs(url_data.query)\n",
+ "YOUTUBE_ID = query[\"v\"][0]\n",
+ "\n",
+ "\n",
+ "# remove previous input video\n",
+ "!rm -f /content/sample_data/input_vid.mp4\n",
+ "\n",
+ "\n",
+ "#@markdown ___\n",
+ "\n",
+ "#@markdown ### Trim the video (start, end) seconds\n",
+ "start = 35 #@param {type:\"integer\"}\n",
+ "end = 62 #@param {type:\"integer\"}\n",
+ "interval = end - start\n",
+ "\n",
+ "#@markdown Note: ``the trimmed video must have face on all frames``\n",
+ "\n",
+ "# Download the YouTube video using yt-dlp\n",
+ "!yt-dlp -f 'bestvideo[ext=mp4]' --output \"youtube.%(ext)s\" https://www.youtube.com/watch?v=$YOUTUBE_ID\n",
+ "\n",
+ "# Cut the video using FFmpeg\n",
+ "!ffmpeg -y -i youtube.mp4 -ss {start} -t {interval} -async 1 /content/sample_data/input_vid.mp4\n",
+ "\n",
+ "# Preview the trimmed video\n",
+ "from IPython.display import HTML\n",
+ "from base64 import b64encode\n",
+ "mp4 = open('/content/sample_data/input_vid.mp4','rb').read()\n",
+ "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ "HTML(f\"\"\" \"\"\")\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "zS_RAeh-IfZy"
+ },
+ "outputs": [],
+ "source": [
+ "#@title STEP3: Select Audio (Record, Upload from local drive or Gdrive)\n",
+ "import os\n",
+ "from IPython.display import Audio\n",
+ "from IPython.core.display import display\n",
+ "\n",
+ "upload_method = 'Upload' #@param ['Record', 'Upload', 'Custom Path']\n",
+ "\n",
+ "#remove previous input audio\n",
+ "if os.path.isfile('/content/sample_data/input_audio.wav'):\n",
+ " os.remove('/content/sample_data/input_audio.wav')\n",
+ "\n",
+ "def displayAudio():\n",
+ " display(Audio('/content/sample_data/input_audio.wav'))\n",
+ "\n",
+ "if upload_method == 'Record':\n",
+ " audio, sr = get_audio()\n",
+ " import scipy\n",
+ " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n",
+ "\n",
+ "elif upload_method == 'Upload':\n",
+ " from google.colab import files\n",
+ " uploaded = files.upload()\n",
+ " for fn in uploaded.keys():\n",
+ " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
+ " name=fn, length=len(uploaded[fn])))\n",
+ "\n",
+ " # Consider only the first file\n",
+ " PATH_TO_YOUR_AUDIO = str(list(uploaded.keys())[0])\n",
+ "\n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
+ "\n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n",
+ "\n",
+ " clear_output()\n",
+ " displayAudio()\n",
+ "\n",
+ "elif upload_method == 'Custom Path':\n",
+ " from google.colab import drive\n",
+ " drive.mount('/content/drive')\n",
+ " #@markdown ``Add the full path to your audio on your Gdrive`` 👇\n",
+ " PATH_TO_YOUR_AUDIO = '/content/drive/MyDrive/test.wav' #@param {type:\"string\"}\n",
+ "\n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
+ "\n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n",
+ "\n",
+ " clear_output()\n",
+ " displayAudio()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "BQPLXJ8L0gms"
+ },
+ "outputs": [],
+ "source": [
+ "#@title STEP4: Start Crunching and Preview Output\n",
+ "#@markdown Note: Only change these, if you have to \n",
+ "\n",
+ "%cd /content/Wav2Lip\n",
+ "\n",
+ "# Set up paths and variables for the output file\n",
+ "output_file_path = '/content/Wav2Lip/results/result_voice.mp4'\n",
+ "\n",
+ "# Delete existing output file before processing, if any\n",
+ "if os.path.exists(output_file_path):\n",
+ " os.remove(output_file_path)\n",
+ "\n",
+ "pad_top = 0#@param {type:\"integer\"}\n",
+ "pad_bottom = 10#@param {type:\"integer\"}\n",
+ "pad_left = 0#@param {type:\"integer\"}\n",
+ "pad_right = 0#@param {type:\"integer\"}\n",
+ "rescaleFactor = 1#@param {type:\"integer\"}\n",
+ "nosmooth = True #@param {type:\"boolean\"}\n",
+ "#@markdown ___\n",
+ "#@markdown Model selection:\n",
+ "use_hd_model = False #@param {type:\"boolean\"}\n",
+ "checkpoint_path = 'checkpoints/wav2lip.pth' if not use_hd_model else 'checkpoints/wav2lip_gan.pth'\n",
+ "\n",
+ "\n",
+ "if nosmooth == False:\n",
+ " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n",
+ "else:\n",
+ " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n",
+ "\n",
+ "#Preview output video\n",
+ "if os.path.exists(output_file_path):\n",
+ " clear_output()\n",
+ " print(\"Final Video Preview\")\n",
+ " print(\"Download this video from\", output_file_path)\n",
+ " showVideo(output_file_path)\n",
+ "else:\n",
+ " print(\"Processing failed. Output video not found.\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "vYxpPeie1CYL"
+ },
+ "source": [
+ "# LipSync on Your Video File"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "nDuM7tfZ1F0t"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import shutil\n",
+ "from google.colab import drive\n",
+ "from google.colab import files\n",
+ "from IPython.display import HTML, clear_output\n",
+ "from base64 import b64encode\n",
+ "import moviepy.editor as mp\n",
+ "\n",
+ "\n",
+ "def showVideo(file_path):\n",
+ " \"\"\"Function to display video in Colab\"\"\"\n",
+ " mp4 = open(file_path,'rb').read()\n",
+ " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
+ " display(HTML(\"\"\"\n",
+ " \n",
+ " \n",
+ " \n",
+ " \"\"\" % data_url))\n",
+ "\n",
+ "def get_video_resolution(video_path):\n",
+ " \"\"\"Function to get the resolution of a video\"\"\"\n",
+ " import cv2\n",
+ " video = cv2.VideoCapture(video_path)\n",
+ " width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))\n",
+ " height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))\n",
+ " return (width, height)\n",
+ "\n",
+ "def resize_video(video_path, new_resolution):\n",
+ " \"\"\"Function to resize a video\"\"\"\n",
+ " import cv2\n",
+ " video = cv2.VideoCapture(video_path)\n",
+ " fourcc = int(video.get(cv2.CAP_PROP_FOURCC))\n",
+ " fps = video.get(cv2.CAP_PROP_FPS)\n",
+ " width, height = new_resolution\n",
+ " output_path = os.path.splitext(video_path)[0] + '_720p.mp4'\n",
+ " writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))\n",
+ " while True:\n",
+ " success, frame = video.read()\n",
+ " if not success:\n",
+ " break\n",
+ " resized_frame = cv2.resize(frame, new_resolution)\n",
+ " writer.write(resized_frame)\n",
+ " video.release()\n",
+ " writer.release()\n",
+ "\n",
+ "# Mount Google Drive if it's not already mounted\n",
+ "if not os.path.isdir(\"/content/drive/MyDrive\"):\n",
+ " drive.mount('/content/drive', force_remount=True)\n",
+ "\n",
+ "#@markdown ### Select an uploading method\n",
+ "upload_method = \"Upload\" #@param [\"Upload\", \"Custom Path\"]\n",
+ "\n",
+ "\n",
+ "# remove previous input video\n",
+ "if os.path.isfile('/content/sample_data/input_vid.mp4'):\n",
+ " os.remove('/content/sample_data/input_vid.mp4')\n",
+ "\n",
+ "if upload_method == \"Upload\":\n",
+ " uploaded = files.upload()\n",
+ " for filename in uploaded.keys():\n",
+ " os.rename(filename, '/content/sample_data/input_vid.mp4')\n",
+ " PATH_TO_YOUR_VIDEO = '/content/sample_data/input_vid.mp4'\n",
+ "\n",
+ "elif upload_method == 'Custom Path':\n",
+ " #@markdown ``Add the full path to your video on your Gdrive `` 👇\n",
+ " PATH_TO_YOUR_VIDEO = '/content/drive/MyDrive/test.mp4' #@param {type:\"string\"}\n",
+ " if not os.path.isfile(PATH_TO_YOUR_VIDEO):\n",
+ " print(\"ERROR: File not found!\")\n",
+ " raise SystemExit(0)\n",
+ "\n",
+ "#@markdown Notes:\n",
+ "\n",
+ "#@markdown . ``If your uploaded video is 1080p or higher resolution, this cell will resize it to 720p.``\n",
+ "\n",
+ "#@markdown . ``Do not upload videos longer than 60 seconds.``\n",
+ "\n",
+ "#@markdown ___\n",
+ "\n",
+ "video_duration = mp.VideoFileClip(PATH_TO_YOUR_VIDEO).duration\n",
+ "if video_duration > 60:\n",
+ " print(\"WARNING: Video duration exceeds 60 seconds. Please upload a shorter video.\")\n",
+ " raise SystemExit(0)\n",
+ "\n",
+ "video_resolution = get_video_resolution(PATH_TO_YOUR_VIDEO)\n",
+ "print(f\"Video resolution: {video_resolution}\")\n",
+ "if video_resolution[0] >= 1920 or video_resolution[1] >= 1080:\n",
+ " print(\"Resizing video to 720p...\")\n",
+ " os.system(f\"ffmpeg -i {PATH_TO_YOUR_VIDEO} -vf scale=1280:720 /content/sample_data/input_vid.mp4\")\n",
+ " PATH_TO_YOUR_VIDEO = \"/content/sample_data/input_vid.mp4\"\n",
+ " print(\"Video resized to 720p\")\n",
+ "else:\n",
+ " print(\"No resizing needed\")\n",
+ "\n",
+ "if upload_method == \"Upload\":\n",
+ " clear_output()\n",
+ " print(\"Input Video\")\n",
+ " showVideo(PATH_TO_YOUR_VIDEO)\n",
+ "else:\n",
+ " if os.path.isfile(PATH_TO_YOUR_VIDEO):\n",
+ " # Check if the source and destination files are the same\n",
+ " if PATH_TO_YOUR_VIDEO != \"/content/sample_data/input_vid.mp4\":\n",
+ " shutil.copyfile(PATH_TO_YOUR_VIDEO, \"/content/sample_data/input_vid.mp4\")\n",
+ " print(\"Video copied to destination.\")\n",
+ "\n",
+ " print(\"Input Video\")\n",
+ " # Display the video from the destination path\n",
+ " showVideo(\"/content/sample_data/input_vid.mp4\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "XgF4794r7sWK"
+ },
+ "outputs": [],
+ "source": [
+ "#@title STEP3: Select Audio (Record, Upload from local drive or Gdrive)\n",
+ "import os\n",
+ "from IPython.display import Audio\n",
+ "from IPython.core.display import display\n",
+ "\n",
+ "upload_method = 'Upload' #@param ['Record', 'Upload', 'Custom Path']\n",
+ "\n",
+ "#remove previous input audio\n",
+ "if os.path.isfile('/content/sample_data/input_audio.wav'):\n",
+ " os.remove('/content/sample_data/input_audio.wav')\n",
+ "\n",
+ "def displayAudio():\n",
+ " display(Audio('/content/sample_data/input_audio.wav'))\n",
+ "\n",
+ "if upload_method == 'Record':\n",
+ " audio, sr = get_audio()\n",
+ " import scipy\n",
+ " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n",
+ "\n",
+ "elif upload_method == 'Upload':\n",
+ " from google.colab import files\n",
+ " uploaded = files.upload()\n",
+ " for fn in uploaded.keys():\n",
+ " print('User uploaded file \"{name}\" with length {length} bytes.'.format(\n",
+ " name=fn, length=len(uploaded[fn])))\n",
+ "\n",
+ " # Consider only the first file\n",
+ " PATH_TO_YOUR_AUDIO = str(list(uploaded.keys())[0])\n",
+ "\n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
+ "\n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n",
+ "\n",
+ " clear_output()\n",
+ " displayAudio()\n",
+ "\n",
+ "else: # Custom Path\n",
+ " from google.colab import drive\n",
+ " drive.mount('/content/drive')\n",
+ " #@markdown ``Add the full path to your audio on your Gdrive`` 👇\n",
+ " PATH_TO_YOUR_AUDIO = '/content/drive/MyDrive/test.wav' #@param {type:\"string\"}\n",
+ "\n",
+ " # Load audio with specified sampling rate\n",
+ " import librosa\n",
+ " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
+ "\n",
+ " # Save audio with specified sampling rate\n",
+ " import soundfile as sf\n",
+ " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n",
+ "\n",
+ " clear_output()\n",
+ " displayAudio()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "cellView": "form",
+ "id": "ZgtO08V28ANf"
+ },
+ "outputs": [],
+ "source": [
+ "#@title STEP4: Start Crunching and Preview Output\n",
+ "#@markdown Note: Only change these, if you have to \n",
+ "\n",
+ "%cd /content/Wav2Lip\n",
+ "\n",
+ "# Set up paths and variables for the output file\n",
+ "output_file_path = '/content/Wav2Lip/results/result_voice.mp4'\n",
+ "\n",
+ "# Delete existing output file before processing, if any\n",
+ "if os.path.exists(output_file_path):\n",
+ " os.remove(output_file_path)\n",
+ "\n",
+ "pad_top = 0#@param {type:\"integer\"}\n",
+ "pad_bottom = 10#@param {type:\"integer\"}\n",
+ "pad_left = 0#@param {type:\"integer\"}\n",
+ "pad_right = 0#@param {type:\"integer\"}\n",
+ "rescaleFactor = 1#@param {type:\"integer\"}\n",
+ "nosmooth = True #@param {type:\"boolean\"}\n",
+ "#@markdown ___\n",
+ "#@markdown Model selection:\n",
+ "use_hd_model = False #@param {type:\"boolean\"}\n",
+ "checkpoint_path = 'checkpoints/wav2lip.pth' if not use_hd_model else 'checkpoints/wav2lip_gan.pth'\n",
+ "\n",
+ "\n",
+ "if nosmooth == False:\n",
+ " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n",
+ "else:\n",
+ " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n",
+ "\n",
+ "#Preview output video\n",
+ "if os.path.exists(output_file_path):\n",
+ " clear_output()\n",
+ " print(\"Final Video Preview\")\n",
+ " print(\"Download this video from\", output_file_path)\n",
+ " showVideo(output_file_path)\n",
+ "else:\n",
+ " print(\"Processing failed. Output video not found.\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "private_outputs": true,
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/audio.py b/audio.py
index 32b20c449..32ab5fabe 100644
--- a/audio.py
+++ b/audio.py
@@ -97,7 +97,7 @@ def _linear_to_mel(spectogram):
def _build_mel_basis():
assert hp.fmax <= hp.sample_rate // 2
- return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels,
+ return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels,
fmin=hp.fmin, fmax=hp.fmax)
def _amp_to_db(x):
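For context on the `_build_mel_basis` change above: recent librosa releases made the `librosa.filters.mel` parameters keyword-only, so the old positional call fails while the keyword form works on both old and new versions. A minimal sketch of the keyword-only call, with placeholder numbers standing in for the repository's `hparams.py` values (not taken from this diff):

```python
# Sketch: keyword-only librosa.filters.mel call (compatible with old and new librosa).
# The numbers are illustrative placeholders for hp.sample_rate, hp.n_fft, hp.num_mels, hp.fmin, hp.fmax.
import librosa

mel_basis = librosa.filters.mel(
    sr=16000,    # hp.sample_rate
    n_fft=800,   # hp.n_fft
    n_mels=80,   # hp.num_mels
    fmin=55,     # hp.fmin
    fmax=7600,   # hp.fmax
)
print(mel_basis.shape)  # (n_mels, 1 + n_fft // 2) -> (80, 401)
```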
diff --git a/cog.yaml b/cog.yaml
new file mode 100644
index 000000000..f188727d7
--- /dev/null
+++ b/cog.yaml
@@ -0,0 +1,35 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+image: r8.im/devxpy/cog-wav2lip
+
+build:
+ # set to true if your model requires a GPU
+ gpu: true
+ cuda: "11.6.2"
+
+ # a list of ubuntu apt packages to install
+ system_packages:
+ - ffmpeg
+ - cmake
+
+ # python version in the form '3.8' or '3.8.12'
+ python_version: "3.8"
+
+ # a list of packages in the format <package-name>==<version>
+ python_packages:
+ - numpy==1.23.4
+ - librosa==0.7.0
+ - opencv-python==4.6.0.66
+ - torch==1.12.1+cu116 --extra-index-url=https://download.pytorch.org/whl/cu116
+ - torchvision==0.13.1+cu116 --extra-index-url=https://download.pytorch.org/whl/cu116
+ - tqdm==4.45.0
+ - numba==0.48
+ - mediapipe==0.8.11
+
+ # commands run after the environment is setup
+ run:
+ - pip install git+https://github.com/elliottzheng/batch-face.git@master
+
+# predict.py defines how predictions are run on your model
+predict: "predict.py:Predictor"
diff --git a/face_detect.py b/face_detect.py
new file mode 100644
index 000000000..fd35da2a1
--- /dev/null
+++ b/face_detect.py
@@ -0,0 +1,55 @@
+import cv2
+import mediapipe as mp
+
+mp_face_mesh = mp.solutions.face_mesh
+mp_drawing = mp.solutions.drawing_utils
+mp_drawing_styles = mp.solutions.drawing_styles
+mp_face_detection = mp.solutions.face_detection
+
+
+def face_rect(images):
+ with mp_face_detection.FaceDetection(
+ model_selection=1, min_detection_confidence=0.5
+ ) as face_detection:
+ for image_cv2 in images:
+ # Convert the BGR image to RGB and process it with MediaPipe Face Detection.
+ results = face_detection.process(cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB))
+
+ # Draw face detections of each face.
+ if not results.detections:
+ yield None
+ continue
+ for detection in results.detections:
+ yield _get_bounding_rect(image_cv2, detection)
+
+
+def _get_bounding_rect(
+ image: mp_drawing.np.ndarray,
+ detection: mp_drawing.detection_pb2.Detection,
+):
+ """
+ Stolen from mediapipe.solutions.drawing_utils.draw_detection()
+ """
+ if not detection.location_data:
+ return
+ if image.shape[2] != mp_drawing._BGR_CHANNELS:
+ raise ValueError("Input image must contain three channel bgr data.")
+ image_rows, image_cols, _ = image.shape
+
+ location = detection.location_data
+
+ # get bounding box if exists.
+ if not location.HasField("relative_bounding_box"):
+ return
+ relative_bounding_box = location.relative_bounding_box
+ rect_start_point = mp_drawing._normalized_to_pixel_coordinates(
+ relative_bounding_box.xmin, relative_bounding_box.ymin, image_cols, image_rows
+ )
+ rect_end_point = mp_drawing._normalized_to_pixel_coordinates(
+ relative_bounding_box.xmin + relative_bounding_box.width,
+ relative_bounding_box.ymin + relative_bounding_box.height,
+ image_cols,
+ image_rows,
+ )
+
+ return *rect_start_point, *rect_end_point
+
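A rough usage sketch for the `face_rect` generator above (hedged: the video path and frame-reading loop are illustrative assumptions, not code from this diff). It yields one `(x1, y1, x2, y2)` tuple per detected face, or `None` for a frame where MediaPipe finds no face:

```python
# Illustrative only: collect MediaPipe face boxes for the frames of a clip.
# "input_vid.mp4" is a placeholder path, not a file shipped with the repo.
import cv2
from face_detect import face_rect

frames = []
cap = cv2.VideoCapture("input_vid.mp4")
while True:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(frame)  # BGR frames; face_rect converts them to RGB internally
cap.release()

# face_rect is a generator: one rect (or None) per detection.
for i, rect in enumerate(face_rect(frames)):
    print(f"detection {i}: {rect}")
```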
diff --git a/face_detection/detection/sfd/sfd_detector.py b/face_detection/detection/sfd/sfd_detector.py
index 8fbce1525..d1776e4bf 100644
--- a/face_detection/detection/sfd/sfd_detector.py
+++ b/face_detection/detection/sfd/sfd_detector.py
@@ -14,8 +14,9 @@
class SFDDetector(FaceDetector):
- def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False):
- super(SFDDetector, self).__init__(device, verbose)
+ @classmethod
+ def load_model(cls, device):
+ path_to_detector = os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth')
# Initialise the face detector
if not os.path.isfile(path_to_detector):
@@ -23,10 +24,10 @@ def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path
else:
model_weights = torch.load(path_to_detector)
- self.face_detector = s3fd()
- self.face_detector.load_state_dict(model_weights)
- self.face_detector.to(device)
- self.face_detector.eval()
+ cls.face_detector = s3fd()
+ cls.face_detector.load_state_dict(model_weights)
+ cls.face_detector.to(device)
+ cls.face_detector.eval()
def detect_from_image(self, tensor_or_path):
image = self.tensor_or_path_to_ndarray(tensor_or_path)
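The `SFDDetector` change above moves weight loading out of `__init__` into a `load_model` classmethod that attaches the s3fd network to the class itself, so the weights are read from disk once per process and shared by every instance. A hedged sketch of the resulting call pattern (the image path is a placeholder and the actual call sites are not shown in this hunk):

```python
# Sketch: load s3fd weights once, then reuse the class-level detector.
import torch
from face_detection.detection.sfd.sfd_detector import SFDDetector

device = "cuda" if torch.cuda.is_available() else "cpu"

SFDDetector.load_model(device)                 # populates SFDDetector.face_detector one time
detector = SFDDetector(device, verbose=False)  # instances now share the cached model

# detect_from_image accepts an image path or array ("frame_0001.jpg" is a placeholder).
bboxes = detector.detect_from_image("frame_0001.jpg")
print(bboxes)
```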
diff --git a/inference.py b/inference.py
index 90692521e..5e1522d25 100644
--- a/inference.py
+++ b/inference.py
@@ -1,280 +1,327 @@
-from os import listdir, path
+import argparse
+import math
+import os
+import platform
+import subprocess
+
+import cv2
import numpy as np
-import scipy, cv2, os, sys, argparse, audio
-import json, subprocess, random, string
+import torch
from tqdm import tqdm
-from glob import glob
-import torch, face_detection
+
+import audio
+# from face_detect import face_rect
from models import Wav2Lip
-import platform
+
+from batch_face import RetinaFace
+from time import time
parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')
parser.add_argument('--checkpoint_path', type=str,
- help='Name of saved checkpoint to load weights from', required=True)
+ help='Name of saved checkpoint to load weights from', required=True)
parser.add_argument('--face', type=str,
- help='Filepath of video/image that contains faces to use', required=True)
+ help='Filepath of video/image that contains faces to use', required=True)
parser.add_argument('--audio', type=str,
- help='Filepath of video/audio file to use as raw audio source', required=True)
+ help='Filepath of video/audio file to use as raw audio source', required=True)
parser.add_argument('--outfile', type=str, help='Video path to save result. See default for an e.g.',
- default='results/result_voice.mp4')
+ default='results/result_voice.mp4')
parser.add_argument('--static', type=bool,
- help='If True, then use only first video frame for inference', default=False)
+ help='If True, then use only first video frame for inference', default=False)
parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)',
- default=25., required=False)
+ default=25., required=False)
parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
- help='Padding (top, bottom, left, right). Please adjust to include chin at least')
+ help='Padding (top, bottom, left, right). Please adjust to include chin at least')
-parser.add_argument('--face_det_batch_size', type=int,
- help='Batch size for face detection', default=16)
parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128)
-parser.add_argument('--resize_factor', default=1, type=int,
- help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p')
+parser.add_argument('--resize_factor', default=1, type=int,
+ help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p')
+
+parser.add_argument('--out_height', default=480, type=int,
+ help='Output video height. Best results are obtained at 480 or 720')
-parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1],
- help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. '
- 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width')
+parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1],
+ help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. '
+                    'Useful if multiple faces are present. -1 implies the value will be auto-inferred based on height, width')
parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1],
- help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.'
- 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).')
+ help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.'
+ 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).')
parser.add_argument('--rotate', default=False, action='store_true',
- help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.'
- 'Use if you get a flipped result, despite feeding a normal looking video')
+ help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.'
+ 'Use if you get a flipped result, despite feeding a normal looking video')
parser.add_argument('--nosmooth', default=False, action='store_true',
- help='Prevent smoothing face detections over a short temporal window')
+ help='Prevent smoothing face detections over a short temporal window')
-args = parser.parse_args()
-args.img_size = 96
-
-if os.path.isfile(args.face) and args.face.split('.')[1] in ['jpg', 'png', 'jpeg']:
- args.static = True
def get_smoothened_boxes(boxes, T):
- for i in range(len(boxes)):
- if i + T > len(boxes):
- window = boxes[len(boxes) - T:]
- else:
- window = boxes[i : i + T]
- boxes[i] = np.mean(window, axis=0)
- return boxes
+ for i in range(len(boxes)):
+ if i + T > len(boxes):
+ window = boxes[len(boxes) - T:]
+ else:
+ window = boxes[i : i + T]
+ boxes[i] = np.mean(window, axis=0)
+ return boxes
def face_detect(images):
- detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
- flip_input=False, device=device)
-
- batch_size = args.face_det_batch_size
-
- while 1:
- predictions = []
- try:
- for i in tqdm(range(0, len(images), batch_size)):
- predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
- except RuntimeError:
- if batch_size == 1:
- raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument')
- batch_size //= 2
- print('Recovering from OOM error; New batch size: {}'.format(batch_size))
- continue
- break
-
- results = []
- pady1, pady2, padx1, padx2 = args.pads
- for rect, image in zip(predictions, images):
- if rect is None:
- cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected.
- raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')
-
- y1 = max(0, rect[1] - pady1)
- y2 = min(image.shape[0], rect[3] + pady2)
- x1 = max(0, rect[0] - padx1)
- x2 = min(image.shape[1], rect[2] + padx2)
-
- results.append([x1, y1, x2, y2])
-
- boxes = np.array(results)
- if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5)
- results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
-
- del detector
- return results
+ results = []
+ pady1, pady2, padx1, padx2 = args.pads
+
+ s = time()
+
+ for image, rect in zip(images, face_rect(images)):
+ if rect is None:
+ cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected.
+ raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')
+
+ y1 = max(0, rect[1] - pady1)
+ y2 = min(image.shape[0], rect[3] + pady2)
+ x1 = max(0, rect[0] - padx1)
+ x2 = min(image.shape[1], rect[2] + padx2)
+
+ results.append([x1, y1, x2, y2])
+
+ print('face detect time:', time() - s)
+
+ boxes = np.array(results)
+ if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5)
+ results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
+
+ return results
+
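
For reference, a small standalone illustration (not part of the patch) of the smoothing applied above when --nosmooth is not passed: get_smoothened_boxes replaces each detection with the mean of a short sliding window, which damps frame-to-frame jitter in the face boxes.

    import numpy as np

    def smooth(boxes, T=5):
        # same logic as get_smoothened_boxes, applied to a copy
        boxes = boxes.astype(float).copy()
        for i in range(len(boxes)):
            window = boxes[len(boxes) - T:] if i + T > len(boxes) else boxes[i:i + T]
            boxes[i] = np.mean(window, axis=0)
        return boxes

    jittery = np.array([[10, 10, 50, 50], [14, 8, 54, 48], [9, 12, 49, 52]])
    print(smooth(jittery, T=2))   # neighbouring boxes get averaged together
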
def datagen(frames, mels):
- img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
- if args.box[0] == -1:
- if not args.static:
- face_det_results = face_detect(frames) # BGR2RGB for CNN face detection
- else:
- face_det_results = face_detect([frames[0]])
- else:
- print('Using the specified bounding box instead of face detection...')
- y1, y2, x1, x2 = args.box
- face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
+ if args.box[0] == -1:
+ if not args.static:
+ face_det_results = face_detect(frames) # BGR2RGB for CNN face detection
+ else:
+ face_det_results = face_detect([frames[0]])
+ else:
+ print('Using the specified bounding box instead of face detection...')
+ y1, y2, x1, x2 = args.box
+ face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames]
- for i, m in enumerate(mels):
- idx = 0 if args.static else i%len(frames)
- frame_to_save = frames[idx].copy()
- face, coords = face_det_results[idx].copy()
+ for i, m in enumerate(mels):
+ idx = 0 if args.static else i%len(frames)
+ frame_to_save = frames[idx].copy()
+ face, coords = face_det_results[idx].copy()
- face = cv2.resize(face, (args.img_size, args.img_size))
-
- img_batch.append(face)
- mel_batch.append(m)
- frame_batch.append(frame_to_save)
- coords_batch.append(coords)
+ face = cv2.resize(face, (args.img_size, args.img_size))
- if len(img_batch) >= args.wav2lip_batch_size:
- img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+ img_batch.append(face)
+ mel_batch.append(m)
+ frame_batch.append(frame_to_save)
+ coords_batch.append(coords)
- img_masked = img_batch.copy()
- img_masked[:, args.img_size//2:] = 0
+ if len(img_batch) >= args.wav2lip_batch_size:
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
- img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
- mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+ img_masked = img_batch.copy()
+ img_masked[:, args.img_size//2:] = 0
- yield img_batch, mel_batch, frame_batch, coords_batch
- img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
- if len(img_batch) > 0:
- img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+ yield img_batch, mel_batch, frame_batch, coords_batch
+ img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
- img_masked = img_batch.copy()
- img_masked[:, args.img_size//2:] = 0
+ if len(img_batch) > 0:
+ img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
- img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
- mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+ img_masked = img_batch.copy()
+ img_masked[:, args.img_size//2:] = 0
- yield img_batch, mel_batch, frame_batch, coords_batch
+ img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+ mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+
+ yield img_batch, mel_batch, frame_batch, coords_batch
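
A short sketch (not part of the patch) of the input convention datagen prepares for Wav2Lip: the lower half of each resized face crop is blanked out, and the masked copy is concatenated with the original along the channel axis, so the model receives a 6-channel image scaled to [0, 1].

    import numpy as np

    img_size = 96                                                # matches args.img_size set in main()
    img_batch = np.random.rand(4, img_size, img_size, 3) * 255.  # four dummy face crops
    img_masked = img_batch.copy()
    img_masked[:, img_size // 2:] = 0                            # hide the mouth region
    model_input = np.concatenate((img_masked, img_batch), axis=3) / 255.
    assert model_input.shape == (4, 96, 96, 6)
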
mel_step_size = 16
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} for inference.'.format(device))
def _load(checkpoint_path):
- if device == 'cuda':
- checkpoint = torch.load(checkpoint_path)
- else:
- checkpoint = torch.load(checkpoint_path,
- map_location=lambda storage, loc: storage)
- return checkpoint
+ if device == 'cuda':
+ checkpoint = torch.load(checkpoint_path)
+ else:
+ checkpoint = torch.load(checkpoint_path,
+ map_location=lambda storage, loc: storage)
+ return checkpoint
def load_model(path):
- model = Wav2Lip()
- print("Load checkpoint from: {}".format(path))
- checkpoint = _load(path)
- s = checkpoint["state_dict"]
- new_s = {}
- for k, v in s.items():
- new_s[k.replace('module.', '')] = v
- model.load_state_dict(new_s)
-
- model = model.to(device)
- return model.eval()
+ model = Wav2Lip()
+ print("Load checkpoint from: {}".format(path))
+ checkpoint = _load(path)
+ s = checkpoint["state_dict"]
+ new_s = {}
+ for k, v in s.items():
+ new_s[k.replace('module.', '')] = v
+ model.load_state_dict(new_s)
+
+ model = model.to(device)
+ return model.eval()
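
The key renaming in load_model exists because checkpoints saved from a torch.nn.DataParallel-wrapped model prefix every state_dict key with "module."; a one-line equivalent of the loop above (sketch only) is:

    new_s = {k.replace('module.', ''): v for k, v in checkpoint["state_dict"].items()}
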
def main():
- if not os.path.isfile(args.face):
- raise ValueError('--face argument must be a valid path to video/image file')
+ args.img_size = 96
+
+    if os.path.isfile(args.face) and args.face.split('.')[-1] in ['jpg', 'png', 'jpeg']:
+ args.static = True
+
+ if not os.path.isfile(args.face):
+ raise ValueError('--face argument must be a valid path to video/image file')
+
+    elif args.face.split('.')[-1] in ['jpg', 'png', 'jpeg']:
+ full_frames = [cv2.imread(args.face)]
+ fps = args.fps
+
+ else:
+ video_stream = cv2.VideoCapture(args.face)
+ fps = video_stream.get(cv2.CAP_PROP_FPS)
+
+ print('Reading video frames...')
+
+ full_frames = []
+ while 1:
+ still_reading, frame = video_stream.read()
+ if not still_reading:
+ video_stream.release()
+ break
+
+ aspect_ratio = frame.shape[1] / frame.shape[0]
+ frame = cv2.resize(frame, (int(args.out_height * aspect_ratio), args.out_height))
+ # if args.resize_factor > 1:
+ # frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor))
+
+ if args.rotate:
+                frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
+
+ y1, y2, x1, x2 = args.crop
+ if x2 == -1: x2 = frame.shape[1]
+ if y2 == -1: y2 = frame.shape[0]
+
+ frame = frame[y1:y2, x1:x2]
+
+ full_frames.append(frame)
+
+ print ("Number of frames available for inference: "+str(len(full_frames)))
+
+ if not args.audio.endswith('.wav'):
+ print('Extracting raw audio...')
+ # command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav')
+ # subprocess.call(command, shell=True)
+ subprocess.check_call([
+ "ffmpeg", "-y",
+ "-i", args.audio,
+ "temp/temp.wav",
+ ])
+ args.audio = 'temp/temp.wav'
- elif args.face.split('.')[1] in ['jpg', 'png', 'jpeg']:
- full_frames = [cv2.imread(args.face)]
- fps = args.fps
+ wav = audio.load_wav(args.audio, 16000)
+ mel = audio.melspectrogram(wav)
+ print(mel.shape)
- else:
- video_stream = cv2.VideoCapture(args.face)
- fps = video_stream.get(cv2.CAP_PROP_FPS)
+ if np.isnan(mel.reshape(-1)).sum() > 0:
+ raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
- print('Reading video frames...')
+ mel_chunks = []
+ mel_idx_multiplier = 80./fps
+ i = 0
+ while 1:
+ start_idx = int(i * mel_idx_multiplier)
+ if start_idx + mel_step_size > len(mel[0]):
+ mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
+ break
+ mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
+ i += 1
- full_frames = []
- while 1:
- still_reading, frame = video_stream.read()
- if not still_reading:
- video_stream.release()
- break
- if args.resize_factor > 1:
- frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor))
+ print("Length of mel chunks: {}".format(len(mel_chunks)))
- if args.rotate:
- frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE)
+ full_frames = full_frames[:len(mel_chunks)]
- y1, y2, x1, x2 = args.crop
- if x2 == -1: x2 = frame.shape[1]
- if y2 == -1: y2 = frame.shape[0]
+ batch_size = args.wav2lip_batch_size
+ gen = datagen(full_frames.copy(), mel_chunks)
- frame = frame[y1:y2, x1:x2]
+ s = time()
- full_frames.append(frame)
+ for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen,
+ total=int(np.ceil(float(len(mel_chunks))/batch_size)))):
+ if i == 0:
+ frame_h, frame_w = full_frames[0].shape[:-1]
+ out = cv2.VideoWriter('temp/result.avi',
+ cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
- print ("Number of frames available for inference: "+str(len(full_frames)))
+ img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
+ mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
- if not args.audio.endswith('.wav'):
- print('Extracting raw audio...')
- command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav')
+ with torch.no_grad():
+ pred = model(mel_batch, img_batch)
- subprocess.call(command, shell=True)
- args.audio = 'temp/temp.wav'
+ pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
- wav = audio.load_wav(args.audio, 16000)
- mel = audio.melspectrogram(wav)
- print(mel.shape)
+ for p, f, c in zip(pred, frames, coords):
+ y1, y2, x1, x2 = c
+ p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
- if np.isnan(mel.reshape(-1)).sum() > 0:
- raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
+ f[y1:y2, x1:x2] = p
+ out.write(f)
- mel_chunks = []
- mel_idx_multiplier = 80./fps
- i = 0
- while 1:
- start_idx = int(i * mel_idx_multiplier)
- if start_idx + mel_step_size > len(mel[0]):
- mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
- break
- mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
- i += 1
+ out.release()
- print("Length of mel chunks: {}".format(len(mel_chunks)))
+ print("wav2lip prediction time:", time() - s)
- full_frames = full_frames[:len(mel_chunks)]
+ subprocess.check_call([
+ "ffmpeg", "-y",
+ # "-vsync", "0", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda",
+ "-i", "temp/result.avi",
+ "-i", args.audio,
+ # "-c:v", "h264_nvenc",
+ args.outfile,
+ ])
- batch_size = args.wav2lip_batch_size
- gen = datagen(full_frames.copy(), mel_chunks)
+model = detector = detector_model = None
- for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen,
- total=int(np.ceil(float(len(mel_chunks))/batch_size)))):
- if i == 0:
- model = load_model(args.checkpoint_path)
- print ("Model loaded")
+def do_load(checkpoint_path):
+ global model, detector, detector_model
- frame_h, frame_w = full_frames[0].shape[:-1]
- out = cv2.VideoWriter('temp/result.avi',
- cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
+ model = load_model(checkpoint_path)
- img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
- mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
+ # SFDDetector.load_model(device)
+ # detector = RetinaFace(gpu_id=0, model_path="checkpoints/mobilenet.pth", network="mobilenet")
+ # detector = RetinaFace(gpu_id=0, model_path="checkpoints/resnet50.pth", network="resnet50")
+ if torch.cuda.is_available():
+ detector = RetinaFace(gpu_id=0, model_path="checkpoints/mobilenet.pth", network="mobilenet")
+ else:
+        detector = RetinaFace(model_path="checkpoints/mobilenet.pth", network="mobilenet")
+
+ detector_model = detector.model
- with torch.no_grad():
- pred = model(mel_batch, img_batch)
+ print("Models loaded")
- pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
-
- for p, f, c in zip(pred, frames, coords):
- y1, y2, x1, x2 = c
- p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
- f[y1:y2, x1:x2] = p
- out.write(f)
+face_batch_size = 64 * 8
- out.release()
+def face_rect(images):
+ num_batches = math.ceil(len(images) / face_batch_size)
+ prev_ret = None
+ for i in range(num_batches):
+ batch = images[i * face_batch_size: (i + 1) * face_batch_size]
+ all_faces = detector(batch) # return faces list of all images
+ for faces in all_faces:
+ if faces:
+ box, landmarks, score = faces[0]
+ prev_ret = tuple(map(int, box))
+ yield prev_ret
- command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(args.audio, 'temp/result.avi', args.outfile)
- subprocess.call(command, shell=platform.system() != 'Windows')
if __name__ == '__main__':
- main()
+ args = parser.parse_args()
+ do_load(args.checkpoint_path)
+ main()
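
A usage sketch (not part of the patch) of the module-level API this rewrite exposes: do_load() initialises the Wav2Lip model and the RetinaFace detector exactly once, after which main() can be driven either from the command line or programmatically by assigning inference.args, which is exactly what predict.py below does. The input paths here are placeholders, and the checkpoints are assumed to have been fetched by scripts/download_models.sh.

    import inference

    inference.do_load("checkpoints/wav2lip_gan.pth")
    inference.args = inference.parser.parse_args([
        "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
        "--face", "examples/face.mp4",       # placeholder input video
        "--audio", "examples/speech.wav",    # placeholder audio
    ])
    inference.main()                         # writes results/result_voice.mp4
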
diff --git a/predict.py b/predict.py
new file mode 100644
index 000000000..7fbc7eba6
--- /dev/null
+++ b/predict.py
@@ -0,0 +1,144 @@
+# Prediction interface for Cog ⚙️
+# https://github.com/replicate/cog/blob/main/docs/python.md
+import os
+import subprocess
+
+from cog import BasePredictor, Input, Path
+
+import inference
+
+from time import time
+
+from functools import wraps
+import torch
+
+
+def make_mem_efficient(cls: BasePredictor):
+ if not torch.cuda.is_available():
+ return cls
+
+ old_setup = cls.setup
+ old_predict = cls.predict
+
+ @wraps(old_setup)
+ def new_setup(self, *args, **kwargs):
+ ret = old_setup(self, *args, **kwargs)
+ _move_to(self, "cpu")
+ return ret
+
+ @wraps(old_predict)
+ def new_predict(self, *args, **kwargs):
+ _move_to(self, "cuda")
+ try:
+ ret = old_predict(self, *args, **kwargs)
+ finally:
+ _move_to(self, "cpu")
+ return ret
+
+ cls.setup = new_setup
+ cls.predict = new_predict
+
+ return cls
+
+
+def _move_to(self, device):
+ try:
+ self = self.cached_models
+ except AttributeError:
+ pass
+ for attr, value in vars(self).items():
+ try:
+ value = value.to(device)
+ except AttributeError:
+ pass
+ else:
+ print(f"Moving {self.__name__}.{attr} to {device}")
+ setattr(self, attr, value)
+ torch.cuda.empty_cache()
+
+
+@make_mem_efficient
+class Predictor(BasePredictor):
+ cached_models = inference
+
+ def setup(self):
+ inference.do_load("checkpoints/wav2lip_gan.pth")
+
+ def predict(
+ self,
+ face: Path = Input(description="video/image that contains faces to use"),
+ audio: Path = Input(description="video/audio file to use as raw audio source"),
+ pads: str = Input(
+ description="Padding for the detected face bounding box.\n"
+ "Please adjust to include chin at least\n"
+ 'Format: "top bottom left right"',
+ default="0 10 0 0",
+ ),
+ smooth: bool = Input(
+ description="Smooth face detections over a short temporal window",
+ default=True,
+ ),
+ fps: float = Input(
+ description="Can be specified only if input is a static image",
+ default=25.0,
+ ),
+ out_height: int = Input(
+ description="Output video height. Best results are obtained at 480 or 720",
+ default=480,
+ ),
+ ) -> Path:
+ try:
+ os.remove("results/result_voice.mp4")
+ except FileNotFoundError:
+ pass
+
+ face_ext = os.path.splitext(face)[-1]
+        if face_ext not in [".mp4", ".mov", ".png", ".jpg", ".jpeg", ".gif", ".mkv", ".webp"]:
+ raise ValueError(f'Unsupported face format {face_ext!r}')
+
+ audio_ext = os.path.splitext(audio)[-1]
+ if audio_ext not in [".wav", ".mp3"]:
+ raise ValueError(f'Unsupported audio format {audio_ext!r}')
+
+ args = [
+ "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
+ "--face", str(face),
+ "--audio", str(audio),
+ "--pads", *pads.split(" "),
+ "--fps", str(fps),
+ "--out_height", str(out_height),
+ ]
+ if not smooth:
+ args += ["--nosmooth"]
+
+ print("-> run:", " ".join(args))
+ inference.args = inference.parser.parse_args(args)
+
+ s = time()
+
+ try:
+ inference.main()
+ except ValueError as e:
+ print('-> Encountered error, skipping lipsync:', e)
+
+ args = [
+ "ffmpeg", "-y",
+ # "-vsync", "0", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda",
+ "-stream_loop", "-1",
+ "-i", str(face),
+ "-i", str(audio),
+ "-shortest",
+ "-fflags", "+shortest",
+ "-max_interleave_delta", "100M",
+ "-map", "0:v:0",
+ "-map", "1:a:0",
+ # "-c", "copy",
+ # "-c:v", "h264_nvenc",
+ "results/result_voice.mp4",
+ ]
+ print("-> run:", " ".join(args))
+ print(subprocess.check_output(args, encoding="utf-8"))
+
+ print(time() - s)
+
+ return Path("results/result_voice.mp4")
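
A brief sketch (not part of the patch) of what @make_mem_efficient does at runtime when CUDA is available: after setup() the torch modules reachable through the inference module (the Wav2Lip model and the RetinaFace backbone exposed as detector_model) are parked on the CPU, and each predict() call moves them to the GPU for the duration of the call and back to the CPU in the finally block, so GPU memory is only held while a request is in flight.

    from predict import Predictor

    predictor = Predictor()
    predictor.setup()      # loads the weights, then offloads them to "cpu"
    # predictor.predict(face=..., audio=..., pads="0 10 0 0",
    #                   smooth=True, fps=25.0, out_height=480)
    # each call hops the models to "cuda", runs, then returns them to "cpu"
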
diff --git a/requirements.txt b/requirements.txt
index bfd428ab9..b16f1dabf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,73 @@
-librosa==0.7.0
-numpy==1.17.1
-opencv-contrib-python>=4.2.0.34
-opencv-python==4.1.0.25
-torch==1.1.0
-torchvision==0.3.0
-tqdm==4.45.0
-numba==0.48
+absl-py==2.1.0
+attrs==24.2.0
+audioread==3.0.1
+batch-face==1.5.0.dev0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+colorama==0.4.6
+contourpy==1.3.1
+cycler==0.12.1
+decorator==5.1.1
+ffmpeg-python==0.2.0
+filelock==3.16.1
+flatbuffers==24.3.25
+fonttools==4.55.2
+fsspec==2024.10.0
+future==1.0.0
+ghc==1.0
+idna==3.10
+intel-openmp==2021.4.0
+jax==0.4.36
+jaxlib==0.4.36
+Jinja2==3.1.4
+joblib==1.4.2
+kiwisolver==1.4.7
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+MarkupSafe==3.0.2
+matplotlib==3.9.3
+mediapipe==0.10.18
+mkl==2021.4.0
+ml_dtypes==0.5.0
+mpmath==1.3.0
+msgpack==1.1.0
+networkx==3.4.2
+numba==0.60.0
+numpy==1.26.4
+opencv-contrib-python==4.10.0.84
+opencv-python==4.10.0.84
+opencv-transforms==0.0.6
+opt_einsum==3.4.0
+packaging==24.2
+pandas==2.2.3
+pillow==11.0.0
+pip==24.2
+platformdirs==4.3.6
+pooch==1.8.2
+protobuf==4.25.5
+pycparser==2.22
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+requests==2.32.3
+scikit-learn==1.5.2
+scipy==1.14.1
+sentencepiece==0.2.0
+setuptools==75.1.0
+six==1.17.0
+sixdrepnet==0.1.6
+sounddevice==0.5.1
+soundfile==0.12.1
+soxr==0.5.0.post1
+sympy==1.13.3
+tbb==2021.13.1
+threadpoolctl==3.5.0
+torch==2.3.0+cu118
+torchvision==0.18.0+cu118
+tqdm==4.67.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+wheel==0.44.0
diff --git a/requirementsCPU.txt b/requirementsCPU.txt
new file mode 100644
index 000000000..ac7cef623
--- /dev/null
+++ b/requirementsCPU.txt
@@ -0,0 +1,13 @@
+librosa
+numpy
+opencv-contrib-python
+opencv-python
+-f https://download.pytorch.org/whl/torch_stable.html
+torch
+torchvision
+tqdm
+numba
+mediapipe
+https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl
+git+https://github.com/elliottzheng/batch-face.git@master
+ffmpeg-python
diff --git a/requirements_colab.txt b/requirements_colab.txt
new file mode 100644
index 000000000..c5f75e7b4
--- /dev/null
+++ b/requirements_colab.txt
@@ -0,0 +1,7 @@
+numpy==1.23.4
+librosa
+opencv-python
+torch
+torchvision
+tqdm
+numba
diff --git a/scripts/download_models.sh b/scripts/download_models.sh
new file mode 100644
index 000000000..93049e873
--- /dev/null
+++ b/scripts/download_models.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+set -ex
+
+wget -c -O checkpoints/wav2lip_gan.pth 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA'
+wget -c -O checkpoints/mobilenet.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/mobilenet0.25_Final.pth'
+wget -c -O checkpoints/resnet50.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/Resnet50_Final.pth'
diff --git a/scripts/run-dev.sh b/scripts/run-dev.sh
new file mode 100644
index 000000000..becde83e1
--- /dev/null
+++ b/scripts/run-dev.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+NAME=wav2lip-dev
+
+set -ex
+
+docker build . -t $NAME
+docker run -it --rm \
+ --name $NAME \
+ -v $PWD/checkpoints:/src/checkpoints \
+ -p 6001:5000 \
+ --gpus all \
+ $NAME
diff --git a/scripts/run-prod.sh b/scripts/run-prod.sh
new file mode 100644
index 000000000..08f378a48
--- /dev/null
+++ b/scripts/run-prod.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+NAME=wav2lip
+
+set -x
+
+docker rm -f $NAME
+
+docker build . -t $NAME
+docker run -d --restart always \
+ --name $NAME \
+ -v $PWD/checkpoints:/src/checkpoints \
+ -p 5001:5000 \
+ --gpus all \
+ $NAME
+
+docker logs -f $NAME
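
Once either container is running, predictions can be requested over cog's HTTP API (port 5000 inside the container, published as 6001 by run-dev.sh and 5001 by run-prod.sh). A minimal Python sketch with placeholder input URLs:

    import requests

    resp = requests.post(
        "http://localhost:5001/predictions",
        json={"input": {
            "face": "https://example.com/face.mp4",      # placeholder
            "audio": "https://example.com/speech.wav",   # placeholder
        }},
    )
    print(resp.status_code)   # response JSON carries the prediction output
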