diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..d625a97d1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,57 @@ +FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 + +ARG DEBIAN_FRONTEND=noninteractive + +# install python via pyenv +RUN apt-get update && apt-get install -y --no-install-recommends \ + make \ + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + wget \ + curl \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libffi-dev \ + liblzma-dev \ + git \ + ca-certificates \ + libgl1 \ + && rm -rf /var/lib/apt/lists/* +ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH" +ARG PYTHON_VERSION=3.8 +RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \ + pyenv install $PYTHON_VERSION && \ + pyenv global $PYTHON_VERSION + +# install cog +RUN pip install cog + +# install deps +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg libsndfile1 \ + && rm -rf /var/lib/apt/lists/* + +# copy to /src +ENV WORKDIR /src +RUN mkdir -p $WORKDIR +WORKDIR $WORKDIR + +# install requirements +COPY requirements.txt . +RUN pip install -r requirements.txt +RUN pip install git+https://github.com/elliottzheng/batch-face.git@master + +# copy sources +COPY . . + +ENV PYTHONUNBUFFERED=1 + +# run cog +CMD python3 -m cog.server.http diff --git a/README.md b/README.md index 76e66198e..f0ac474a0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to |📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook |:-:|:-:|:-:|:-:|:-:| -[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) /[Updated Collab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH) +[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/github/justinjohn0306/Wav2Lip/blob/master/Wav2Lip_simplified_v5.ipynb) @@ -27,14 +27,15 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to -------- **Disclaimer** -------- -All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should only be used for research/academic/personal purposes only. As the models are trained on the LRS2 dataset, any form of commercial use is strictly prohibited. For commercial requests please contact us directly! +All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the LRS2 dataset, any form of commercial use is strictly prohibited. For commercial requests, please contact us directly! 
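The Dockerfile above ends with `CMD python3 -m cog.server.http`, so a container built from it serves the model through Cog's HTTP prediction API. A minimal sketch of exercising such a container follows; the `face`/`audio` input names are assumptions (since `predict.py` is not part of this diff), and the build/run commands in the comments are only indicative:

```python
# Sketch: call the Cog HTTP server started by the Dockerfile's CMD.
# Assumes the image was built and started along the lines of
#   docker build -t wav2lip . && docker run --gpus all -p 5000:5000 wav2lip
# and that predict.py (not shown in this diff) exposes "face" and "audio" inputs.
import base64
import requests

def data_uri(path: str, mime: str) -> str:
    # Cog accepts file inputs as URLs or data URIs; encode a local file as the latter.
    with open(path, "rb") as f:
        return f"data:{mime};base64," + base64.b64encode(f.read()).decode()

payload = {
    "input": {
        "face": data_uri("input_vid.mp4", "video/mp4"),    # assumed input name
        "audio": data_uri("input_audio.wav", "audio/wav"),  # assumed input name
    }
}
resp = requests.post("http://localhost:5000/predictions", json=payload)
resp.raise_for_status()
print(resp.json().get("status"), resp.json().get("output"))
```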
Prerequisites ------------- -- `Python 3.6` +- `Python 3.10.15` - ffmpeg: `sudo apt-get install ffmpeg` - Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a docker image is provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues. - Face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work. +- Add [mobilenet.pth](https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth) to the `checkpoints` folder, along with one of the weights files below. Getting the weights ---------- @@ -55,8 +56,8 @@ The result is saved (by default) in `results/result_voice.mp4`. You can specify ##### Tips for better results: - Experiment with the `--pads` argument to adjust the detected face bounding box. Often leads to improved results. You might need to increase the bottom padding to include the chin region. E.g. `--pads 0 20 0 0`. -- If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try. -- Experiment with the `--resize_factor` argument, to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too). +- If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try. +- Experiment with the `--resize_factor` argument, to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too). - The Wav2Lip model without GAN usually needs more experimenting with the above two to get the most ideal results, and sometimes, can give you a better result as well. Preparing LRS2 for training @@ -78,7 +79,7 @@ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` fol ```bash python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/ ``` -Additional options like `batch_size` and the number of GPUs to use in parallel to use can also be set. +Additional options like `batch_size` and the number of GPUs to use in parallel can also be set. ##### Preprocessed LRS2 folder structure ``` @@ -99,12 +100,12 @@ You can download [the pre-trained weights](#getting-the-weights) if you want to python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> ``` ##### Training the Wav2Lip models -You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). 
For the former, run: +You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run: ```bash python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_discriminator_checkpoint> ``` -To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file. +To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file. Training on datasets other than LRS2 ------------------------------------ @@ -126,7 +127,7 @@ Please check the `evaluation/` folder for the instructions. License and Citation ---------- -This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at radrabha.m@research.iiit.ac.in or prajwal.k@research.iiit.ac.in. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository: +This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at radrabha.m@research.iiit.ac.in or prajwal.k@research.iiit.ac.in. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository: ``` @inproceedings{10.1145/3394171.3413532, author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.}, @@ -147,6 +148,6 @@ series = {MM '20} ``` -Acknowledgments ---------- -Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial collab notebook. +Acknowledgements ---------- +Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial Colab notebook. 
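Since the README above refers both to checkpoints produced by `wav2lip_train.py`/`hq_wav2lip_train.py` and to the released `.pth` weights, here is a small sketch of loading one of them outside `inference.py`. It assumes the repo's usual save format (weights stored under a `state_dict` key, possibly carrying a DataParallel `module.` prefix, which `inference.py` also strips):

```python
# Sketch: load a Wav2Lip checkpoint for standalone use, assuming this repo's save format.
import torch
from models import Wav2Lip  # model definition shipped with this repository

def load_wav2lip(checkpoint_path: str, device: str) -> Wav2Lip:
    ckpt = torch.load(checkpoint_path, map_location="cpu")
    # Training wraps the model in DataParallel, so keys may be prefixed with "module."
    weights = {k.replace("module.", ""): v for k, v in ckpt["state_dict"].items()}
    model = Wav2Lip()
    model.load_state_dict(weights)
    return model.to(device).eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_wav2lip("checkpoints/wav2lip_gan.pth", device)
```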
diff --git a/Wav2Lip_simplified_V5(offline).ipynb b/Wav2Lip_simplified_V5(offline).ipynb new file mode 100644 index 000000000..8eb6d2f2e --- /dev/null +++ b/Wav2Lip_simplified_V5(offline).ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f1e90f25", + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Install dependency\n", + "!pip install ffmpeg-python\n", + "\n", + "# Step 2: Clone the Wav2Lip repository\n", + "!git clone https://github.com/justinjohn0306/Wav2Lip\n", + "\n", + "# Step 3: Download pretrained model\n", + "import requests\n", + "url = \"https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA\"\n", + "response = requests.get(url)\n", + "\n", + "with open(\"Wav2Lip/checkpoints/wav2lip_gan.pth\", \"wb\") as f:\n", + " f.write(response.content)\n", + " \n", + "# Step 4: Install the required dependencies for Wav2Lip\n", + "!cd Wav2Lip && pip install -r requirements.txt\n", + "!pip install pyaudio\n", + "\n", + "\n", + "# Step 5: Download pretrained model for face detection\n", + "url = \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\"\n", + "response = requests.get(url)\n", + "\n", + "with open(\"Wav2Lip/face_detection/detection/sfd/s3fd.pth\", \"wb\") as f:\n", + " f.write(response.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e86c988", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import subprocess\n", + "from urllib import parse as urlparse\n", + "\n", + "# Step 1: Install yt-dlp\n", + "subprocess.run(['pip', 'install', 'yt-dlp'])\n", + "\n", + "# Step 2: Define YouTube URL and Video ID\n", + "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY'\n", + "url_data = urlparse.urlparse(YOUTUBE_URL)\n", + "query = urlparse.parse_qs(url_data.query)\n", + "YOUTUBE_ID = query[\"v\"][0]\n", + "\n", + "# Remove previous input video\n", + "if os.path.isfile('input_vid.mp4'):\n", + " os.remove('input_vid.mp4')\n", + "\n", + "# Trim video (start, end) seconds\n", + "start = 35\n", + "end = 62\n", + "interval = end - start\n", + "\n", + "# Step 3: Download and trim the YouTube video\n", + "subprocess.run(['yt-dlp', '-f', 'bestvideo[ext=mp4]', '--output', \"youtube.%(ext)s\", f'https://www.youtube.com/watch?v={YOUTUBE_ID}'])\n", + "\n", + "# Cut the video using FFmpeg\n", + "subprocess.run(['ffmpeg', '-y', '-i', 'youtube.mp4', '-ss', str(start), '-t', str(interval), '-async', '1', 'input_vid.mp4'])\n", + "\n", + "# Display video.\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "\n", + "def show_video(path):\n", + " mp4 = open(path, 'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " return HTML(f\"\"\"\"\"\")\n", + "\n", + "# Preview the trimmed video\n", + "show_video('input_vid.mp4')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7da8e818", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import os\n", + "from IPython.display import Audio\n", + "from IPython.core.display import display\n", + "\n", + "upload_method = 'Path' # Change this to 'Record' or 'Path'\n", + "\n", + "# Remove previous input audio\n", + "if os.path.isfile('input_audio.wav'):\n", + " os.remove('input_audio.wav')\n", + "\n", + "def display_audio():\n", + " display(Audio('input_audio.wav'))\n", + "\n", + "if upload_method == 'Record':\n", + " import 
pyaudio\n", + " import wave\n", + "\n", + " CHUNK = 1024\n", + " FORMAT = pyaudio.paInt16\n", + " CHANNELS = 1\n", + " RATE = 16000\n", + " RECORD_SECONDS = 5\n", + " WAVE_OUTPUT_FILENAME = \"input_audio.wav\"\n", + "\n", + " p = pyaudio.PyAudio()\n", + "\n", + " stream = p.open(format=FORMAT,\n", + " channels=CHANNELS,\n", + " rate=RATE,\n", + " input=True,\n", + " frames_per_buffer=CHUNK)\n", + "\n", + " print(\"Recording...\")\n", + "\n", + " frames = []\n", + "\n", + " for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n", + " data = stream.read(CHUNK)\n", + " frames.append(data)\n", + "\n", + " print(\"Finished recording.\")\n", + "\n", + " stream.stop_stream()\n", + " stream.close()\n", + " p.terminate()\n", + "\n", + " wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')\n", + " wf.setnchannels(CHANNELS)\n", + " wf.setsampwidth(p.get_sample_size(FORMAT))\n", + " wf.setframerate(RATE)\n", + " wf.writeframes(b''.join(frames))\n", + " wf.close()\n", + "\n", + " display_audio()\n", + "\n", + "elif upload_method == 'Path':\n", + " # Add the full path to your audio\n", + " PATH_TO_YOUR_AUDIO = 'C:/Users/justi/OneDrive/Desktop/wav2lip/Wav2Lip/input_audio.wav'\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('input_audio.wav', audio, sr, format='wav')\n", + "\n", + " display_audio()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63289945", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Define the parameters for the Wav2Lip model\n", + "pad_top = 0\n", + "pad_bottom = 10\n", + "pad_left = 0\n", + "pad_right = 0\n", + "rescaleFactor = 1\n", + "nosmooth = False\n", + "\n", + "# Set the path to the Wav2Lip model and input files\n", + "checkpoint_path = \"checkpoints/wav2lip_gan.pth\"\n", + "input_face = \"input_vid.mp4\"\n", + "input_audio = \"input_audio.wav\"\n", + "\n", + "# Run the Wav2Lip model\n", + "!cd Wav2Lip && python inference.py --checkpoint_path {checkpoint_path} --face {input_face} --audio {input_audio} --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} {\"--nosmooth\" if nosmooth else \"\"}\n", + "\n", + "# Preview the output video\n", + "print(\"Final Video Preview\")\n", + "print(\"Find the output video at\", 'Wav2Lip/results/result_voice.mp4')\n", + "show_video('Wav2Lip/results/result_voice.mp4')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fbafa56", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Wav2Lip_simplified_v4.ipynb b/Wav2Lip_simplified_v4.ipynb new file mode 100644 index 000000000..5cc33bf16 --- /dev/null +++ b/Wav2Lip_simplified_v4.ipynb @@ -0,0 +1,482 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "**Fixes 
by: [justinjohn-03](https://github.com/justinjohn0306)**" + ], + "metadata": { + "id": "9Uyk6DCBGHuW" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U1xFNFU58_2j" + }, + "source": [ + "## Goal: Make anyone speak anything (LipSync)\n", + "\n", + "* Github: https://github.com/Rudrabha/Wav2Lip\n", + "* Paper: https://arxiv.org/abs/2008.10010\n", + "*Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Qgo-oaI3JU2u", + "cellView": "form" + }, + "source": [ + "#@title
Step1: Setup Wav2Lip
\n", + "#@markdown * Install dependency\n", + "#@markdown * Download pretrained model\n", + "!rm -rf /content/sample_data\n", + "!mkdir /content/sample_data\n", + "\n", + "!git clone https://github.com/zabique/Wav2Lip\n", + "\n", + "#download the pretrained model\n", + "!wget 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'\n", + "a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl\n", + "\n", + "# !pip uninstall tensorflow tensorflow-gpu\n", + "!cd Wav2Lip && pip install -r requirements.txt\n", + "\n", + "#download pretrained model for face detection\n", + "!wget \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\" -O \"/content/Wav2Lip/face_detection/detection/sfd/s3fd.pth\"\n", + "\n", + "!pip install -q youtube-dl\n", + "!pip install ffmpeg-python\n", + "!pip install librosa==0.9.1\n", + "\n", + "#this code for recording audio\n", + "\"\"\"\n", + "To write this piece of code I took inspiration/code from a lot of places.\n", + "It was late night, so I'm not sure how much I created or just copied o.O\n", + "Here are some of the possible references:\n", + "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n", + "https://stackoverflow.com/a/18650249\n", + "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n", + "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n", + "https://stackoverflow.com/a/49019356\n", + "\"\"\"\n", + "from IPython.display import HTML, Audio\n", + "from google.colab.output import eval_js\n", + "from base64 import b64decode\n", + "import numpy as np\n", + "from scipy.io.wavfile import read as wav_read\n", + "import io\n", + "import ffmpeg\n", + "\n", + "AUDIO_HTML = \"\"\"\n", + "\n", + "\"\"\"\n", + "\n", + "%cd /\n", + "from ghc.l_ghc_cf import l_ghc_cf\n", + "%cd content\n", + "\n", + "def get_audio():\n", + " display(HTML(AUDIO_HTML))\n", + " data = eval_js(\"data\")\n", + " binary = b64decode(data.split(',')[1])\n", + " \n", + " process = (ffmpeg\n", + " .input('pipe:0')\n", + " .output('pipe:1', format='wav')\n", + " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", + " )\n", + " output, err = process.communicate(input=binary)\n", + " \n", + " riff_chunk_size = len(output) - 8\n", + " # Break up the chunk size into four bytes, held in b.\n", + " q = riff_chunk_size\n", + " b = []\n", + " for i in range(4):\n", + " q, r = divmod(q, 256)\n", + " b.append(r)\n", + "\n", + " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n", + " riff = output[:4] + bytes(b) + output[8:]\n", + "\n", + " sr, audio = wav_read(io.BytesIO(riff))\n", + "\n", + " return audio, sr\n", + "\n", + "\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "def showVideo(path):\n", + " mp4 = open(str(path),'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " return HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url)\n", + "\n", + "from IPython.display import clear_output" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEdy6PWDXMRL" + }, + "source": [ + "# LipSync Youtube Video" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "QI4kcm8QEeGZ", + "cellView": "form" + }, + 
"source": [ + "#@title STEP2: Select a Youtube Video\n", + "# Install yt-dlp\n", + "!pip install yt-dlp\n", + "\n", + "#@markdown ### Find YouTube video ID from URL\n", + "from urllib import parse as urlparse\n", + "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY' #@param {type:\"string\"}\n", + "url_data = urlparse.urlparse(YOUTUBE_URL)\n", + "query = urlparse.parse_qs(url_data.query)\n", + "YOUTUBE_ID = query[\"v\"][0]\n", + "\n", + "#@markdown ### Trim the video (start, end) seconds\n", + "start = 35 #@param {type:\"integer\"}\n", + "end = 62 #@param {type:\"integer\"}\n", + "interval = end - start\n", + "\n", + "# Download the YouTube video using yt-dlp\n", + "!yt-dlp -f 'bestvideo[ext=mp4]' --output \"youtube.%(ext)s\" https://www.youtube.com/watch?v=$YOUTUBE_ID\n", + "\n", + "# Cut the video using FFmpeg\n", + "!ffmpeg -y -i youtube.mp4 -ss {start} -t {interval} -async 1 /content/sample_data/input_vid.mp4\n", + "\n", + "# Preview the trimmed video\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "mp4 = open('/content/sample_data/input_vid.mp4','rb').read()\n", + "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + "HTML(f\"\"\"\"\"\")\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zS_RAeh-IfZy", + "cellView": "form" + }, + "source": [ + "#@title STEP3: Select Audio (Record or Upload)\n", + "from IPython.display import Audio \n", + "from IPython.core.display import display\n", + "\n", + "record_or_upload = 'Upload' #@param ['Record', 'Upload']\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "if record_or_upload == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "elif record_or_upload == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + " \n", + " #concider only the first file\n", + " audio_file = str(list(uploaded.keys())[0])\n", + " \n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(audio_file, sr=None)\n", + " \n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + " \n", + " clear_output()\n", + " displayAudio()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BQPLXJ8L0gms", + "cellView": "form" + }, + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = False #@param {type:\"boolean\"}\n", + "\n", + "\n", + "if nosmooth == False:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path 
checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "#Preview output video\n", + "clear_output()\n", + "print(\"Final Video Preview\")\n", + "print(\"Download this video from\", '/content/Wav2Lip/results/result_voice.mp4')\n", + "showVideo('/content/Wav2Lip/results/result_voice.mp4')\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vYxpPeie1CYL" + }, + "source": [ + "# LipSync on Your Video File" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nDuM7tfZ1F0t", + "cellView": "form" + }, + "source": [ + "import os\n", + "from google.colab import files\n", + "from IPython.display import HTML\n", + "\n", + "def showVideo(file_path):\n", + " \"\"\"Function to display video in Colab\"\"\"\n", + " mp4 = open(file_path,'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " display(HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url))\n", + "\n", + "#@markdown ### Select an uploading method\n", + "upload_or_path = \"Upload\" #@param [\"Upload\", \"Custom Path\"]\n", + "\n", + "if upload_or_path == \"Upload\":\n", + " uploaded = files.upload()\n", + " for filename in uploaded.keys():\n", + " os.rename(filename, '/content/sample_data/input_vid.mp4')\n", + " PATH_TO_YOUR_VIDEO = '/content/sample_data/input_vid.mp4'\n", + "else:\n", + " PATH_TO_YOUR_VIDEO = '/content/test.mp4' #@param {type:\"string\"}\n", + " if not os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " print(\"ERROR: File not found!\")\n", + " raise SystemExit(0)\n", + "\n", + "#@markdown ### Trim the video (start, end) seconds\n", + "start_time = 0 #@param {type:\"integer\"}\n", + "end_time = 0 #@param {type:\"integer\"}\n", + "\n", + "if start_time == 0 and end_time == 0:\n", + " print(\"No trimming applied\")\n", + "else:\n", + " duration = end_time - start_time\n", + " os.system(f\"ffmpeg -i {PATH_TO_YOUR_VIDEO} -ss {start_time} -t {duration} -async 1 /content/sample_data/trimmed_vid.mp4\")\n", + " PATH_TO_YOUR_VIDEO = \"/content/sample_data/input_vid.mp4\"\n", + " print(f\"Video trimmed from {start_time} to {end_time} seconds\")\n", + "\n", + "print(f\"PATH_TO_YOUR_VIDEO: {PATH_TO_YOUR_VIDEO}\")\n", + "\n", + "if upload_or_path == \"Upload\":\n", + " clear_output()\n", + " print(\"Input Video\")\n", + " showVideo(PATH_TO_YOUR_VIDEO)\n", + "else:\n", + " if os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " print(\"Input Video\")\n", + " showVideo(PATH_TO_YOUR_VIDEO)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "XgF4794r7sWK", + "cellView": "form" + }, + "source": [ + "#@title STEP3: Select Audio (Record or Upload)\n", + "from IPython.display import Audio \n", + "from IPython.core.display import display\n", + "\n", + "record_or_upload = 'Upload' #@param ['Record', 'Upload']\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "if record_or_upload == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "elif record_or_upload == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + " \n", + " 
#concider only the first file\n", + " audio_file = str(list(uploaded.keys())[0])\n", + " \n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(audio_file, sr=None)\n", + " \n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + " \n", + " clear_output()\n", + " displayAudio()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZgtO08V28ANf", + "cellView": "form" + }, + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = False #@param {type:\"boolean\"}\n", + "\n", + "if nosmooth == False:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "\n", + "#Preview output video\n", + "clear_output()\n", + "print(\"Final Video Preview\")\n", + "print(\"Dowload this video from\", '/content/Wav2Lip/results/result_voice.mp4')\n", + "showVideo('/content/Wav2Lip/results/result_voice.mp4')\n" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Wav2Lip_simplified_v5.ipynb b/Wav2Lip_simplified_v5.ipynb new file mode 100644 index 000000000..308f3bd76 --- /dev/null +++ b/Wav2Lip_simplified_v5.ipynb @@ -0,0 +1,645 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "U1xFNFU58_2j" + }, + "source": [ + "## Goal: Make anyone speak anything (LipSync)\n", + "\n", + "* Github: https://github.com/Rudrabha/Wav2Lip\n", + "* Paper: https://arxiv.org/abs/2008.10010\n", + "*Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing\n", + "\n", + "\n", + "\n", + "\n", + "**Modded by: [justinjohn-03](https://github.com/justinjohn0306)**\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "Qgo-oaI3JU2u" + }, + "outputs": [], + "source": [ + "#@title
Step1: Setup Wav2Lip
\n", + "#@markdown * Install dependency\n", + "#@markdown * Download pretrained model\n", + "from IPython.display import HTML, clear_output\n", + "!rm -rf /content/sample_data\n", + "!mkdir /content/sample_data\n", + "\n", + "!git clone https://github.com/justinjohn0306/Wav2Lip\n", + "\n", + "%cd /content/Wav2Lip\n", + "\n", + "#download the pretrained model\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O 'checkpoints/wav2lip.pth'\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O 'checkpoints/wav2lip_gan.pth'\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O 'checkpoints/resnet50.pth'\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O 'checkpoints/mobilenet.pth'\n", + "a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl\n", + "!pip install git+https://github.com/elliottzheng/batch-face.git@master\n", + "\n", + "!pip install ffmpeg-python mediapipe==0.10.18\n", + "\n", + "#this code for recording audio\n", + "\"\"\"\n", + "To write this piece of code I took inspiration/code from a lot of places.\n", + "It was late night, so I'm not sure how much I created or just copied o.O\n", + "Here are some of the possible references:\n", + "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n", + "https://stackoverflow.com/a/18650249\n", + "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n", + "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n", + "https://stackoverflow.com/a/49019356\n", + "\"\"\"\n", + "from IPython.display import HTML, Audio\n", + "from google.colab.output import eval_js\n", + "from base64 import b64decode\n", + "import numpy as np\n", + "from scipy.io.wavfile import read as wav_read\n", + "import io\n", + "import ffmpeg\n", + "\n", + "AUDIO_HTML = \"\"\"\n", + "\n", + "\"\"\"\n", + "\n", + "%cd /\n", + "from ghc.l_ghc_cf import l_ghc_cf\n", + "%cd content\n", + "\n", + "def get_audio():\n", + " display(HTML(AUDIO_HTML))\n", + " data = eval_js(\"data\")\n", + " binary = b64decode(data.split(',')[1])\n", + "\n", + " process = (ffmpeg\n", + " .input('pipe:0')\n", + " .output('pipe:1', format='wav')\n", + " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", + " )\n", + " output, err = process.communicate(input=binary)\n", + "\n", + " riff_chunk_size = len(output) - 8\n", + " # Break up the chunk size into four bytes, held in b.\n", + " q = riff_chunk_size\n", + " b = []\n", + " for i in range(4):\n", + " q, r = divmod(q, 256)\n", + " b.append(r)\n", + "\n", + " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n", + " riff = output[:4] + bytes(b) + output[8:]\n", + "\n", + " sr, audio = wav_read(io.BytesIO(riff))\n", + "\n", + " return audio, sr\n", + "\n", + "\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "def showVideo(path):\n", + " mp4 = open(str(path),'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " return HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url)\n", + "\n", + "from IPython.display import clear_output\n", + "\n", + "clear_output()\n", + "print(\"All set and ready!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEdy6PWDXMRL" + }, + "source": [ + "# LipSync Youtube 
Video" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "QI4kcm8QEeGZ" + }, + "outputs": [], + "source": [ + "#@title STEP2: Select a Youtube Video\n", + "# Install yt-dlp\n", + "\n", + "import os\n", + "!pip install yt-dlp\n", + "\n", + "#@markdown ## Find YouTube video ID from URL\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown Link format:\n", + "\n", + "#@markdown ``https://youtu.be/vAnWYLTdvfY`` ❌\n", + "\n", + "#@markdown ``https://www.youtube.com/watch?v=vAnWYLTdvfY`` ✔️\n", + "\n", + "!rm -df youtube.mp4\n", + "\n", + "#@markdown ___\n", + "from urllib import parse as urlparse\n", + "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY' #@param {type:\"string\"}\n", + "url_data = urlparse.urlparse(YOUTUBE_URL)\n", + "query = urlparse.parse_qs(url_data.query)\n", + "YOUTUBE_ID = query[\"v\"][0]\n", + "\n", + "\n", + "# remove previous input video\n", + "!rm -f /content/sample_data/input_vid.mp4\n", + "\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown ### Trim the video (start, end) seconds\n", + "start = 35 #@param {type:\"integer\"}\n", + "end = 62 #@param {type:\"integer\"}\n", + "interval = end - start\n", + "\n", + "#@markdown Note: ``the trimmed video must have face on all frames``\n", + "\n", + "# Download the YouTube video using yt-dlp\n", + "!yt-dlp -f 'bestvideo[ext=mp4]' --output \"youtube.%(ext)s\" https://www.youtube.com/watch?v=$YOUTUBE_ID\n", + "\n", + "# Cut the video using FFmpeg\n", + "!ffmpeg -y -i youtube.mp4 -ss {start} -t {interval} -async 1 /content/sample_data/input_vid.mp4\n", + "\n", + "# Preview the trimmed video\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "mp4 = open('/content/sample_data/input_vid.mp4','rb').read()\n", + "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + "HTML(f\"\"\"\"\"\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "zS_RAeh-IfZy" + }, + "outputs": [], + "source": [ + "#@title STEP3: Select Audio (Record, Upload from local drive or Gdrive)\n", + "import os\n", + "from IPython.display import Audio\n", + "from IPython.core.display import display\n", + "\n", + "upload_method = 'Upload' #@param ['Record', 'Upload', 'Custom Path']\n", + "\n", + "#remove previous input audio\n", + "if os.path.isfile('/content/sample_data/input_audio.wav'):\n", + " os.remove('/content/sample_data/input_audio.wav')\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "\n", + "if upload_method == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "\n", + "elif upload_method == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + "\n", + " # Consider only the first file\n", + " PATH_TO_YOUR_AUDIO = str(list(uploaded.keys())[0])\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n", + "\n", + "elif 
upload_method == 'Custom Path':\n", + " from google.colab import drive\n", + " drive.mount('/content/drive')\n", + " #@markdown ``Add the full path to your audio on your Gdrive`` 👇\n", + " PATH_TO_YOUR_AUDIO = '/content/drive/MyDrive/test.wav' #@param {type:\"string\"}\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "BQPLXJ8L0gms" + }, + "outputs": [], + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "\n", + "%cd /content/Wav2Lip\n", + "\n", + "# Set up paths and variables for the output file\n", + "output_file_path = '/content/Wav2Lip/results/result_voice.mp4'\n", + "\n", + "# Delete existing output file before processing, if any\n", + "if os.path.exists(output_file_path):\n", + " os.remove(output_file_path)\n", + "\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = True #@param {type:\"boolean\"}\n", + "#@markdown ___\n", + "#@markdown Model selection:\n", + "use_hd_model = False #@param {type:\"boolean\"}\n", + "checkpoint_path = 'checkpoints/wav2lip.pth' if not use_hd_model else 'checkpoints/wav2lip_gan.pth'\n", + "\n", + "\n", + "if nosmooth == False:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "\n", + "#Preview output video\n", + "if os.path.exists(output_file_path):\n", + " clear_output()\n", + " print(\"Final Video Preview\")\n", + " print(\"Download this video from\", output_file_path)\n", + " showVideo(output_file_path)\n", + "else:\n", + " print(\"Processing failed. 
Output video not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vYxpPeie1CYL" + }, + "source": [ + "# LipSync on Your Video File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "nDuM7tfZ1F0t" + }, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "from google.colab import drive\n", + "from google.colab import files\n", + "from IPython.display import HTML, clear_output\n", + "from base64 import b64encode\n", + "import moviepy.editor as mp\n", + "\n", + "\n", + "def showVideo(file_path):\n", + " \"\"\"Function to display video in Colab\"\"\"\n", + " mp4 = open(file_path,'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " display(HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url))\n", + "\n", + "def get_video_resolution(video_path):\n", + " \"\"\"Function to get the resolution of a video\"\"\"\n", + " import cv2\n", + " video = cv2.VideoCapture(video_path)\n", + " width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))\n", + " height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))\n", + " return (width, height)\n", + "\n", + "def resize_video(video_path, new_resolution):\n", + " \"\"\"Function to resize a video\"\"\"\n", + " import cv2\n", + " video = cv2.VideoCapture(video_path)\n", + " fourcc = int(video.get(cv2.CAP_PROP_FOURCC))\n", + " fps = video.get(cv2.CAP_PROP_FPS)\n", + " width, height = new_resolution\n", + " output_path = os.path.splitext(video_path)[0] + '_720p.mp4'\n", + " writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))\n", + " while True:\n", + " success, frame = video.read()\n", + " if not success:\n", + " break\n", + " resized_frame = cv2.resize(frame, new_resolution)\n", + " writer.write(resized_frame)\n", + " video.release()\n", + " writer.release()\n", + "\n", + "# Mount Google Drive if it's not already mounted\n", + "if not os.path.isdir(\"/content/drive/MyDrive\"):\n", + " drive.mount('/content/drive', force_remount=True)\n", + "\n", + "#@markdown ### Select an uploading method\n", + "upload_method = \"Upload\" #@param [\"Upload\", \"Custom Path\"]\n", + "\n", + "\n", + "# remove previous input video\n", + "if os.path.isfile('/content/sample_data/input_vid.mp4'):\n", + " os.remove('/content/sample_data/input_vid.mp4')\n", + "\n", + "if upload_method == \"Upload\":\n", + " uploaded = files.upload()\n", + " for filename in uploaded.keys():\n", + " os.rename(filename, '/content/sample_data/input_vid.mp4')\n", + " PATH_TO_YOUR_VIDEO = '/content/sample_data/input_vid.mp4'\n", + "\n", + "elif upload_method == 'Custom Path':\n", + " #@markdown ``Add the full path to your video on your Gdrive `` 👇\n", + " PATH_TO_YOUR_VIDEO = '/content/drive/MyDrive/test.mp4' #@param {type:\"string\"}\n", + " if not os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " print(\"ERROR: File not found!\")\n", + " raise SystemExit(0)\n", + "\n", + "#@markdown Notes:\n", + "\n", + "#@markdown . ``If your uploaded video is 1080p or higher resolution, this cell will resize it to 720p.``\n", + "\n", + "#@markdown . ``Do not upload videos longer than 60 seconds.``\n", + "\n", + "#@markdown ___\n", + "\n", + "video_duration = mp.VideoFileClip(PATH_TO_YOUR_VIDEO).duration\n", + "if video_duration > 60:\n", + " print(\"WARNING: Video duration exceeds 60 seconds. 
Please upload a shorter video.\")\n", + " raise SystemExit(0)\n", + "\n", + "video_resolution = get_video_resolution(PATH_TO_YOUR_VIDEO)\n", + "print(f\"Video resolution: {video_resolution}\")\n", + "if video_resolution[0] >= 1920 or video_resolution[1] >= 1080:\n", + " print(\"Resizing video to 720p...\")\n", + " os.system(f\"ffmpeg -i {PATH_TO_YOUR_VIDEO} -vf scale=1280:720 /content/sample_data/input_vid.mp4\")\n", + " PATH_TO_YOUR_VIDEO = \"/content/sample_data/input_vid.mp4\"\n", + " print(\"Video resized to 720p\")\n", + "else:\n", + " print(\"No resizing needed\")\n", + "\n", + "if upload_method == \"Upload\":\n", + " clear_output()\n", + " print(\"Input Video\")\n", + " showVideo(PATH_TO_YOUR_VIDEO)\n", + "else:\n", + " if os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " # Check if the source and destination files are the same\n", + " if PATH_TO_YOUR_VIDEO != \"/content/sample_data/input_vid.mp4\":\n", + " shutil.copyfile(PATH_TO_YOUR_VIDEO, \"/content/sample_data/input_vid.mp4\")\n", + " print(\"Video copied to destination.\")\n", + "\n", + " print(\"Input Video\")\n", + " # Display the video from the destination path\n", + " showVideo(\"/content/sample_data/input_vid.mp4\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "XgF4794r7sWK" + }, + "outputs": [], + "source": [ + "#@title STEP3: Select Audio (Record, Upload from local drive or Gdrive)\n", + "import os\n", + "from IPython.display import Audio\n", + "from IPython.core.display import display\n", + "\n", + "upload_method = 'Upload' #@param ['Record', 'Upload', 'Custom Path']\n", + "\n", + "#remove previous input audio\n", + "if os.path.isfile('/content/sample_data/input_audio.wav'):\n", + " os.remove('/content/sample_data/input_audio.wav')\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "\n", + "if upload_method == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "\n", + "elif upload_method == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes.'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + "\n", + " # Consider only the first file\n", + " PATH_TO_YOUR_AUDIO = str(list(uploaded.keys())[0])\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n", + "\n", + "else: # Custom Path\n", + " from google.colab import drive\n", + " drive.mount('/content/drive')\n", + " #@markdown ``Add the full path to your audio on your Gdrive`` 👇\n", + " PATH_TO_YOUR_AUDIO = '/content/drive/MyDrive/test.wav' #@param {type:\"string\"}\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"cellView": "form", + "id": "ZgtO08V28ANf" + }, + "outputs": [], + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "\n", + "%cd /content/Wav2Lip\n", + "\n", + "# Set up paths and variables for the output file\n", + "output_file_path = '/content/Wav2Lip/results/result_voice.mp4'\n", + "\n", + "# Delete existing output file before processing, if any\n", + "if os.path.exists(output_file_path):\n", + " os.remove(output_file_path)\n", + "\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = True #@param {type:\"boolean\"}\n", + "#@markdown ___\n", + "#@markdown Model selection:\n", + "use_hd_model = False #@param {type:\"boolean\"}\n", + "checkpoint_path = 'checkpoints/wav2lip.pth' if not use_hd_model else 'checkpoints/wav2lip_gan.pth'\n", + "\n", + "\n", + "if nosmooth == False:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "\n", + "#Preview output video\n", + "if os.path.exists(output_file_path):\n", + " clear_output()\n", + " print(\"Final Video Preview\")\n", + " print(\"Download this video from\", output_file_path)\n", + " showVideo(output_file_path)\n", + "else:\n", + " print(\"Processing failed. 
Output video not found.\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "private_outputs": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/audio.py b/audio.py index 32b20c449..32ab5fabe 100644 --- a/audio.py +++ b/audio.py @@ -97,7 +97,7 @@ def _linear_to_mel(spectogram): def _build_mel_basis(): assert hp.fmax <= hp.sample_rate // 2 - return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, + return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin, fmax=hp.fmax) def _amp_to_db(x): diff --git a/cog.yaml b/cog.yaml new file mode 100644 index 000000000..f188727d7 --- /dev/null +++ b/cog.yaml @@ -0,0 +1,35 @@ +# Configuration for Cog ⚙️ +# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md + +image: r8.im/devxpy/cog-wav2lip + +build: + # set to true if your model requires a GPU + gpu: true + cuda: "11.6.2" + + # a list of ubuntu apt packages to install + system_packages: + - ffmpeg + - cmake + + # python version in the form '3.8' or '3.8.12' + python_version: "3.8" + + # a list of packages in the format == + python_packages: + - numpy==1.23.4 + - librosa==0.7.0 + - opencv-python==4.6.0.66 + - torch==1.12.1+cu116 --extra-index-url=https://download.pytorch.org/whl/cu116 + - torchvision==0.13.1+cu116 --extra-index-url=https://download.pytorch.org/whl/cu116 + - tqdm==4.45.0 + - numba==0.48 + - mediapipe==0.8.11 + + # commands run after the environment is setup + run: + - pip install git+https://github.com/elliottzheng/batch-face.git@master + +# predict.py defines how predictions are run on your model +predict: "predict.py:Predictor" diff --git a/face_detect.py b/face_detect.py new file mode 100644 index 000000000..fd35da2a1 --- /dev/null +++ b/face_detect.py @@ -0,0 +1,55 @@ +import cv2 +import mediapipe as mp + +mp_face_mesh = mp.solutions.face_mesh +mp_drawing = mp.solutions.drawing_utils +mp_drawing_styles = mp.solutions.drawing_styles +mp_face_detection = mp.solutions.face_detection + + +def face_rect(images): + with mp_face_detection.FaceDetection( + model_selection=1, min_detection_confidence=0.5 + ) as face_detection: + for image_cv2 in images: + # Convert the BGR image to RGB and process it with MediaPipe Face Detection. + results = face_detection.process(cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB)) + + # Draw face detections of each face. + if not results.detections: + yield None + for detection in results.detections: + yield _get_bounding_rect(image_cv2, detection) + + +def _get_bounding_rect( + image: mp_drawing.np.ndarray, + detection: mp_drawing.detection_pb2.Detection, +): + """ + Stolen from mediapipe.solutions.drawing_utils.draw_detection() + """ + if not detection.location_data: + return + if image.shape[2] != mp_drawing._BGR_CHANNELS: + raise ValueError("Input image must contain three channel bgr data.") + image_rows, image_cols, _ = image.shape + + location = detection.location_data + + # get bounding box if exists. 
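+    # relative_bounding_box is given in normalized [0, 1] coordinates; _normalized_to_pixel_coordinates below converts the two corner points to absolute pixel values.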
+ if not location.HasField("relative_bounding_box"): + return + relative_bounding_box = location.relative_bounding_box + rect_start_point = mp_drawing._normalized_to_pixel_coordinates( + relative_bounding_box.xmin, relative_bounding_box.ymin, image_cols, image_rows + ) + rect_end_point = mp_drawing._normalized_to_pixel_coordinates( + relative_bounding_box.xmin + relative_bounding_box.width, + relative_bounding_box.ymin + relative_bounding_box.height, + image_cols, + image_rows, + ) + + return *rect_start_point, *rect_end_point + diff --git a/face_detection/detection/sfd/sfd_detector.py b/face_detection/detection/sfd/sfd_detector.py index 8fbce1525..d1776e4bf 100644 --- a/face_detection/detection/sfd/sfd_detector.py +++ b/face_detection/detection/sfd/sfd_detector.py @@ -14,8 +14,9 @@ class SFDDetector(FaceDetector): - def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): - super(SFDDetector, self).__init__(device, verbose) + @classmethod + def load_model(cls, device): + path_to_detector = os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth') # Initialise the face detector if not os.path.isfile(path_to_detector): @@ -23,10 +24,10 @@ def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path else: model_weights = torch.load(path_to_detector) - self.face_detector = s3fd() - self.face_detector.load_state_dict(model_weights) - self.face_detector.to(device) - self.face_detector.eval() + cls.face_detector = s3fd() + cls.face_detector.load_state_dict(model_weights) + cls.face_detector.to(device) + cls.face_detector.eval() def detect_from_image(self, tensor_or_path): image = self.tensor_or_path_to_ndarray(tensor_or_path) diff --git a/inference.py b/inference.py index 90692521e..5e1522d25 100644 --- a/inference.py +++ b/inference.py @@ -1,280 +1,327 @@ -from os import listdir, path +import argparse +import math +import os +import platform +import subprocess + +import cv2 import numpy as np -import scipy, cv2, os, sys, argparse, audio -import json, subprocess, random, string +import torch from tqdm import tqdm -from glob import glob -import torch, face_detection + +import audio +# from face_detect import face_rect from models import Wav2Lip -import platform + +from batch_face import RetinaFace +from time import time parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models') parser.add_argument('--checkpoint_path', type=str, - help='Name of saved checkpoint to load weights from', required=True) + help='Name of saved checkpoint to load weights from', required=True) parser.add_argument('--face', type=str, - help='Filepath of video/image that contains faces to use', required=True) + help='Filepath of video/image that contains faces to use', required=True) parser.add_argument('--audio', type=str, - help='Filepath of video/audio file to use as raw audio source', required=True) + help='Filepath of video/audio file to use as raw audio source', required=True) parser.add_argument('--outfile', type=str, help='Video path to save result. 
See default for an e.g.', - default='results/result_voice.mp4') + default='results/result_voice.mp4') parser.add_argument('--static', type=bool, - help='If True, then use only first video frame for inference', default=False) + help='If True, then use only first video frame for inference', default=False) parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)', - default=25., required=False) + default=25., required=False) parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0], - help='Padding (top, bottom, left, right). Please adjust to include chin at least') + help='Padding (top, bottom, left, right). Please adjust to include chin at least') -parser.add_argument('--face_det_batch_size', type=int, - help='Batch size for face detection', default=16) parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128) -parser.add_argument('--resize_factor', default=1, type=int, - help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p') +parser.add_argument('--resize_factor', default=1, type=int, + help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p') + +parser.add_argument('--out_height', default=480, type=int, + help='Output video height. Best results are obtained at 480 or 720') -parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1], - help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. ' - 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width') +parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1], + help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. ' + 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width') parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1], - help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.' - 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).') + help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.' + 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).') parser.add_argument('--rotate', default=False, action='store_true', - help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.' - 'Use if you get a flipped result, despite feeding a normal looking video') + help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.' 
+ 'Use if you get a flipped result, despite feeding a normal looking video') parser.add_argument('--nosmooth', default=False, action='store_true', - help='Prevent smoothing face detections over a short temporal window') + help='Prevent smoothing face detections over a short temporal window') -args = parser.parse_args() -args.img_size = 96 - -if os.path.isfile(args.face) and args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: - args.static = True def get_smoothened_boxes(boxes, T): - for i in range(len(boxes)): - if i + T > len(boxes): - window = boxes[len(boxes) - T:] - else: - window = boxes[i : i + T] - boxes[i] = np.mean(window, axis=0) - return boxes + for i in range(len(boxes)): + if i + T > len(boxes): + window = boxes[len(boxes) - T:] + else: + window = boxes[i : i + T] + boxes[i] = np.mean(window, axis=0) + return boxes def face_detect(images): - detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, - flip_input=False, device=device) - - batch_size = args.face_det_batch_size - - while 1: - predictions = [] - try: - for i in tqdm(range(0, len(images), batch_size)): - predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size]))) - except RuntimeError: - if batch_size == 1: - raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument') - batch_size //= 2 - print('Recovering from OOM error; New batch size: {}'.format(batch_size)) - continue - break - - results = [] - pady1, pady2, padx1, padx2 = args.pads - for rect, image in zip(predictions, images): - if rect is None: - cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected. - raise ValueError('Face not detected! Ensure the video contains a face in all the frames.') - - y1 = max(0, rect[1] - pady1) - y2 = min(image.shape[0], rect[3] + pady2) - x1 = max(0, rect[0] - padx1) - x2 = min(image.shape[1], rect[2] + padx2) - - results.append([x1, y1, x2, y2]) - - boxes = np.array(results) - if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5) - results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)] - - del detector - return results + results = [] + pady1, pady2, padx1, padx2 = args.pads + + s = time() + + for image, rect in zip(images, face_rect(images)): + if rect is None: + cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected. + raise ValueError('Face not detected! 
Ensure the video contains a face in all the frames.') + + y1 = max(0, rect[1] - pady1) + y2 = min(image.shape[0], rect[3] + pady2) + x1 = max(0, rect[0] - padx1) + x2 = min(image.shape[1], rect[2] + padx2) + + results.append([x1, y1, x2, y2]) + + print('face detect time:', time() - s) + + boxes = np.array(results) + if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5) + results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)] + + return results + def datagen(frames, mels): - img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] + img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] - if args.box[0] == -1: - if not args.static: - face_det_results = face_detect(frames) # BGR2RGB for CNN face detection - else: - face_det_results = face_detect([frames[0]]) - else: - print('Using the specified bounding box instead of face detection...') - y1, y2, x1, x2 = args.box - face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames] + if args.box[0] == -1: + if not args.static: + face_det_results = face_detect(frames) # BGR2RGB for CNN face detection + else: + face_det_results = face_detect([frames[0]]) + else: + print('Using the specified bounding box instead of face detection...') + y1, y2, x1, x2 = args.box + face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames] - for i, m in enumerate(mels): - idx = 0 if args.static else i%len(frames) - frame_to_save = frames[idx].copy() - face, coords = face_det_results[idx].copy() + for i, m in enumerate(mels): + idx = 0 if args.static else i%len(frames) + frame_to_save = frames[idx].copy() + face, coords = face_det_results[idx].copy() - face = cv2.resize(face, (args.img_size, args.img_size)) - - img_batch.append(face) - mel_batch.append(m) - frame_batch.append(frame_to_save) - coords_batch.append(coords) + face = cv2.resize(face, (args.img_size, args.img_size)) - if len(img_batch) >= args.wav2lip_batch_size: - img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) + img_batch.append(face) + mel_batch.append(m) + frame_batch.append(frame_to_save) + coords_batch.append(coords) - img_masked = img_batch.copy() - img_masked[:, args.img_size//2:] = 0 + if len(img_batch) >= args.wav2lip_batch_size: + img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) - img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. - mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) + img_masked = img_batch.copy() + img_masked[:, args.img_size//2:] = 0 - yield img_batch, mel_batch, frame_batch, coords_batch - img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] + img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. + mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) - if len(img_batch) > 0: - img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) + yield img_batch, mel_batch, frame_batch, coords_batch + img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] - img_masked = img_batch.copy() - img_masked[:, args.img_size//2:] = 0 + if len(img_batch) > 0: + img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) - img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 
- mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) + img_masked = img_batch.copy() + img_masked[:, args.img_size//2:] = 0 - yield img_batch, mel_batch, frame_batch, coords_batch + img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. + mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) + + yield img_batch, mel_batch, frame_batch, coords_batch mel_step_size = 16 device = 'cuda' if torch.cuda.is_available() else 'cpu' print('Using {} for inference.'.format(device)) def _load(checkpoint_path): - if device == 'cuda': - checkpoint = torch.load(checkpoint_path) - else: - checkpoint = torch.load(checkpoint_path, - map_location=lambda storage, loc: storage) - return checkpoint + if device == 'cuda': + checkpoint = torch.load(checkpoint_path) + else: + checkpoint = torch.load(checkpoint_path, + map_location=lambda storage, loc: storage) + return checkpoint def load_model(path): - model = Wav2Lip() - print("Load checkpoint from: {}".format(path)) - checkpoint = _load(path) - s = checkpoint["state_dict"] - new_s = {} - for k, v in s.items(): - new_s[k.replace('module.', '')] = v - model.load_state_dict(new_s) - - model = model.to(device) - return model.eval() + model = Wav2Lip() + print("Load checkpoint from: {}".format(path)) + checkpoint = _load(path) + s = checkpoint["state_dict"] + new_s = {} + for k, v in s.items(): + new_s[k.replace('module.', '')] = v + model.load_state_dict(new_s) + + model = model.to(device) + return model.eval() def main(): - if not os.path.isfile(args.face): - raise ValueError('--face argument must be a valid path to video/image file') + args.img_size = 96 + + if os.path.isfile(args.face) and args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: + args.static = True + + if not os.path.isfile(args.face): + raise ValueError('--face argument must be a valid path to video/image file') + + elif args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: + full_frames = [cv2.imread(args.face)] + fps = args.fps + + else: + video_stream = cv2.VideoCapture(args.face) + fps = video_stream.get(cv2.CAP_PROP_FPS) + + print('Reading video frames...') + + full_frames = [] + while 1: + still_reading, frame = video_stream.read() + if not still_reading: + video_stream.release() + break + + aspect_ratio = frame.shape[1] / frame.shape[0] + frame = cv2.resize(frame, (int(args.out_height * aspect_ratio), args.out_height)) + # if args.resize_factor > 1: + # frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor)) + + if args.rotate: + frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE) + + y1, y2, x1, x2 = args.crop + if x2 == -1: x2 = frame.shape[1] + if y2 == -1: y2 = frame.shape[0] + + frame = frame[y1:y2, x1:x2] + + full_frames.append(frame) + + print ("Number of frames available for inference: "+str(len(full_frames))) + + if not args.audio.endswith('.wav'): + print('Extracting raw audio...') + # command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav') + # subprocess.call(command, shell=True) + subprocess.check_call([ + "ffmpeg", "-y", + "-i", args.audio, + "temp/temp.wav", + ]) + args.audio = 'temp/temp.wav' - elif args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: - full_frames = [cv2.imread(args.face)] - fps = args.fps + wav = audio.load_wav(args.audio, 16000) + mel = audio.melspectrogram(wav) + print(mel.shape) - else: - video_stream = cv2.VideoCapture(args.face) - fps = video_stream.get(cv2.CAP_PROP_FPS) + if 
np.isnan(mel.reshape(-1)).sum() > 0: + raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') - print('Reading video frames...') + mel_chunks = [] + mel_idx_multiplier = 80./fps + i = 0 + while 1: + start_idx = int(i * mel_idx_multiplier) + if start_idx + mel_step_size > len(mel[0]): + mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:]) + break + mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size]) + i += 1 - full_frames = [] - while 1: - still_reading, frame = video_stream.read() - if not still_reading: - video_stream.release() - break - if args.resize_factor > 1: - frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor)) + print("Length of mel chunks: {}".format(len(mel_chunks))) - if args.rotate: - frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE) + full_frames = full_frames[:len(mel_chunks)] - y1, y2, x1, x2 = args.crop - if x2 == -1: x2 = frame.shape[1] - if y2 == -1: y2 = frame.shape[0] + batch_size = args.wav2lip_batch_size + gen = datagen(full_frames.copy(), mel_chunks) - frame = frame[y1:y2, x1:x2] + s = time() - full_frames.append(frame) + for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, + total=int(np.ceil(float(len(mel_chunks))/batch_size)))): + if i == 0: + frame_h, frame_w = full_frames[0].shape[:-1] + out = cv2.VideoWriter('temp/result.avi', + cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h)) - print ("Number of frames available for inference: "+str(len(full_frames))) + img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device) + mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device) - if not args.audio.endswith('.wav'): - print('Extracting raw audio...') - command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav') + with torch.no_grad(): + pred = model(mel_batch, img_batch) - subprocess.call(command, shell=True) - args.audio = 'temp/temp.wav' + pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255. - wav = audio.load_wav(args.audio, 16000) - mel = audio.melspectrogram(wav) - print(mel.shape) + for p, f, c in zip(pred, frames, coords): + y1, y2, x1, x2 = c + p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) - if np.isnan(mel.reshape(-1)).sum() > 0: - raise ValueError('Mel contains nan! Using a TTS voice? 
Add a small epsilon noise to the wav file and try again') + f[y1:y2, x1:x2] = p + out.write(f) - mel_chunks = [] - mel_idx_multiplier = 80./fps - i = 0 - while 1: - start_idx = int(i * mel_idx_multiplier) - if start_idx + mel_step_size > len(mel[0]): - mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:]) - break - mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size]) - i += 1 + out.release() - print("Length of mel chunks: {}".format(len(mel_chunks))) + print("wav2lip prediction time:", time() - s) - full_frames = full_frames[:len(mel_chunks)] + subprocess.check_call([ + "ffmpeg", "-y", + # "-vsync", "0", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda", + "-i", "temp/result.avi", + "-i", args.audio, + # "-c:v", "h264_nvenc", + args.outfile, + ]) - batch_size = args.wav2lip_batch_size - gen = datagen(full_frames.copy(), mel_chunks) +model = detector = detector_model = None - for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, - total=int(np.ceil(float(len(mel_chunks))/batch_size)))): - if i == 0: - model = load_model(args.checkpoint_path) - print ("Model loaded") +def do_load(checkpoint_path): + global model, detector, detector_model - frame_h, frame_w = full_frames[0].shape[:-1] - out = cv2.VideoWriter('temp/result.avi', - cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h)) + model = load_model(checkpoint_path) - img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device) - mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device) + # SFDDetector.load_model(device) + # detector = RetinaFace(gpu_id=0, model_path="checkpoints/mobilenet.pth", network="mobilenet") + # detector = RetinaFace(gpu_id=0, model_path="checkpoints/resnet50.pth", network="resnet50") + if torch.cuda.is_available(): + detector = RetinaFace(gpu_id=0, model_path="checkpoints/mobilenet.pth", network="mobilenet") + else: + detector = RetinaFace( model_path="checkpoints/mobilenet.pth", network="mobilenet") + + detector_model = detector.model - with torch.no_grad(): - pred = model(mel_batch, img_batch) + print("Models loaded") - pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255. 
- - for p, f, c in zip(pred, frames, coords): - y1, y2, x1, x2 = c - p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) - f[y1:y2, x1:x2] = p - out.write(f) +face_batch_size = 64 * 8 - out.release() +def face_rect(images): + num_batches = math.ceil(len(images) / face_batch_size) + prev_ret = None + for i in range(num_batches): + batch = images[i * face_batch_size: (i + 1) * face_batch_size] + all_faces = detector(batch) # return faces list of all images + for faces in all_faces: + if faces: + box, landmarks, score = faces[0] + prev_ret = tuple(map(int, box)) + yield prev_ret - command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(args.audio, 'temp/result.avi', args.outfile) - subprocess.call(command, shell=platform.system() != 'Windows') if __name__ == '__main__': - main() + args = parser.parse_args() + do_load(args.checkpoint_path) + main() diff --git a/predict.py b/predict.py new file mode 100644 index 000000000..7fbc7eba6 --- /dev/null +++ b/predict.py @@ -0,0 +1,144 @@ +# Prediction interface for Cog ⚙️ +# https://github.com/replicate/cog/blob/main/docs/python.md +import os +import subprocess + +from cog import BasePredictor, Input, Path + +import inference + +from time import time + +from functools import wraps +import torch + + +def make_mem_efficient(cls: BasePredictor): + if not torch.cuda.is_available(): + return cls + + old_setup = cls.setup + old_predict = cls.predict + + @wraps(old_setup) + def new_setup(self, *args, **kwargs): + ret = old_setup(self, *args, **kwargs) + _move_to(self, "cpu") + return ret + + @wraps(old_predict) + def new_predict(self, *args, **kwargs): + _move_to(self, "cuda") + try: + ret = old_predict(self, *args, **kwargs) + finally: + _move_to(self, "cpu") + return ret + + cls.setup = new_setup + cls.predict = new_predict + + return cls + + +def _move_to(self, device): + try: + self = self.cached_models + except AttributeError: + pass + for attr, value in vars(self).items(): + try: + value = value.to(device) + except AttributeError: + pass + else: + print(f"Moving {self.__name__}.{attr} to {device}") + setattr(self, attr, value) + torch.cuda.empty_cache() + + +@make_mem_efficient +class Predictor(BasePredictor): + cached_models = inference + + def setup(self): + inference.do_load("checkpoints/wav2lip_gan.pth") + + def predict( + self, + face: Path = Input(description="video/image that contains faces to use"), + audio: Path = Input(description="video/audio file to use as raw audio source"), + pads: str = Input( + description="Padding for the detected face bounding box.\n" + "Please adjust to include chin at least\n" + 'Format: "top bottom left right"', + default="0 10 0 0", + ), + smooth: bool = Input( + description="Smooth face detections over a short temporal window", + default=True, + ), + fps: float = Input( + description="Can be specified only if input is a static image", + default=25.0, + ), + out_height: int = Input( + description="Output video height. 
Best results are obtained at 480 or 720", + default=480, + ), + ) -> Path: + try: + os.remove("results/result_voice.mp4") + except FileNotFoundError: + pass + + face_ext = os.path.splitext(face)[-1] + if face_ext not in [".mp4", ".mov", ".png" , ".jpg" , ".jpeg" , ".gif", ".mkv", ".webp"]: + raise ValueError(f'Unsupported face format {face_ext!r}') + + audio_ext = os.path.splitext(audio)[-1] + if audio_ext not in [".wav", ".mp3"]: + raise ValueError(f'Unsupported audio format {audio_ext!r}') + + args = [ + "--checkpoint_path", "checkpoints/wav2lip_gan.pth", + "--face", str(face), + "--audio", str(audio), + "--pads", *pads.split(" "), + "--fps", str(fps), + "--out_height", str(out_height), + ] + if not smooth: + args += ["--nosmooth"] + + print("-> run:", " ".join(args)) + inference.args = inference.parser.parse_args(args) + + s = time() + + try: + inference.main() + except ValueError as e: + print('-> Encountered error, skipping lipsync:', e) + + args = [ + "ffmpeg", "-y", + # "-vsync", "0", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda", + "-stream_loop", "-1", + "-i", str(face), + "-i", str(audio), + "-shortest", + "-fflags", "+shortest", + "-max_interleave_delta", "100M", + "-map", "0:v:0", + "-map", "1:a:0", + # "-c", "copy", + # "-c:v", "h264_nvenc", + "results/result_voice.mp4", + ] + print("-> run:", " ".join(args)) + print(subprocess.check_output(args, encoding="utf-8")) + + print(time() - s) + + return Path("results/result_voice.mp4") diff --git a/requirements.txt b/requirements.txt index bfd428ab9..b16f1dabf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,73 @@ -librosa==0.7.0 -numpy==1.17.1 -opencv-contrib-python>=4.2.0.34 -opencv-python==4.1.0.25 -torch==1.1.0 -torchvision==0.3.0 -tqdm==4.45.0 -numba==0.48 +absl-py==2.1.0 +attrs==24.2.0 +audioread==3.0.1 +batch-face==1.5.0.dev0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.4.0 +colorama==0.4.6 +contourpy==1.3.1 +cycler==0.12.1 +decorator==5.1.1 +ffmpeg-python==0.2.0 +filelock==3.16.1 +flatbuffers==24.3.25 +fonttools==4.55.2 +fsspec==2024.10.0 +future==1.0.0 +ghc==1.0 +idna==3.10 +intel-openmp==2021.4.0 +jax==0.4.36 +jaxlib==0.4.36 +Jinja2==3.1.4 +joblib==1.4.2 +kiwisolver==1.4.7 +lazy_loader==0.4 +librosa==0.10.2.post1 +llvmlite==0.43.0 +MarkupSafe==3.0.2 +matplotlib==3.9.3 +mediapipe==0.10.18 +mkl==2021.4.0 +ml_dtypes==0.5.0 +mpmath==1.3.0 +msgpack==1.1.0 +networkx==3.4.2 +numba==0.60.0 +numpy==1.26.4 +opencv-contrib-python==4.10.0.84 +opencv-python==4.10.0.84 +opencv-transforms==0.0.6 +opt_einsum==3.4.0 +packaging==24.2 +pandas==2.2.3 +pillow==11.0.0 +pip==24.2 +platformdirs==4.3.6 +pooch==1.8.2 +protobuf==4.25.5 +pycparser==2.22 +pyparsing==3.2.0 +python-dateutil==2.9.0.post0 +pytz==2024.2 +requests==2.32.3 +scikit-learn==1.5.2 +scipy==1.14.1 +sentencepiece==0.2.0 +setuptools==75.1.0 +six==1.17.0 +sixdrepnet==0.1.6 +sounddevice==0.5.1 +soundfile==0.12.1 +soxr==0.5.0.post1 +sympy==1.13.3 +tbb==2021.13.1 +threadpoolctl==3.5.0 +torch==2.3.0+cu118 +torchvision==0.18.0+cu118 +tqdm==4.67.1 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.2.3 +wheel==0.44.0 diff --git a/requirementsCPU.txt b/requirementsCPU.txt new file mode 100644 index 000000000..ac7cef623 --- /dev/null +++ b/requirementsCPU.txt @@ -0,0 +1,13 @@ +librosa +numpy +opencv-contrib-python +opencv-python +-f https://download.pytorch.org/whl/torch_stable.html +torch +torchvision +tqdm +numba +mediapipe +https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl 
+git+https://github.com/elliottzheng/batch-face.git@master
+ffmpeg-python
diff --git a/requirements_colab.txt b/requirements_colab.txt
new file mode 100644
index 000000000..c5f75e7b4
--- /dev/null
+++ b/requirements_colab.txt
@@ -0,0 +1,7 @@
+numpy==1.23.4
+librosa
+opencv-python
+torch
+torchvision
+tqdm
+numba
diff --git a/scripts/download_models.sh b/scripts/download_models.sh
new file mode 100644
index 000000000..93049e873
--- /dev/null
+++ b/scripts/download_models.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+set -ex
+
+wget -c -O checkpoints/wav2lip_gan.pth 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA'
+wget -c -O checkpoints/mobilenet.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/mobilenet0.25_Final.pth'
+wget -c -O checkpoints/resnet50.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/Resnet50_Final.pth'
diff --git a/scripts/run-dev.sh b/scripts/run-dev.sh
new file mode 100644
index 000000000..becde83e1
--- /dev/null
+++ b/scripts/run-dev.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+NAME=wav2lip-dev
+
+set -ex
+
+docker build . -t $NAME
+docker run -it --rm \
+    --name $NAME \
+    -v $PWD/checkpoints:/src/checkpoints \
+    -p 6001:5000 \
+    --gpus all \
+    $NAME
diff --git a/scripts/run-prod.sh b/scripts/run-prod.sh
new file mode 100644
index 000000000..08f378a48
--- /dev/null
+++ b/scripts/run-prod.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+NAME=wav2lip
+
+set -x
+
+docker rm -f $NAME
+
+docker build . -t $NAME
+docker run -d --restart always \
+    --name $NAME \
+    -v $PWD/checkpoints:/src/checkpoints \
+    -p 5001:5000 \
+    --gpus all \
+    $NAME
+
+docker logs -f $NAME
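
Usage note (editor's sketch, not part of the diff): a minimal example of exercising the changed pipeline locally, assuming the weights have been fetched by scripts/download_models.sh above. The input filenames are placeholders; every flag used here appears in the new inference.py.

# download wav2lip_gan.pth plus the RetinaFace weights into checkpoints/
bash scripts/download_models.sh

# run lip-sync directly; do_load() in inference.py loads both the Wav2Lip
# checkpoint and the mobilenet RetinaFace detector before main() runs.
# --out_height controls the working resolution (the --resize_factor resize
# path is commented out in this diff).
python inference.py \
    --checkpoint_path checkpoints/wav2lip_gan.pth \
    --face input_video.mp4 \
    --audio input_audio.wav \
    --pads 0 20 0 0 \
    --out_height 480 \
    --outfile results/result_voice.mp4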