Changes from all commits (46 commits):
- 74d2e32 Add files via upload (justinjohn0306, Mar 18, 2023)
- 10ad535 Update README.md (justinjohn0306, Mar 18, 2023)
- 3bd070a Update README.md (justinjohn0306, Mar 18, 2023)
- 6a0a0ce Delete Wav2Lip_simplified_v3.ipynb (justinjohn0306, Mar 18, 2023)
- a36ba32 Add files via upload (justinjohn0306, Mar 18, 2023)
- 4701565 Update README.md (justinjohn0306, Mar 18, 2023)
- 74cde9b Add files via upload (justinjohn0306, Mar 26, 2023)
- 36a496b Update README.md (justinjohn0306, Mar 26, 2023)
- 1f9824d Update audio.py (justinjohn0306, Mar 26, 2023)
- b6b4895 Update requirements.txt (justinjohn0306, Mar 26, 2023)
- 57b68ba Add files via upload (justinjohn0306, Mar 26, 2023)
- 6da1256 cleanup (justinjohn0306, Mar 26, 2023)
- c4f4a25 Delete Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 27, 2023)
- fc6453a Add files via upload (justinjohn0306, Mar 27, 2023)
- 5b53c76 Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 27, 2023)
- cd92e36 Delete Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 28, 2023)
- 764da9b Add files via upload (justinjohn0306, Mar 28, 2023)
- 7b5c4f8 Delete Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 28, 2023)
- d447984 Add files via upload (justinjohn0306, Mar 28, 2023)
- 85a0e8d Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 28, 2023)
- 75de7f4 Add files via upload (justinjohn0306, May 13, 2023)
- d175fa1 Add files via upload (justinjohn0306, Sep 7, 2023)
- 949ea47 Add files via upload (justinjohn0306, Sep 7, 2023)
- 3985d70 Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Sep 7, 2023)
- bc6b705 Add files via upload (justinjohn0306, Sep 7, 2023)
- 367e809 Delete batch_face directory (justinjohn0306, Sep 7, 2023)
- ebac721 Merge branch 'Rudrabha:master' into master (justinjohn0306, Sep 7, 2023)
- 9ebe926 Add files via upload (justinjohn0306, Sep 7, 2023)
- ba27242 Update requirements_colab.txt (justinjohn0306, Sep 7, 2023)
- ef93024 Update audio.py (justinjohn0306, Sep 7, 2023)
- 1bf7260 Add files via upload (justinjohn0306, Sep 7, 2023)
- 5a175e1 Add files via upload (justinjohn0306, Sep 7, 2023)
- eb93fb0 Update inference.py (justinjohn0306, Sep 7, 2023)
- 461b1d9 Add files via upload (justinjohn0306, Sep 7, 2023)
- 11f3460 Add files via upload (justinjohn0306, Sep 8, 2023)
- 44a3d41 Add files via upload (justinjohn0306, Sep 8, 2023)
- 164479a Add files via upload (justinjohn0306, Sep 8, 2023)
- 280bf21 Fix SameFileError (mrwadams, Jan 21, 2024)
- 89d929f fix-requirements (justinjohn0306, May 23, 2024)
- b71179c fix-requirements (justinjohn0306, May 23, 2024)
- b725d4f Merge pull request #1 from mrwadams/master (justinjohn0306, May 23, 2024)
- d9c9717 Update inference.py (magicse, Sep 11, 2024)
- 3547bfc Update requirements.txt (justinjohn0306, Dec 7, 2024)
- e60195d Update README.md (justinjohn0306, Dec 7, 2024)
- 5fe77e1 Merge pull request #2 from magicse/patch-1 (justinjohn0306, Dec 7, 2024)
- ebe4687 Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Dec 8, 2024)
57 changes: 57 additions & 0 deletions Dockerfile
@@ -0,0 +1,57 @@
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04

ARG DEBIAN_FRONTEND=noninteractive

# install python via pyenv
RUN apt-get update && apt-get install -y --no-install-recommends \
make \
build-essential \
libssl-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
wget \
curl \
llvm \
libncurses5-dev \
libncursesw5-dev \
xz-utils \
tk-dev \
libffi-dev \
liblzma-dev \
git \
ca-certificates \
libgl1 \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH"
ARG PYTHON_VERSION=3.8
RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
pyenv install $PYTHON_VERSION && \
pyenv global $PYTHON_VERSION

# install cog
RUN pip install cog

# install deps
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg libsndfile1 \
&& rm -rf /var/lib/apt/lists/*

# copy to /src
ENV WORKDIR /src
RUN mkdir -p $WORKDIR
WORKDIR $WORKDIR

# install requirements
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN pip install git+https://github.com/elliottzheng/batch-face.git@master

# copy sources
COPY . .

ENV PYTHONUNBUFFERED=1

# run cog
CMD python3 -m cog.server.http
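
A minimal sketch of how this image might be built and queried. The image tag, port mapping, and the `face`/`audio` input names are assumptions, not taken from this repo; check the repo's cog predictor for the real interface.

```python
# Sketch: build the image, start cog's HTTP server, and request a
# prediction. Tag, port, and the "face"/"audio" input names are assumptions.
import subprocess
import time

import requests

subprocess.run(["docker", "build", "-t", "wav2lip-cog", "."], check=True)
subprocess.run(
    ["docker", "run", "-d", "--gpus", "all", "-p", "5000:5000", "wav2lip-cog"],
    check=True,
)
time.sleep(15)  # give the server a moment to start

# cog's HTTP server accepts prediction requests as JSON under an "input" key.
resp = requests.post(
    "http://localhost:5000/predictions",
    json={"input": {"face": "https://example.com/face.mp4",
                    "audio": "https://example.com/audio.wav"}},
)
print(resp.status_code)
```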
23 changes: 12 additions & 11 deletions README.md
@@ -10,7 +10,7 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to

|📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook
|:-:|:-:|:-:|:-:|:-:|
[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH)
[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/github/justinjohn0306/Wav2Lip/blob/master/Wav2Lip_simplified_v5.ipynb)

<img src="https://drive.google.com/uc?export=view&id=1Wn0hPmpo4GRbCIJR8Tf20Akzdi1qjjG9"/>

@@ -27,14 +27,15 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to
--------
**Disclaimer**
--------
All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the <a href="http://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html">LRS2 dataset</a>, any form of commercial use is strictly prohibited. For commercial requests, please contact us directly!

Prerequisites
-------------
- `Python 3.6`
- `Python 3.10.15`
- ffmpeg: `sudo apt-get install ffmpeg`
- Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a Docker image are provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues.
- Face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work.
- Add [mobilenet.pth](https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth) to the `checkpoints` folder, along with one of the weight files below (a download sketch follows this list).
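
A minimal sketch for fetching both weight files into the paths the repo expects, assuming it is run from the repo root; the URLs are copied from the bullets above.

```python
# Minimal sketch: download the face-detection weights into the expected
# locations. Run from the repo root; URLs are taken from the list above.
import os
import requests

downloads = {
    "face_detection/detection/sfd/s3fd.pth":
        "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth",
    "checkpoints/mobilenet.pth":
        "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
}

for path, url in downloads.items():
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):  # 1 MiB chunks
                f.write(chunk)
```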

Getting the weights
----------
@@ -55,8 +56,8 @@ The result is saved (by default) in `results/result_voice.mp4`. You can specify

##### Tips for better results:
- Experiment with the `--pads` argument to adjust the detected face bounding box; this often improves results. You might need to increase the bottom padding to include the chin region, e.g. `--pads 0 20 0 0`.
- If the mouth position looks dislocated or you see weird artifacts such as two mouths, the face detections may be over-smoothed. Use the `--nosmooth` argument and give it another try.
- Experiment with the `--resize_factor` argument to get a lower-resolution video. Why? The models are trained on faces at a lower resolution; you might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too).
- The Wav2Lip model without GAN usually needs more experimentation with the two flags above to get the best results, and can sometimes produce a better result as well; a combined invocation is sketched after this list.
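
For illustration, the flags above might be combined as in this sketch; the checkpoint and input file names are placeholders, not outputs of this repo.

```python
# Sketch: one inference run combining the tips above. Checkpoint and input
# file names are placeholders; adjust them to your setup.
import subprocess

subprocess.run([
    "python", "inference.py",
    "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
    "--face", "input_vid.mp4",
    "--audio", "input_audio.wav",
    "--pads", "0", "20", "0", "0",   # extra bottom padding to include the chin
    "--resize_factor", "2",          # halve the input resolution
    "--nosmooth",                    # disable smoothing of face detections
], check=True)
```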

Preparing LRS2 for training
@@ -78,7 +79,7 @@ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` folder
```bash
python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/
```
Additional options like `batch_size` and the number of GPUs to use in parallel can also be set; an example is sketched below.
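
A sketch of a run with explicit parallelism, assuming the script exposes `--ngpu` and `--batch_size` options; verify the flag names with `python preprocess.py --help`.

```python
# Sketch: preprocessing with explicit parallelism. The --ngpu and
# --batch_size flags are assumptions; check `python preprocess.py --help`.
import subprocess

subprocess.run([
    "python", "preprocess.py",
    "--data_root", "data_root/main",
    "--preprocessed_root", "lrs2_preprocessed/",
    "--ngpu", "2",          # number of GPUs to run face detection on
    "--batch_size", "32",   # per-GPU face-detection batch size
], check=True)
```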

##### Preprocessed LRS2 folder structure
```
@@ -99,12 +100,12 @@ You can download [the pre-trained weights](#getting-the-weights) if you want to
```bash
python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints>
```
##### Training the Wav2Lip models
You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run:
```bash
python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_disc_checkpoint>
```

To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar, and in both cases you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional, less commonly used hyper-parameters at the bottom of the `hparams.py` file.
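
For instance, resuming the discriminator-based training might look like the following sketch; the flag names follow the `--help` text mentioned above, and every checkpoint path is a placeholder.

```python
# Sketch: resume training with the visual quality discriminator. Flag names
# follow the script's --help output; all paths are placeholders.
import subprocess

subprocess.run([
    "python", "hq_wav2lip_train.py",
    "--data_root", "lrs2_preprocessed/",
    "--checkpoint_dir", "checkpoints/wav2lip_hq",
    "--syncnet_checkpoint_path", "checkpoints/lipsync_expert.pth",
    # --checkpoint_path resumes from a previously saved generator checkpoint
    "--checkpoint_path", "checkpoints/wav2lip_hq/checkpoint_step000100000.pth",
], check=True)
```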

Training on datasets other than LRS2
------------------------------------
@@ -126,7 +127,7 @@ Please check the `evaluation/` folder for the instructions.

License and Citation
----------
This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at [email protected] or [email protected]. We have an HD model trained on a dataset that allows commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository:
```
@inproceedings{10.1145/3394171.3413532,
author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.},
@@ -147,6 +148,6 @@ series = {MM '20}
```


Acknowledgements
----------
Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial Colab notebook.
225 changes: 225 additions & 0 deletions Wav2Lip_simplified_V5(offline).ipynb
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f1e90f25",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: Install dependency\n",
"!pip install ffmpeg-python\n",
"\n",
"# Step 2: Clone the Wav2Lip repository\n",
"!git clone https://github.com/justinjohn0306/Wav2Lip\n",
"\n",
"# Step 3: Download pretrained model\n",
"import requests\n",
"url = \"https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA\"\n",
"response = requests.get(url)\n",
"\n",
"with open(\"Wav2Lip/checkpoints/wav2lip_gan.pth\", \"wb\") as f:\n",
" f.write(response.content)\n",
" \n",
"# Step 4: Install the required dependencies for Wav2Lip\n",
"!cd Wav2Lip && pip install -r requirements.txt\n",
"!pip install pyaudio\n",
"\n",
"\n",
"# Step 5: Download pretrained model for face detection\n",
"url = \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\"\n",
"response = requests.get(url)\n",
"\n",
"with open(\"Wav2Lip/face_detection/detection/sfd/s3fd.pth\", \"wb\") as f:\n",
" f.write(response.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e86c988",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"from urllib import parse as urlparse\n",
"\n",
"# Step 1: Install yt-dlp\n",
"subprocess.run(['pip', 'install', 'yt-dlp'])\n",
"\n",
"# Step 2: Define YouTube URL and Video ID\n",
"YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY'\n",
"url_data = urlparse.urlparse(YOUTUBE_URL)\n",
"query = urlparse.parse_qs(url_data.query)\n",
"YOUTUBE_ID = query[\"v\"][0]\n",
"\n",
"# Remove previous input video\n",
"if os.path.isfile('input_vid.mp4'):\n",
" os.remove('input_vid.mp4')\n",
"\n",
"# Trim video (start, end) seconds\n",
"start = 35\n",
"end = 62\n",
"interval = end - start\n",
"\n",
"# Step 3: Download and trim the YouTube video\n",
"subprocess.run(['yt-dlp', '-f', 'bestvideo[ext=mp4]', '--output', \"youtube.%(ext)s\", f'https://www.youtube.com/watch?v={YOUTUBE_ID}'])\n",
"\n",
"# Cut the video using FFmpeg\n",
"subprocess.run(['ffmpeg', '-y', '-i', 'youtube.mp4', '-ss', str(start), '-t', str(interval), '-async', '1', 'input_vid.mp4'])\n",
"\n",
"# Display video.\n",
"from IPython.display import HTML\n",
"from base64 import b64encode\n",
"\n",
"def show_video(path):\n",
" mp4 = open(path, 'rb').read()\n",
" data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
" return HTML(f\"\"\"<video width=600 controls><source src=\"{data_url}\"></video>\"\"\")\n",
"\n",
"# Preview the trimmed video\n",
"show_video('input_vid.mp4')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7da8e818",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import os\n",
"from IPython.display import Audio\n",
"from IPython.core.display import display\n",
"\n",
"upload_method = 'Path' # Change this to 'Record' or 'Path'\n",
"\n",
"# Remove previous input audio\n",
"if os.path.isfile('input_audio.wav'):\n",
" os.remove('input_audio.wav')\n",
"\n",
"def display_audio():\n",
" display(Audio('input_audio.wav'))\n",
"\n",
"if upload_method == 'Record':\n",
" import pyaudio\n",
" import wave\n",
"\n",
" CHUNK = 1024\n",
" FORMAT = pyaudio.paInt16\n",
" CHANNELS = 1\n",
" RATE = 16000\n",
" RECORD_SECONDS = 5\n",
" WAVE_OUTPUT_FILENAME = \"input_audio.wav\"\n",
"\n",
" p = pyaudio.PyAudio()\n",
"\n",
" stream = p.open(format=FORMAT,\n",
" channels=CHANNELS,\n",
" rate=RATE,\n",
" input=True,\n",
" frames_per_buffer=CHUNK)\n",
"\n",
" print(\"Recording...\")\n",
"\n",
" frames = []\n",
"\n",
" for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n",
" data = stream.read(CHUNK)\n",
" frames.append(data)\n",
"\n",
" print(\"Finished recording.\")\n",
"\n",
" stream.stop_stream()\n",
" stream.close()\n",
" p.terminate()\n",
"\n",
" wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')\n",
" wf.setnchannels(CHANNELS)\n",
" wf.setsampwidth(p.get_sample_size(FORMAT))\n",
" wf.setframerate(RATE)\n",
" wf.writeframes(b''.join(frames))\n",
" wf.close()\n",
"\n",
" display_audio()\n",
"\n",
"elif upload_method == 'Path':\n",
" # Add the full path to your audio\n",
" PATH_TO_YOUR_AUDIO = 'C:/Users/justi/OneDrive/Desktop/wav2lip/Wav2Lip/input_audio.wav'\n",
"\n",
" # Load audio with specified sampling rate\n",
" import librosa\n",
" audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
"\n",
" # Save audio with specified sampling rate\n",
" import soundfile as sf\n",
" sf.write('input_audio.wav', audio, sr, format='wav')\n",
"\n",
" display_audio()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63289945",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Define the parameters for the Wav2Lip model\n",
"pad_top = 0\n",
"pad_bottom = 10\n",
"pad_left = 0\n",
"pad_right = 0\n",
"rescaleFactor = 1\n",
"nosmooth = False\n",
"\n",
"# Set the path to the Wav2Lip model and input files\n",
"checkpoint_path = \"checkpoints/wav2lip_gan.pth\"\n",
"input_face = \"input_vid.mp4\"\n",
"input_audio = \"input_audio.wav\"\n",
"\n",
"# Run the Wav2Lip model\n",
"!cd Wav2Lip && python inference.py --checkpoint_path {checkpoint_path} --face {input_face} --audio {input_audio} --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} {\"--nosmooth\" if nosmooth else \"\"}\n",
"\n",
"# Preview the output video\n",
"print(\"Final Video Preview\")\n",
"print(\"Find the output video at\", 'Wav2Lip/results/result_voice.mp4')\n",
"show_video('Wav2Lip/results/result_voice.mp4')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fbafa56",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}