Changes from all commits (46 commits):
- 74d2e32 Add files via upload (justinjohn0306, Mar 18, 2023)
- 10ad535 Update README.md (justinjohn0306, Mar 18, 2023)
- 3bd070a Update README.md (justinjohn0306, Mar 18, 2023)
- 6a0a0ce Delete Wav2Lip_simplified_v3.ipynb (justinjohn0306, Mar 18, 2023)
- a36ba32 Add files via upload (justinjohn0306, Mar 18, 2023)
- 4701565 Update README.md (justinjohn0306, Mar 18, 2023)
- 74cde9b Add files via upload (justinjohn0306, Mar 26, 2023)
- 36a496b Update README.md (justinjohn0306, Mar 26, 2023)
- 1f9824d Update audio.py (justinjohn0306, Mar 26, 2023)
- b6b4895 Update requirements.txt (justinjohn0306, Mar 26, 2023)
- 57b68ba Add files via upload (justinjohn0306, Mar 26, 2023)
- 6da1256 cleanup (justinjohn0306, Mar 26, 2023)
- c4f4a25 Delete Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 27, 2023)
- fc6453a Add files via upload (justinjohn0306, Mar 27, 2023)
- 5b53c76 Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 27, 2023)
- cd92e36 Delete Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 28, 2023)
- 764da9b Add files via upload (justinjohn0306, Mar 28, 2023)
- 7b5c4f8 Delete Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 28, 2023)
- d447984 Add files via upload (justinjohn0306, Mar 28, 2023)
- 85a0e8d Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Mar 28, 2023)
- 75de7f4 Add files via upload (justinjohn0306, May 13, 2023)
- d175fa1 Add files via upload (justinjohn0306, Sep 7, 2023)
- 949ea47 Add files via upload (justinjohn0306, Sep 7, 2023)
- 3985d70 Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Sep 7, 2023)
- bc6b705 Add files via upload (justinjohn0306, Sep 7, 2023)
- 367e809 Delete batch_face directory (justinjohn0306, Sep 7, 2023)
- ebac721 Merge branch 'Rudrabha:master' into master (justinjohn0306, Sep 7, 2023)
- 9ebe926 Add files via upload (justinjohn0306, Sep 7, 2023)
- ba27242 Update requirements_colab.txt (justinjohn0306, Sep 7, 2023)
- ef93024 Update audio.py (justinjohn0306, Sep 7, 2023)
- 1bf7260 Add files via upload (justinjohn0306, Sep 7, 2023)
- 5a175e1 Add files via upload (justinjohn0306, Sep 7, 2023)
- eb93fb0 Update inference.py (justinjohn0306, Sep 7, 2023)
- 461b1d9 Add files via upload (justinjohn0306, Sep 7, 2023)
- 11f3460 Add files via upload (justinjohn0306, Sep 8, 2023)
- 44a3d41 Add files via upload (justinjohn0306, Sep 8, 2023)
- 164479a Add files via upload (justinjohn0306, Sep 8, 2023)
- 280bf21 Fix SameFileError (mrwadams, Jan 21, 2024)
- 89d929f fix-requirements (justinjohn0306, May 23, 2024)
- b71179c fix-requirements (justinjohn0306, May 23, 2024)
- b725d4f Merge pull request #1 from mrwadams/master (justinjohn0306, May 23, 2024)
- d9c9717 Update inference.py (magicse, Sep 11, 2024)
- 3547bfc Update requirements.txt (justinjohn0306, Dec 7, 2024)
- e60195d Update README.md (justinjohn0306, Dec 7, 2024)
- 5fe77e1 Merge pull request #2 from magicse/patch-1 (justinjohn0306, Dec 7, 2024)
- ebe4687 Update Wav2Lip_simplified_v5.ipynb (justinjohn0306, Dec 8, 2024)
57 changes: 57 additions & 0 deletions Dockerfile
@@ -0,0 +1,57 @@
FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04

ARG DEBIAN_FRONTEND=noninteractive

# install python via pyenv
RUN apt-get update && apt-get install -y --no-install-recommends \
make \
build-essential \
libssl-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
wget \
curl \
llvm \
libncurses5-dev \
libncursesw5-dev \
xz-utils \
tk-dev \
libffi-dev \
liblzma-dev \
git \
ca-certificates \
libgl1 \
&& rm -rf /var/lib/apt/lists/*
ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH"
ARG PYTHON_VERSION=3.8
RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \
pyenv install $PYTHON_VERSION && \
pyenv global $PYTHON_VERSION

# install cog
RUN pip install cog

# install deps
RUN apt-get update && apt-get install -y --no-install-recommends \
ffmpeg libsndfile1 \
&& rm -rf /var/lib/apt/lists/*

# copy to /src
ENV WORKDIR /src
RUN mkdir -p $WORKDIR
WORKDIR $WORKDIR

# install requirements
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN pip install git+https://github.com/elliottzheng/batch-face.git@master

# copy sources
COPY . .

ENV PYTHONUNBUFFERED=1

# run cog
CMD python3 -m cog.server.http
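
A minimal sketch of how this image might be built and queried. The image tag, port mapping, and the `face`/`audio` input names are assumptions, not taken from this repo; check the repo's cog predictor for the real interface.

```python
# Sketch: build the image, start cog's HTTP server, and request a
# prediction. Tag, port, and the "face"/"audio" input names are assumptions.
import subprocess
import time

import requests

subprocess.run(["docker", "build", "-t", "wav2lip-cog", "."], check=True)
subprocess.run(
    ["docker", "run", "-d", "--gpus", "all", "-p", "5000:5000", "wav2lip-cog"],
    check=True,
)
time.sleep(15)  # give the server a moment to start

# cog's HTTP server accepts prediction requests as JSON under an "input" key.
resp = requests.post(
    "http://localhost:5000/predictions",
    json={"input": {"face": "https://example.com/face.mp4",
                    "audio": "https://example.com/audio.wav"}},
)
print(resp.status_code)
```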
23 changes: 12 additions & 11 deletions README.md
@@ -10,7 +10,7 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to

|📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook
|:-:|:-:|:-:|:-:|:-:|
[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH)
[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/github/justinjohn0306/Wav2Lip/blob/master/Wav2Lip_simplified_v5.ipynb)

<img src="https://drive.google.com/uc?export=view&id=1Wn0hPmpo4GRbCIJR8Tf20Akzdi1qjjG9"/>

@@ -27,14 +27,15 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to
--------
**Disclaimer**
--------
All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the <a href="http://www.robots.ox.ac.uk/~vgg/data/lip_reading/lrs2.html">LRS2 dataset</a>, any form of commercial use is strictly prohibited. For commercial requests, please contact us directly!

Prerequisites
-------------
- `Python 3.6`
- `Python 3.10.15`
- ffmpeg: `sudo apt-get install ffmpeg`
- Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a Docker image are provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues.
- Face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work.
- Add [mobilenet.pth](https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth) to the `checkpoints` folder, along with one of the weight files below (a download sketch follows this list).
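
A minimal sketch for fetching both weight files into the paths the repo expects, assuming it is run from the repo root; the URLs are copied from the bullets above.

```python
# Minimal sketch: download the face-detection weights into the expected
# locations. Run from the repo root; URLs are taken from the list above.
import os
import requests

downloads = {
    "face_detection/detection/sfd/s3fd.pth":
        "https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth",
    "checkpoints/mobilenet.pth":
        "https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth",
}

for path, url in downloads.items():
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):  # 1 MiB chunks
                f.write(chunk)
```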

Getting the weights
----------
@@ -55,8 +56,8 @@ The result is saved (by default) in `results/result_voice.mp4`. You can specify

##### Tips for better results:
- Experiment with the `--pads` argument to adjust the detected face bounding box; this often improves results. You might need to increase the bottom padding to include the chin region, e.g. `--pads 0 20 0 0`.
- If the mouth position looks dislocated or you see weird artifacts such as two mouths, the face detections may be over-smoothed. Use the `--nosmooth` argument and give it another try.
- Experiment with the `--resize_factor` argument to get a lower-resolution video. Why? The models are trained on faces at a lower resolution; you might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too).
- The Wav2Lip model without GAN usually needs more experimentation with the two flags above to get the best results, and can sometimes produce a better result as well; a combined invocation is sketched after this list.
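
For illustration, the flags above might be combined as in this sketch; the checkpoint and input file names are placeholders, not outputs of this repo.

```python
# Sketch: one inference run combining the tips above. Checkpoint and input
# file names are placeholders; adjust them to your setup.
import subprocess

subprocess.run([
    "python", "inference.py",
    "--checkpoint_path", "checkpoints/wav2lip_gan.pth",
    "--face", "input_vid.mp4",
    "--audio", "input_audio.wav",
    "--pads", "0", "20", "0", "0",   # extra bottom padding to include the chin
    "--resize_factor", "2",          # halve the input resolution
    "--nosmooth",                    # disable smoothing of face detections
], check=True)
```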

Preparing LRS2 for training
@@ -78,7 +79,7 @@ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` folder
```bash
python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/
```
Additional options like `batch_size` and the number of GPUs to use in parallel can also be set; an example is sketched below.
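
A sketch of a run with explicit parallelism, assuming the script exposes `--ngpu` and `--batch_size` options; verify the flag names with `python preprocess.py --help`.

```python
# Sketch: preprocessing with explicit parallelism. The --ngpu and
# --batch_size flags are assumptions; check `python preprocess.py --help`.
import subprocess

subprocess.run([
    "python", "preprocess.py",
    "--data_root", "data_root/main",
    "--preprocessed_root", "lrs2_preprocessed/",
    "--ngpu", "2",          # number of GPUs to run face detection on
    "--batch_size", "32",   # per-GPU face-detection batch size
], check=True)
```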

##### Preprocessed LRS2 folder structure
```
@@ -99,12 +100,12 @@ You can download [the pre-trained weights](#getting-the-weights) if you want to
```bash
python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints>
```
##### Training the Wav2Lip models
You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run:
```bash
python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_disc_checkpoint>
```

To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar, and in both cases you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional, less commonly used hyper-parameters at the bottom of the `hparams.py` file.
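
For instance, resuming the discriminator-based training might look like the following sketch; the flag names follow the `--help` text mentioned above, and every checkpoint path is a placeholder.

```python
# Sketch: resume training with the visual quality discriminator. Flag names
# follow the script's --help output; all paths are placeholders.
import subprocess

subprocess.run([
    "python", "hq_wav2lip_train.py",
    "--data_root", "lrs2_preprocessed/",
    "--checkpoint_dir", "checkpoints/wav2lip_hq",
    "--syncnet_checkpoint_path", "checkpoints/lipsync_expert.pth",
    # --checkpoint_path resumes from a previously saved generator checkpoint
    "--checkpoint_path", "checkpoints/wav2lip_hq/checkpoint_step000100000.pth",
], check=True)
```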

Training on datasets other than LRS2
------------------------------------
@@ -126,7 +127,7 @@ Please check the `evaluation/` folder for the instructions.

License and Citation
----------
This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at [email protected] or [email protected]. We have an HD model trained on a dataset that allows commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository:
```
@inproceedings{10.1145/3394171.3413532,
author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.},
@@ -147,6 +148,6 @@ series = {MM '20}
```


Acknowledgements
----------
Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial Colab notebook.
225 changes: 225 additions & 0 deletions Wav2Lip_simplified_V5(offline).ipynb
@@ -0,0 +1,225 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f1e90f25",
"metadata": {},
"outputs": [],
"source": [
"# Step 1: Install dependency\n",
"!pip install ffmpeg-python\n",
"\n",
"# Step 2: Clone the Wav2Lip repository\n",
"!git clone https://github.com/justinjohn0306/Wav2Lip\n",
"\n",
"# Step 3: Download pretrained model\n",
"import requests\n",
"url = \"https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA\"\n",
"response = requests.get(url)\n",
"\n",
"with open(\"Wav2Lip/checkpoints/wav2lip_gan.pth\", \"wb\") as f:\n",
" f.write(response.content)\n",
" \n",
"# Step 4: Install the required dependencies for Wav2Lip\n",
"!cd Wav2Lip && pip install -r requirements.txt\n",
"!pip install pyaudio\n",
"\n",
"\n",
"# Step 5: Download pretrained model for face detection\n",
"url = \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\"\n",
"response = requests.get(url)\n",
"\n",
"with open(\"Wav2Lip/face_detection/detection/sfd/s3fd.pth\", \"wb\") as f:\n",
" f.write(response.content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8e86c988",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"from urllib import parse as urlparse\n",
"\n",
"# Step 1: Install yt-dlp\n",
"subprocess.run(['pip', 'install', 'yt-dlp'])\n",
"\n",
"# Step 2: Define YouTube URL and Video ID\n",
"YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY'\n",
"url_data = urlparse.urlparse(YOUTUBE_URL)\n",
"query = urlparse.parse_qs(url_data.query)\n",
"YOUTUBE_ID = query[\"v\"][0]\n",
"\n",
"# Remove previous input video\n",
"if os.path.isfile('input_vid.mp4'):\n",
" os.remove('input_vid.mp4')\n",
"\n",
"# Trim video (start, end) seconds\n",
"start = 35\n",
"end = 62\n",
"interval = end - start\n",
"\n",
"# Step 3: Download and trim the YouTube video\n",
"subprocess.run(['yt-dlp', '-f', 'bestvideo[ext=mp4]', '--output', \"youtube.%(ext)s\", f'https://www.youtube.com/watch?v={YOUTUBE_ID}'])\n",
"\n",
"# Cut the video using FFmpeg\n",
"subprocess.run(['ffmpeg', '-y', '-i', 'youtube.mp4', '-ss', str(start), '-t', str(interval), '-async', '1', 'input_vid.mp4'])\n",
"\n",
"# Display video.\n",
"from IPython.display import HTML\n",
"from base64 import b64encode\n",
"\n",
"def show_video(path):\n",
" mp4 = open(path, 'rb').read()\n",
" data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n",
" return HTML(f\"\"\"<video width=600 controls><source src=\"{data_url}\"></video>\"\"\")\n",
"\n",
"# Preview the trimmed video\n",
"show_video('input_vid.mp4')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7da8e818",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"import os\n",
"from IPython.display import Audio\n",
"from IPython.core.display import display\n",
"\n",
"upload_method = 'Path' # Change this to 'Record' or 'Path'\n",
"\n",
"# Remove previous input audio\n",
"if os.path.isfile('input_audio.wav'):\n",
" os.remove('input_audio.wav')\n",
"\n",
"def display_audio():\n",
" display(Audio('input_audio.wav'))\n",
"\n",
"if upload_method == 'Record':\n",
" import pyaudio\n",
" import wave\n",
"\n",
" CHUNK = 1024\n",
" FORMAT = pyaudio.paInt16\n",
" CHANNELS = 1\n",
" RATE = 16000\n",
" RECORD_SECONDS = 5\n",
" WAVE_OUTPUT_FILENAME = \"input_audio.wav\"\n",
"\n",
" p = pyaudio.PyAudio()\n",
"\n",
" stream = p.open(format=FORMAT,\n",
" channels=CHANNELS,\n",
" rate=RATE,\n",
" input=True,\n",
" frames_per_buffer=CHUNK)\n",
"\n",
" print(\"Recording...\")\n",
"\n",
" frames = []\n",
"\n",
" for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n",
" data = stream.read(CHUNK)\n",
" frames.append(data)\n",
"\n",
" print(\"Finished recording.\")\n",
"\n",
" stream.stop_stream()\n",
" stream.close()\n",
" p.terminate()\n",
"\n",
" wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')\n",
" wf.setnchannels(CHANNELS)\n",
" wf.setsampwidth(p.get_sample_size(FORMAT))\n",
" wf.setframerate(RATE)\n",
" wf.writeframes(b''.join(frames))\n",
" wf.close()\n",
"\n",
" display_audio()\n",
"\n",
"elif upload_method == 'Path':\n",
" # Add the full path to your audio\n",
" PATH_TO_YOUR_AUDIO = 'C:/Users/justi/OneDrive/Desktop/wav2lip/Wav2Lip/input_audio.wav'\n",
"\n",
" # Load audio with specified sampling rate\n",
" import librosa\n",
" audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n",
"\n",
" # Save audio with specified sampling rate\n",
" import soundfile as sf\n",
" sf.write('input_audio.wav', audio, sr, format='wav')\n",
"\n",
" display_audio()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63289945",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# Define the parameters for the Wav2Lip model\n",
"pad_top = 0\n",
"pad_bottom = 10\n",
"pad_left = 0\n",
"pad_right = 0\n",
"rescaleFactor = 1\n",
"nosmooth = False\n",
"\n",
"# Set the path to the Wav2Lip model and input files\n",
"checkpoint_path = \"checkpoints/wav2lip_gan.pth\"\n",
"input_face = \"input_vid.mp4\"\n",
"input_audio = \"input_audio.wav\"\n",
"\n",
"# Run the Wav2Lip model\n",
"!cd Wav2Lip && python inference.py --checkpoint_path {checkpoint_path} --face {input_face} --audio {input_audio} --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} {\"--nosmooth\" if nosmooth else \"\"}\n",
"\n",
"# Preview the output video\n",
"print(\"Final Video Preview\")\n",
"print(\"Find the output video at\", 'Wav2Lip/results/result_voice.mp4')\n",
"show_video('Wav2Lip/results/result_voice.mp4')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fbafa56",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}