diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..d625a97d1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,57 @@ +FROM nvidia/cuda:11.6.2-cudnn8-devel-ubuntu20.04 + +ARG DEBIAN_FRONTEND=noninteractive + +# install python via pyenv +RUN apt-get update && apt-get install -y --no-install-recommends \ + make \ + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + wget \ + curl \ + llvm \ + libncurses5-dev \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libffi-dev \ + liblzma-dev \ + git \ + ca-certificates \ + libgl1 \ + && rm -rf /var/lib/apt/lists/* +ENV PATH="/root/.pyenv/shims:/root/.pyenv/bin:$PATH" +ARG PYTHON_VERSION=3.8 +RUN curl -s -S -L https://raw.githubusercontent.com/pyenv/pyenv-installer/master/bin/pyenv-installer | bash && \ + pyenv install $PYTHON_VERSION && \ + pyenv global $PYTHON_VERSION + +# install cog +RUN pip install cog + +# install deps +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg libsndfile1 \ + && rm -rf /var/lib/apt/lists/* + +# copy to /src +ENV WORKDIR /src +RUN mkdir -p $WORKDIR +WORKDIR $WORKDIR + +# install requirements +COPY requirements.txt . +RUN pip install -r requirements.txt +RUN pip install git+https://github.com/elliottzheng/batch-face.git@master + +# copy sources +COPY . . + +ENV PYTHONUNBUFFERED=1 + +# run cog +CMD python3 -m cog.server.http diff --git a/README.md b/README.md index 76e66198e..f0ac474a0 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to |📑 Original Paper|📰 Project Page|🌀 Demo|⚡ Live Testing|📔 Colab Notebook |:-:|:-:|:-:|:-:|:-:| -[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) /[Updated Collab Notebook](https://colab.research.google.com/drive/1IjFW1cLevs6Ouyu4Yht4mnR4yeuMqO7Y#scrollTo=MH1m608OymLH) +[Paper](http://arxiv.org/abs/2008.10010) | [Project Page](http://cvit.iiit.ac.in/research/projects/cvit-projects/a-lip-sync-expert-is-all-you-need-for-speech-to-lip-generation-in-the-wild/) | [Demo Video](https://youtu.be/0fXaDCZNOJc) | [Interactive Demo](https://bhaasha.iiit.ac.in/lipsync) | [Colab Notebook](https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing) / [Updated Colab Notebook](https://colab.research.google.com/github/justinjohn0306/Wav2Lip/blob/master/Wav2Lip_simplified_v5.ipynb) @@ -27,14 +27,15 @@ This code is part of the paper: _A Lip Sync Expert Is All You Need for Speech to -------- **Disclaimer** -------- -All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should only be used for research/academic/personal purposes only. As the models are trained on the LRS2 dataset, any form of commercial use is strictly prohibited. For commercial requests please contact us directly! +All results from this open-source code or our [demo website](https://bhaasha.iiit.ac.in/lipsync) should be used for research/academic/personal purposes only. As the models are trained on the LRS2 dataset, any form of commercial use is strictly prohibited. For commercial requests, please contact us directly! 
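The Dockerfile above ends with `CMD python3 -m cog.server.http`, so a container built from it serves the model through Cog's HTTP prediction API. A minimal sketch of exercising such a container follows; the `face`/`audio` input names are assumptions (since `predict.py` is not part of this diff), and the build/run commands in the comments are only indicative:

```python
# Sketch: call the Cog HTTP server started by the Dockerfile's CMD.
# Assumes the image was built and started along the lines of
#   docker build -t wav2lip . && docker run --gpus all -p 5000:5000 wav2lip
# and that predict.py (not shown in this diff) exposes "face" and "audio" inputs.
import base64
import requests

def data_uri(path: str, mime: str) -> str:
    # Cog accepts file inputs as URLs or data URIs; encode a local file as the latter.
    with open(path, "rb") as f:
        return f"data:{mime};base64," + base64.b64encode(f.read()).decode()

payload = {
    "input": {
        "face": data_uri("input_vid.mp4", "video/mp4"),    # assumed input name
        "audio": data_uri("input_audio.wav", "audio/wav"),  # assumed input name
    }
}
resp = requests.post("http://localhost:5000/predictions", json=payload)
resp.raise_for_status()
print(resp.json().get("status"), resp.json().get("output"))
```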
Prerequisites ------------- -- `Python 3.6` +- `Python 3.10.15` - ffmpeg: `sudo apt-get install ffmpeg` - Install necessary packages using `pip install -r requirements.txt`. Alternatively, instructions for using a docker image is provided [here](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668). Have a look at [this comment](https://github.com/Rudrabha/Wav2Lip/issues/131#issuecomment-725478562) and comment on [the gist](https://gist.github.com/xenogenesi/e62d3d13dadbc164124c830e9c453668) if you encounter any issues. - Face detection [pre-trained model](https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth) should be downloaded to `face_detection/detection/sfd/s3fd.pth`. Alternative [link](https://iiitaphyd-my.sharepoint.com/:u:/g/personal/prajwal_k_research_iiit_ac_in/EZsy6qWuivtDnANIG73iHjIBjMSoojcIV0NULXV-yiuiIg?e=qTasa8) if the above does not work. +- Add [mobilenet.pth](https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth) to the `checkpoints` folder, along with one of the weights files below. Getting the weights ---------- @@ -55,8 +56,8 @@ The result is saved (by default) in `results/result_voice.mp4`. You can specify ##### Tips for better results: - Experiment with the `--pads` argument to adjust the detected face bounding box. Often leads to improved results. You might need to increase the bottom padding to include the chin region. E.g. `--pads 0 20 0 0`. -- If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try. -- Experiment with the `--resize_factor` argument, to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too). +- If you see the mouth position dislocated or some weird artifacts such as two mouths, then it can be because of over-smoothing the face detections. Use the `--nosmooth` argument and give it another try. +- Experiment with the `--resize_factor` argument, to get a lower-resolution video. Why? The models are trained on faces that were at a lower resolution. You might get better, visually pleasing results for 720p videos than for 1080p videos (in many cases, the latter works well too). - The Wav2Lip model without GAN usually needs more experimenting with the above two to get the most ideal results, and sometimes, can give you a better result as well. Preparing LRS2 for training @@ -78,7 +79,7 @@ Place the LRS2 filelists (train, val, test) `.txt` files in the `filelists/` fol ```bash python preprocess.py --data_root data_root/main --preprocessed_root lrs2_preprocessed/ ``` -Additional options like `batch_size` and the number of GPUs to use in parallel to use can also be set. +Additional options like `batch_size` and the number of GPUs to use in parallel can also be set. ##### Preprocessed LRS2 folder structure ``` @@ -99,12 +100,12 @@ You can download [the pre-trained weights](#getting-the-weights) if you want to python color_syncnet_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> ``` ##### Training the Wav2Lip models -You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). 
For the former, run: +You can either train the model without the additional visual quality discriminator (< 1 day of training) or use the discriminator (~2 days). For the former, run: ```bash python wav2lip_train.py --data_root lrs2_preprocessed/ --checkpoint_dir <folder_to_save_checkpoints> --syncnet_checkpoint_path <path_to_expert_discriminator_checkpoint> ``` -To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file. +To train with the visual quality discriminator, you should run `hq_wav2lip_train.py` instead. The arguments for both files are similar. In both cases, you can resume training as well. Look at `python wav2lip_train.py --help` for more details. You can also set additional less commonly-used hyper-parameters at the bottom of the `hparams.py` file. Training on datasets other than LRS2 ------------------------------------ @@ -126,7 +127,7 @@ Please check the `evaluation/` folder for the instructions. License and Citation ---------- -This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at radrabha.m@research.iiit.ac.in or prajwal.k@research.iiit.ac.in. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository: +This repository can only be used for personal/research/non-commercial purposes. However, for commercial requests, please contact us directly at radrabha.m@research.iiit.ac.in or prajwal.k@research.iiit.ac.in. We have an HD model trained on a dataset allowing commercial usage. The size of the generated face will be 192 x 288 in our new model. Please cite the following paper if you use this repository: ``` @inproceedings{10.1145/3394171.3413532, author = {Prajwal, K R and Mukhopadhyay, Rudrabha and Namboodiri, Vinay P. and Jawahar, C.V.}, @@ -147,6 +148,6 @@ series = {MM '20} ``` -Acknowledgments ---------- -Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial collab notebook. +Acknowledgements ---------- +Parts of the code structure are inspired by this [TTS repository](https://github.com/r9y9/deepvoice3_pytorch). We thank the author for this wonderful code. The code for Face Detection has been taken from the [face_alignment](https://github.com/1adrianb/face-alignment) repository. We thank the authors for releasing their code and models. We thank [zabique](https://github.com/zabique) for the tutorial Colab notebook. 
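Since the README above refers both to checkpoints produced by `wav2lip_train.py`/`hq_wav2lip_train.py` and to the released `.pth` weights, here is a small sketch of loading one of them outside `inference.py`. It assumes the repo's usual save format (weights stored under a `state_dict` key, possibly carrying a DataParallel `module.` prefix, which `inference.py` also strips):

```python
# Sketch: load a Wav2Lip checkpoint for standalone use, assuming this repo's save format.
import torch
from models import Wav2Lip  # model definition shipped with this repository

def load_wav2lip(checkpoint_path: str, device: str) -> Wav2Lip:
    ckpt = torch.load(checkpoint_path, map_location="cpu")
    # Training wraps the model in DataParallel, so keys may be prefixed with "module."
    weights = {k.replace("module.", ""): v for k, v in ckpt["state_dict"].items()}
    model = Wav2Lip()
    model.load_state_dict(weights)
    return model.to(device).eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model = load_wav2lip("checkpoints/wav2lip_gan.pth", device)
```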
diff --git a/Wav2Lip_simplified_V5(offline).ipynb b/Wav2Lip_simplified_V5(offline).ipynb new file mode 100644 index 000000000..8eb6d2f2e --- /dev/null +++ b/Wav2Lip_simplified_V5(offline).ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "f1e90f25", + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Install dependency\n", + "!pip install ffmpeg-python\n", + "\n", + "# Step 2: Clone the Wav2Lip repository\n", + "!git clone https://github.com/justinjohn0306/Wav2Lip\n", + "\n", + "# Step 3: Download pretrained model\n", + "import requests\n", + "url = \"https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA\"\n", + "response = requests.get(url)\n", + "\n", + "with open(\"Wav2Lip/checkpoints/wav2lip_gan.pth\", \"wb\") as f:\n", + " f.write(response.content)\n", + " \n", + "# Step 4: Install the required dependencies for Wav2Lip\n", + "!cd Wav2Lip && pip install -r requirements.txt\n", + "!pip install pyaudio\n", + "\n", + "\n", + "# Step 5: Download pretrained model for face detection\n", + "url = \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\"\n", + "response = requests.get(url)\n", + "\n", + "with open(\"Wav2Lip/face_detection/detection/sfd/s3fd.pth\", \"wb\") as f:\n", + " f.write(response.content)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e86c988", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import subprocess\n", + "from urllib import parse as urlparse\n", + "\n", + "# Step 1: Install yt-dlp\n", + "subprocess.run(['pip', 'install', 'yt-dlp'])\n", + "\n", + "# Step 2: Define YouTube URL and Video ID\n", + "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY'\n", + "url_data = urlparse.urlparse(YOUTUBE_URL)\n", + "query = urlparse.parse_qs(url_data.query)\n", + "YOUTUBE_ID = query[\"v\"][0]\n", + "\n", + "# Remove previous input video\n", + "if os.path.isfile('input_vid.mp4'):\n", + " os.remove('input_vid.mp4')\n", + "\n", + "# Trim video (start, end) seconds\n", + "start = 35\n", + "end = 62\n", + "interval = end - start\n", + "\n", + "# Step 3: Download and trim the YouTube video\n", + "subprocess.run(['yt-dlp', '-f', 'bestvideo[ext=mp4]', '--output', \"youtube.%(ext)s\", f'https://www.youtube.com/watch?v={YOUTUBE_ID}'])\n", + "\n", + "# Cut the video using FFmpeg\n", + "subprocess.run(['ffmpeg', '-y', '-i', 'youtube.mp4', '-ss', str(start), '-t', str(interval), '-async', '1', 'input_vid.mp4'])\n", + "\n", + "# Display video.\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "\n", + "def show_video(path):\n", + " mp4 = open(path, 'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " return HTML(f\"\"\"\"\"\")\n", + "\n", + "# Preview the trimmed video\n", + "show_video('input_vid.mp4')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7da8e818", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "import os\n", + "from IPython.display import Audio\n", + "from IPython.core.display import display\n", + "\n", + "upload_method = 'Path' # Change this to 'Record' or 'Path'\n", + "\n", + "# Remove previous input audio\n", + "if os.path.isfile('input_audio.wav'):\n", + " os.remove('input_audio.wav')\n", + "\n", + "def display_audio():\n", + " display(Audio('input_audio.wav'))\n", + "\n", + "if upload_method == 'Record':\n", + " import 
pyaudio\n", + " import wave\n", + "\n", + " CHUNK = 1024\n", + " FORMAT = pyaudio.paInt16\n", + " CHANNELS = 1\n", + " RATE = 16000\n", + " RECORD_SECONDS = 5\n", + " WAVE_OUTPUT_FILENAME = \"input_audio.wav\"\n", + "\n", + " p = pyaudio.PyAudio()\n", + "\n", + " stream = p.open(format=FORMAT,\n", + " channels=CHANNELS,\n", + " rate=RATE,\n", + " input=True,\n", + " frames_per_buffer=CHUNK)\n", + "\n", + " print(\"Recording...\")\n", + "\n", + " frames = []\n", + "\n", + " for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):\n", + " data = stream.read(CHUNK)\n", + " frames.append(data)\n", + "\n", + " print(\"Finished recording.\")\n", + "\n", + " stream.stop_stream()\n", + " stream.close()\n", + " p.terminate()\n", + "\n", + " wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')\n", + " wf.setnchannels(CHANNELS)\n", + " wf.setsampwidth(p.get_sample_size(FORMAT))\n", + " wf.setframerate(RATE)\n", + " wf.writeframes(b''.join(frames))\n", + " wf.close()\n", + "\n", + " display_audio()\n", + "\n", + "elif upload_method == 'Path':\n", + " # Add the full path to your audio\n", + " PATH_TO_YOUR_AUDIO = 'C:/Users/justi/OneDrive/Desktop/wav2lip/Wav2Lip/input_audio.wav'\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('input_audio.wav', audio, sr, format='wav')\n", + "\n", + " display_audio()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63289945", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Define the parameters for the Wav2Lip model\n", + "pad_top = 0\n", + "pad_bottom = 10\n", + "pad_left = 0\n", + "pad_right = 0\n", + "rescaleFactor = 1\n", + "nosmooth = False\n", + "\n", + "# Set the path to the Wav2Lip model and input files\n", + "checkpoint_path = \"checkpoints/wav2lip_gan.pth\"\n", + "input_face = \"input_vid.mp4\"\n", + "input_audio = \"input_audio.wav\"\n", + "\n", + "# Run the Wav2Lip model\n", + "!cd Wav2Lip && python inference.py --checkpoint_path {checkpoint_path} --face {input_face} --audio {input_audio} --pads {pad_top} {pad_bottom} {pad_left} {pad_right} --resize_factor {rescaleFactor} {\"--nosmooth\" if nosmooth else \"\"}\n", + "\n", + "# Preview the output video\n", + "print(\"Final Video Preview\")\n", + "print(\"Find the output video at\", 'Wav2Lip/results/result_voice.mp4')\n", + "show_video('Wav2Lip/results/result_voice.mp4')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fbafa56", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Wav2Lip_simplified_v4.ipynb b/Wav2Lip_simplified_v4.ipynb new file mode 100644 index 000000000..5cc33bf16 --- /dev/null +++ b/Wav2Lip_simplified_v4.ipynb @@ -0,0 +1,482 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "**Fixes 
by: [justinjohn-03](https://github.com/justinjohn0306)**" + ], + "metadata": { + "id": "9Uyk6DCBGHuW" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U1xFNFU58_2j" + }, + "source": [ + "## Goal: Make anyone speak anything (LipSync)\n", + "\n", + "* Github: https://github.com/Rudrabha/Wav2Lip\n", + "* Paper: https://arxiv.org/abs/2008.10010\n", + "*Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Qgo-oaI3JU2u", + "cellView": "form" + }, + "source": [ + "#@title
Step1: Setup Wav2Lip
\n", + "#@markdown * Install dependency\n", + "#@markdown * Download pretrained model\n", + "!rm -rf /content/sample_data\n", + "!mkdir /content/sample_data\n", + "\n", + "!git clone https://github.com/zabique/Wav2Lip\n", + "\n", + "#download the pretrained model\n", + "!wget 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA' -O '/content/Wav2Lip/checkpoints/wav2lip_gan.pth'\n", + "a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl\n", + "\n", + "# !pip uninstall tensorflow tensorflow-gpu\n", + "!cd Wav2Lip && pip install -r requirements.txt\n", + "\n", + "#download pretrained model for face detection\n", + "!wget \"https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth\" -O \"/content/Wav2Lip/face_detection/detection/sfd/s3fd.pth\"\n", + "\n", + "!pip install -q youtube-dl\n", + "!pip install ffmpeg-python\n", + "!pip install librosa==0.9.1\n", + "\n", + "#this code for recording audio\n", + "\"\"\"\n", + "To write this piece of code I took inspiration/code from a lot of places.\n", + "It was late night, so I'm not sure how much I created or just copied o.O\n", + "Here are some of the possible references:\n", + "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n", + "https://stackoverflow.com/a/18650249\n", + "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n", + "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n", + "https://stackoverflow.com/a/49019356\n", + "\"\"\"\n", + "from IPython.display import HTML, Audio\n", + "from google.colab.output import eval_js\n", + "from base64 import b64decode\n", + "import numpy as np\n", + "from scipy.io.wavfile import read as wav_read\n", + "import io\n", + "import ffmpeg\n", + "\n", + "AUDIO_HTML = \"\"\"\n", + "\n", + "\"\"\"\n", + "\n", + "%cd /\n", + "from ghc.l_ghc_cf import l_ghc_cf\n", + "%cd content\n", + "\n", + "def get_audio():\n", + " display(HTML(AUDIO_HTML))\n", + " data = eval_js(\"data\")\n", + " binary = b64decode(data.split(',')[1])\n", + " \n", + " process = (ffmpeg\n", + " .input('pipe:0')\n", + " .output('pipe:1', format='wav')\n", + " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", + " )\n", + " output, err = process.communicate(input=binary)\n", + " \n", + " riff_chunk_size = len(output) - 8\n", + " # Break up the chunk size into four bytes, held in b.\n", + " q = riff_chunk_size\n", + " b = []\n", + " for i in range(4):\n", + " q, r = divmod(q, 256)\n", + " b.append(r)\n", + "\n", + " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n", + " riff = output[:4] + bytes(b) + output[8:]\n", + "\n", + " sr, audio = wav_read(io.BytesIO(riff))\n", + "\n", + " return audio, sr\n", + "\n", + "\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "def showVideo(path):\n", + " mp4 = open(str(path),'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " return HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url)\n", + "\n", + "from IPython.display import clear_output" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEdy6PWDXMRL" + }, + "source": [ + "# LipSync Youtube Video" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "QI4kcm8QEeGZ", + "cellView": "form" + }, + 
"source": [ + "#@title STEP2: Select a Youtube Video\n", + "# Install yt-dlp\n", + "!pip install yt-dlp\n", + "\n", + "#@markdown ### Find YouTube video ID from URL\n", + "from urllib import parse as urlparse\n", + "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY' #@param {type:\"string\"}\n", + "url_data = urlparse.urlparse(YOUTUBE_URL)\n", + "query = urlparse.parse_qs(url_data.query)\n", + "YOUTUBE_ID = query[\"v\"][0]\n", + "\n", + "#@markdown ### Trim the video (start, end) seconds\n", + "start = 35 #@param {type:\"integer\"}\n", + "end = 62 #@param {type:\"integer\"}\n", + "interval = end - start\n", + "\n", + "# Download the YouTube video using yt-dlp\n", + "!yt-dlp -f 'bestvideo[ext=mp4]' --output \"youtube.%(ext)s\" https://www.youtube.com/watch?v=$YOUTUBE_ID\n", + "\n", + "# Cut the video using FFmpeg\n", + "!ffmpeg -y -i youtube.mp4 -ss {start} -t {interval} -async 1 /content/sample_data/input_vid.mp4\n", + "\n", + "# Preview the trimmed video\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "mp4 = open('/content/sample_data/input_vid.mp4','rb').read()\n", + "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + "HTML(f\"\"\"\"\"\")\n", + "\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "zS_RAeh-IfZy", + "cellView": "form" + }, + "source": [ + "#@title STEP3: Select Audio (Record or Upload)\n", + "from IPython.display import Audio \n", + "from IPython.core.display import display\n", + "\n", + "record_or_upload = 'Upload' #@param ['Record', 'Upload']\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "if record_or_upload == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "elif record_or_upload == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + " \n", + " #concider only the first file\n", + " audio_file = str(list(uploaded.keys())[0])\n", + " \n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(audio_file, sr=None)\n", + " \n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + " \n", + " clear_output()\n", + " displayAudio()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "BQPLXJ8L0gms", + "cellView": "form" + }, + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = False #@param {type:\"boolean\"}\n", + "\n", + "\n", + "if nosmooth == False:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path 
checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "#Preview output video\n", + "clear_output()\n", + "print(\"Final Video Preview\")\n", + "print(\"Download this video from\", '/content/Wav2Lip/results/result_voice.mp4')\n", + "showVideo('/content/Wav2Lip/results/result_voice.mp4')\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vYxpPeie1CYL" + }, + "source": [ + "# LipSync on Your Video File" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "nDuM7tfZ1F0t", + "cellView": "form" + }, + "source": [ + "import os\n", + "from google.colab import files\n", + "from IPython.display import HTML\n", + "\n", + "def showVideo(file_path):\n", + " \"\"\"Function to display video in Colab\"\"\"\n", + " mp4 = open(file_path,'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " display(HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url))\n", + "\n", + "#@markdown ### Select an uploading method\n", + "upload_or_path = \"Upload\" #@param [\"Upload\", \"Custom Path\"]\n", + "\n", + "if upload_or_path == \"Upload\":\n", + " uploaded = files.upload()\n", + " for filename in uploaded.keys():\n", + " os.rename(filename, '/content/sample_data/input_vid.mp4')\n", + " PATH_TO_YOUR_VIDEO = '/content/sample_data/input_vid.mp4'\n", + "else:\n", + " PATH_TO_YOUR_VIDEO = '/content/test.mp4' #@param {type:\"string\"}\n", + " if not os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " print(\"ERROR: File not found!\")\n", + " raise SystemExit(0)\n", + "\n", + "#@markdown ### Trim the video (start, end) seconds\n", + "start_time = 0 #@param {type:\"integer\"}\n", + "end_time = 0 #@param {type:\"integer\"}\n", + "\n", + "if start_time == 0 and end_time == 0:\n", + " print(\"No trimming applied\")\n", + "else:\n", + " duration = end_time - start_time\n", + " os.system(f\"ffmpeg -i {PATH_TO_YOUR_VIDEO} -ss {start_time} -t {duration} -async 1 /content/sample_data/trimmed_vid.mp4\")\n", + " PATH_TO_YOUR_VIDEO = \"/content/sample_data/input_vid.mp4\"\n", + " print(f\"Video trimmed from {start_time} to {end_time} seconds\")\n", + "\n", + "print(f\"PATH_TO_YOUR_VIDEO: {PATH_TO_YOUR_VIDEO}\")\n", + "\n", + "if upload_or_path == \"Upload\":\n", + " clear_output()\n", + " print(\"Input Video\")\n", + " showVideo(PATH_TO_YOUR_VIDEO)\n", + "else:\n", + " if os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " print(\"Input Video\")\n", + " showVideo(PATH_TO_YOUR_VIDEO)\n" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "XgF4794r7sWK", + "cellView": "form" + }, + "source": [ + "#@title STEP3: Select Audio (Record or Upload)\n", + "from IPython.display import Audio \n", + "from IPython.core.display import display\n", + "\n", + "record_or_upload = 'Upload' #@param ['Record', 'Upload']\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "if record_or_upload == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "elif record_or_upload == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + " \n", + " 
#concider only the first file\n", + " audio_file = str(list(uploaded.keys())[0])\n", + " \n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(audio_file, sr=None)\n", + " \n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + " \n", + " clear_output()\n", + " displayAudio()" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "ZgtO08V28ANf", + "cellView": "form" + }, + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = False #@param {type:\"boolean\"}\n", + "\n", + "if nosmooth == False:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !cd Wav2Lip && python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "\n", + "#Preview output video\n", + "clear_output()\n", + "print(\"Final Video Preview\")\n", + "print(\"Dowload this video from\", '/content/Wav2Lip/results/result_voice.mp4')\n", + "showVideo('/content/Wav2Lip/results/result_voice.mp4')\n" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/Wav2Lip_simplified_v5.ipynb b/Wav2Lip_simplified_v5.ipynb new file mode 100644 index 000000000..308f3bd76 --- /dev/null +++ b/Wav2Lip_simplified_v5.ipynb @@ -0,0 +1,645 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "U1xFNFU58_2j" + }, + "source": [ + "## Goal: Make anyone speak anything (LipSync)\n", + "\n", + "* Github: https://github.com/Rudrabha/Wav2Lip\n", + "* Paper: https://arxiv.org/abs/2008.10010\n", + "*Original notebook: https://colab.research.google.com/drive/1tZpDWXz49W6wDcTprANRGLo2D_EbD5J8?usp=sharing\n", + "\n", + "\n", + "\n", + "\n", + "**Modded by: [justinjohn-03](https://github.com/justinjohn0306)**\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "Qgo-oaI3JU2u" + }, + "outputs": [], + "source": [ + "#@title
Step1: Setup Wav2Lip
\n", + "#@markdown * Install dependency\n", + "#@markdown * Download pretrained model\n", + "from IPython.display import HTML, clear_output\n", + "!rm -rf /content/sample_data\n", + "!mkdir /content/sample_data\n", + "\n", + "!git clone https://github.com/justinjohn0306/Wav2Lip\n", + "\n", + "%cd /content/Wav2Lip\n", + "\n", + "#download the pretrained model\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip.pth' -O 'checkpoints/wav2lip.pth'\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/wav2lip_gan.pth' -O 'checkpoints/wav2lip_gan.pth'\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/resnet50.pth' -O 'checkpoints/resnet50.pth'\n", + "!wget 'https://github.com/justinjohn0306/Wav2Lip/releases/download/models/mobilenet.pth' -O 'checkpoints/mobilenet.pth'\n", + "a = !pip install https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl\n", + "!pip install git+https://github.com/elliottzheng/batch-face.git@master\n", + "\n", + "!pip install ffmpeg-python mediapipe==0.10.18\n", + "\n", + "#this code for recording audio\n", + "\"\"\"\n", + "To write this piece of code I took inspiration/code from a lot of places.\n", + "It was late night, so I'm not sure how much I created or just copied o.O\n", + "Here are some of the possible references:\n", + "https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/\n", + "https://stackoverflow.com/a/18650249\n", + "https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/\n", + "https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/\n", + "https://stackoverflow.com/a/49019356\n", + "\"\"\"\n", + "from IPython.display import HTML, Audio\n", + "from google.colab.output import eval_js\n", + "from base64 import b64decode\n", + "import numpy as np\n", + "from scipy.io.wavfile import read as wav_read\n", + "import io\n", + "import ffmpeg\n", + "\n", + "AUDIO_HTML = \"\"\"\n", + "\n", + "\"\"\"\n", + "\n", + "%cd /\n", + "from ghc.l_ghc_cf import l_ghc_cf\n", + "%cd content\n", + "\n", + "def get_audio():\n", + " display(HTML(AUDIO_HTML))\n", + " data = eval_js(\"data\")\n", + " binary = b64decode(data.split(',')[1])\n", + "\n", + " process = (ffmpeg\n", + " .input('pipe:0')\n", + " .output('pipe:1', format='wav')\n", + " .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)\n", + " )\n", + " output, err = process.communicate(input=binary)\n", + "\n", + " riff_chunk_size = len(output) - 8\n", + " # Break up the chunk size into four bytes, held in b.\n", + " q = riff_chunk_size\n", + " b = []\n", + " for i in range(4):\n", + " q, r = divmod(q, 256)\n", + " b.append(r)\n", + "\n", + " # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.\n", + " riff = output[:4] + bytes(b) + output[8:]\n", + "\n", + " sr, audio = wav_read(io.BytesIO(riff))\n", + "\n", + " return audio, sr\n", + "\n", + "\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "def showVideo(path):\n", + " mp4 = open(str(path),'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " return HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url)\n", + "\n", + "from IPython.display import clear_output\n", + "\n", + "clear_output()\n", + "print(\"All set and ready!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SEdy6PWDXMRL" + }, + "source": [ + "# LipSync Youtube 
Video" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "QI4kcm8QEeGZ" + }, + "outputs": [], + "source": [ + "#@title STEP2: Select a Youtube Video\n", + "# Install yt-dlp\n", + "\n", + "import os\n", + "!pip install yt-dlp\n", + "\n", + "#@markdown ## Find YouTube video ID from URL\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown Link format:\n", + "\n", + "#@markdown ``https://youtu.be/vAnWYLTdvfY`` ❌\n", + "\n", + "#@markdown ``https://www.youtube.com/watch?v=vAnWYLTdvfY`` ✔️\n", + "\n", + "!rm -df youtube.mp4\n", + "\n", + "#@markdown ___\n", + "from urllib import parse as urlparse\n", + "YOUTUBE_URL = 'https://www.youtube.com/watch?v=vAnWYLTdvfY' #@param {type:\"string\"}\n", + "url_data = urlparse.urlparse(YOUTUBE_URL)\n", + "query = urlparse.parse_qs(url_data.query)\n", + "YOUTUBE_ID = query[\"v\"][0]\n", + "\n", + "\n", + "# remove previous input video\n", + "!rm -f /content/sample_data/input_vid.mp4\n", + "\n", + "\n", + "#@markdown ___\n", + "\n", + "#@markdown ### Trim the video (start, end) seconds\n", + "start = 35 #@param {type:\"integer\"}\n", + "end = 62 #@param {type:\"integer\"}\n", + "interval = end - start\n", + "\n", + "#@markdown Note: ``the trimmed video must have face on all frames``\n", + "\n", + "# Download the YouTube video using yt-dlp\n", + "!yt-dlp -f 'bestvideo[ext=mp4]' --output \"youtube.%(ext)s\" https://www.youtube.com/watch?v=$YOUTUBE_ID\n", + "\n", + "# Cut the video using FFmpeg\n", + "!ffmpeg -y -i youtube.mp4 -ss {start} -t {interval} -async 1 /content/sample_data/input_vid.mp4\n", + "\n", + "# Preview the trimmed video\n", + "from IPython.display import HTML\n", + "from base64 import b64encode\n", + "mp4 = open('/content/sample_data/input_vid.mp4','rb').read()\n", + "data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + "HTML(f\"\"\"\"\"\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "zS_RAeh-IfZy" + }, + "outputs": [], + "source": [ + "#@title STEP3: Select Audio (Record, Upload from local drive or Gdrive)\n", + "import os\n", + "from IPython.display import Audio\n", + "from IPython.core.display import display\n", + "\n", + "upload_method = 'Upload' #@param ['Record', 'Upload', 'Custom Path']\n", + "\n", + "#remove previous input audio\n", + "if os.path.isfile('/content/sample_data/input_audio.wav'):\n", + " os.remove('/content/sample_data/input_audio.wav')\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "\n", + "if upload_method == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "\n", + "elif upload_method == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + "\n", + " # Consider only the first file\n", + " PATH_TO_YOUR_AUDIO = str(list(uploaded.keys())[0])\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n", + "\n", + "elif 
upload_method == 'Custom Path':\n", + " from google.colab import drive\n", + " drive.mount('/content/drive')\n", + " #@markdown ``Add the full path to your audio on your Gdrive`` 👇\n", + " PATH_TO_YOUR_AUDIO = '/content/drive/MyDrive/test.wav' #@param {type:\"string\"}\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "BQPLXJ8L0gms" + }, + "outputs": [], + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "\n", + "%cd /content/Wav2Lip\n", + "\n", + "# Set up paths and variables for the output file\n", + "output_file_path = '/content/Wav2Lip/results/result_voice.mp4'\n", + "\n", + "# Delete existing output file before processing, if any\n", + "if os.path.exists(output_file_path):\n", + " os.remove(output_file_path)\n", + "\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = True #@param {type:\"boolean\"}\n", + "#@markdown ___\n", + "#@markdown Model selection:\n", + "use_hd_model = False #@param {type:\"boolean\"}\n", + "checkpoint_path = 'checkpoints/wav2lip.pth' if not use_hd_model else 'checkpoints/wav2lip_gan.pth'\n", + "\n", + "\n", + "if nosmooth == False:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "\n", + "#Preview output video\n", + "if os.path.exists(output_file_path):\n", + " clear_output()\n", + " print(\"Final Video Preview\")\n", + " print(\"Download this video from\", output_file_path)\n", + " showVideo(output_file_path)\n", + "else:\n", + " print(\"Processing failed. 
Output video not found.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vYxpPeie1CYL" + }, + "source": [ + "# LipSync on Your Video File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "nDuM7tfZ1F0t" + }, + "outputs": [], + "source": [ + "import os\n", + "import shutil\n", + "from google.colab import drive\n", + "from google.colab import files\n", + "from IPython.display import HTML, clear_output\n", + "from base64 import b64encode\n", + "import moviepy.editor as mp\n", + "\n", + "\n", + "def showVideo(file_path):\n", + " \"\"\"Function to display video in Colab\"\"\"\n", + " mp4 = open(file_path,'rb').read()\n", + " data_url = \"data:video/mp4;base64,\" + b64encode(mp4).decode()\n", + " display(HTML(\"\"\"\n", + " \n", + " \"\"\" % data_url))\n", + "\n", + "def get_video_resolution(video_path):\n", + " \"\"\"Function to get the resolution of a video\"\"\"\n", + " import cv2\n", + " video = cv2.VideoCapture(video_path)\n", + " width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))\n", + " height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))\n", + " return (width, height)\n", + "\n", + "def resize_video(video_path, new_resolution):\n", + " \"\"\"Function to resize a video\"\"\"\n", + " import cv2\n", + " video = cv2.VideoCapture(video_path)\n", + " fourcc = int(video.get(cv2.CAP_PROP_FOURCC))\n", + " fps = video.get(cv2.CAP_PROP_FPS)\n", + " width, height = new_resolution\n", + " output_path = os.path.splitext(video_path)[0] + '_720p.mp4'\n", + " writer = cv2.VideoWriter(output_path, fourcc, fps, (width, height))\n", + " while True:\n", + " success, frame = video.read()\n", + " if not success:\n", + " break\n", + " resized_frame = cv2.resize(frame, new_resolution)\n", + " writer.write(resized_frame)\n", + " video.release()\n", + " writer.release()\n", + "\n", + "# Mount Google Drive if it's not already mounted\n", + "if not os.path.isdir(\"/content/drive/MyDrive\"):\n", + " drive.mount('/content/drive', force_remount=True)\n", + "\n", + "#@markdown ### Select an uploading method\n", + "upload_method = \"Upload\" #@param [\"Upload\", \"Custom Path\"]\n", + "\n", + "\n", + "# remove previous input video\n", + "if os.path.isfile('/content/sample_data/input_vid.mp4'):\n", + " os.remove('/content/sample_data/input_vid.mp4')\n", + "\n", + "if upload_method == \"Upload\":\n", + " uploaded = files.upload()\n", + " for filename in uploaded.keys():\n", + " os.rename(filename, '/content/sample_data/input_vid.mp4')\n", + " PATH_TO_YOUR_VIDEO = '/content/sample_data/input_vid.mp4'\n", + "\n", + "elif upload_method == 'Custom Path':\n", + " #@markdown ``Add the full path to your video on your Gdrive `` 👇\n", + " PATH_TO_YOUR_VIDEO = '/content/drive/MyDrive/test.mp4' #@param {type:\"string\"}\n", + " if not os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " print(\"ERROR: File not found!\")\n", + " raise SystemExit(0)\n", + "\n", + "#@markdown Notes:\n", + "\n", + "#@markdown . ``If your uploaded video is 1080p or higher resolution, this cell will resize it to 720p.``\n", + "\n", + "#@markdown . ``Do not upload videos longer than 60 seconds.``\n", + "\n", + "#@markdown ___\n", + "\n", + "video_duration = mp.VideoFileClip(PATH_TO_YOUR_VIDEO).duration\n", + "if video_duration > 60:\n", + " print(\"WARNING: Video duration exceeds 60 seconds. 
Please upload a shorter video.\")\n", + " raise SystemExit(0)\n", + "\n", + "video_resolution = get_video_resolution(PATH_TO_YOUR_VIDEO)\n", + "print(f\"Video resolution: {video_resolution}\")\n", + "if video_resolution[0] >= 1920 or video_resolution[1] >= 1080:\n", + " print(\"Resizing video to 720p...\")\n", + " os.system(f\"ffmpeg -i {PATH_TO_YOUR_VIDEO} -vf scale=1280:720 /content/sample_data/input_vid.mp4\")\n", + " PATH_TO_YOUR_VIDEO = \"/content/sample_data/input_vid.mp4\"\n", + " print(\"Video resized to 720p\")\n", + "else:\n", + " print(\"No resizing needed\")\n", + "\n", + "if upload_method == \"Upload\":\n", + " clear_output()\n", + " print(\"Input Video\")\n", + " showVideo(PATH_TO_YOUR_VIDEO)\n", + "else:\n", + " if os.path.isfile(PATH_TO_YOUR_VIDEO):\n", + " # Check if the source and destination files are the same\n", + " if PATH_TO_YOUR_VIDEO != \"/content/sample_data/input_vid.mp4\":\n", + " shutil.copyfile(PATH_TO_YOUR_VIDEO, \"/content/sample_data/input_vid.mp4\")\n", + " print(\"Video copied to destination.\")\n", + "\n", + " print(\"Input Video\")\n", + " # Display the video from the destination path\n", + " showVideo(\"/content/sample_data/input_vid.mp4\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "XgF4794r7sWK" + }, + "outputs": [], + "source": [ + "#@title STEP3: Select Audio (Record, Upload from local drive or Gdrive)\n", + "import os\n", + "from IPython.display import Audio\n", + "from IPython.core.display import display\n", + "\n", + "upload_method = 'Upload' #@param ['Record', 'Upload', 'Custom Path']\n", + "\n", + "#remove previous input audio\n", + "if os.path.isfile('/content/sample_data/input_audio.wav'):\n", + " os.remove('/content/sample_data/input_audio.wav')\n", + "\n", + "def displayAudio():\n", + " display(Audio('/content/sample_data/input_audio.wav'))\n", + "\n", + "if upload_method == 'Record':\n", + " audio, sr = get_audio()\n", + " import scipy\n", + " scipy.io.wavfile.write('/content/sample_data/input_audio.wav', sr, audio)\n", + "\n", + "elif upload_method == 'Upload':\n", + " from google.colab import files\n", + " uploaded = files.upload()\n", + " for fn in uploaded.keys():\n", + " print('User uploaded file \"{name}\" with length {length} bytes.'.format(\n", + " name=fn, length=len(uploaded[fn])))\n", + "\n", + " # Consider only the first file\n", + " PATH_TO_YOUR_AUDIO = str(list(uploaded.keys())[0])\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n", + "\n", + "else: # Custom Path\n", + " from google.colab import drive\n", + " drive.mount('/content/drive')\n", + " #@markdown ``Add the full path to your audio on your Gdrive`` 👇\n", + " PATH_TO_YOUR_AUDIO = '/content/drive/MyDrive/test.wav' #@param {type:\"string\"}\n", + "\n", + " # Load audio with specified sampling rate\n", + " import librosa\n", + " audio, sr = librosa.load(PATH_TO_YOUR_AUDIO, sr=None)\n", + "\n", + " # Save audio with specified sampling rate\n", + " import soundfile as sf\n", + " sf.write('/content/sample_data/input_audio.wav', audio, sr, format='wav')\n", + "\n", + " clear_output()\n", + " displayAudio()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"cellView": "form", + "id": "ZgtO08V28ANf" + }, + "outputs": [], + "source": [ + "#@title STEP4: Start Crunching and Preview Output\n", + "#@markdown Note: Only change these, if you have to\n", + "\n", + "%cd /content/Wav2Lip\n", + "\n", + "# Set up paths and variables for the output file\n", + "output_file_path = '/content/Wav2Lip/results/result_voice.mp4'\n", + "\n", + "# Delete existing output file before processing, if any\n", + "if os.path.exists(output_file_path):\n", + " os.remove(output_file_path)\n", + "\n", + "pad_top = 0#@param {type:\"integer\"}\n", + "pad_bottom = 10#@param {type:\"integer\"}\n", + "pad_left = 0#@param {type:\"integer\"}\n", + "pad_right = 0#@param {type:\"integer\"}\n", + "rescaleFactor = 1#@param {type:\"integer\"}\n", + "nosmooth = True #@param {type:\"boolean\"}\n", + "#@markdown ___\n", + "#@markdown Model selection:\n", + "use_hd_model = False #@param {type:\"boolean\"}\n", + "checkpoint_path = 'checkpoints/wav2lip.pth' if not use_hd_model else 'checkpoints/wav2lip_gan.pth'\n", + "\n", + "\n", + "if nosmooth == False:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor\n", + "else:\n", + " !python inference.py --checkpoint_path $checkpoint_path --face \"../sample_data/input_vid.mp4\" --audio \"../sample_data/input_audio.wav\" --pads $pad_top $pad_bottom $pad_left $pad_right --resize_factor $rescaleFactor --nosmooth\n", + "\n", + "#Preview output video\n", + "if os.path.exists(output_file_path):\n", + " clear_output()\n", + " print(\"Final Video Preview\")\n", + " print(\"Download this video from\", output_file_path)\n", + " showVideo(output_file_path)\n", + "else:\n", + " print(\"Processing failed. 
Output video not found.\")" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "private_outputs": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/audio.py b/audio.py index 32b20c449..32ab5fabe 100644 --- a/audio.py +++ b/audio.py @@ -97,7 +97,7 @@ def _linear_to_mel(spectogram): def _build_mel_basis(): assert hp.fmax <= hp.sample_rate // 2 - return librosa.filters.mel(hp.sample_rate, hp.n_fft, n_mels=hp.num_mels, + return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels, fmin=hp.fmin, fmax=hp.fmax) def _amp_to_db(x): diff --git a/cog.yaml b/cog.yaml new file mode 100644 index 000000000..f188727d7 --- /dev/null +++ b/cog.yaml @@ -0,0 +1,35 @@ +# Configuration for Cog ⚙️ +# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md + +image: r8.im/devxpy/cog-wav2lip + +build: + # set to true if your model requires a GPU + gpu: true + cuda: "11.6.2" + + # a list of ubuntu apt packages to install + system_packages: + - ffmpeg + - cmake + + # python version in the form '3.8' or '3.8.12' + python_version: "3.8" + + # a list of packages in the format == + python_packages: + - numpy==1.23.4 + - librosa==0.7.0 + - opencv-python==4.6.0.66 + - torch==1.12.1+cu116 --extra-index-url=https://download.pytorch.org/whl/cu116 + - torchvision==0.13.1+cu116 --extra-index-url=https://download.pytorch.org/whl/cu116 + - tqdm==4.45.0 + - numba==0.48 + - mediapipe==0.8.11 + + # commands run after the environment is setup + run: + - pip install git+https://github.com/elliottzheng/batch-face.git@master + +# predict.py defines how predictions are run on your model +predict: "predict.py:Predictor" diff --git a/face_detect.py b/face_detect.py new file mode 100644 index 000000000..fd35da2a1 --- /dev/null +++ b/face_detect.py @@ -0,0 +1,55 @@ +import cv2 +import mediapipe as mp + +mp_face_mesh = mp.solutions.face_mesh +mp_drawing = mp.solutions.drawing_utils +mp_drawing_styles = mp.solutions.drawing_styles +mp_face_detection = mp.solutions.face_detection + + +def face_rect(images): + with mp_face_detection.FaceDetection( + model_selection=1, min_detection_confidence=0.5 + ) as face_detection: + for image_cv2 in images: + # Convert the BGR image to RGB and process it with MediaPipe Face Detection. + results = face_detection.process(cv2.cvtColor(image_cv2, cv2.COLOR_BGR2RGB)) + + # Draw face detections of each face. + if not results.detections: + yield None + for detection in results.detections: + yield _get_bounding_rect(image_cv2, detection) + + +def _get_bounding_rect( + image: mp_drawing.np.ndarray, + detection: mp_drawing.detection_pb2.Detection, +): + """ + Stolen from mediapipe.solutions.drawing_utils.draw_detection() + """ + if not detection.location_data: + return + if image.shape[2] != mp_drawing._BGR_CHANNELS: + raise ValueError("Input image must contain three channel bgr data.") + image_rows, image_cols, _ = image.shape + + location = detection.location_data + + # get bounding box if exists. 
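+    # relative_bounding_box is given in normalized [0, 1] coordinates; _normalized_to_pixel_coordinates below converts the two corner points to absolute pixel values.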
+ if not location.HasField("relative_bounding_box"): + return + relative_bounding_box = location.relative_bounding_box + rect_start_point = mp_drawing._normalized_to_pixel_coordinates( + relative_bounding_box.xmin, relative_bounding_box.ymin, image_cols, image_rows + ) + rect_end_point = mp_drawing._normalized_to_pixel_coordinates( + relative_bounding_box.xmin + relative_bounding_box.width, + relative_bounding_box.ymin + relative_bounding_box.height, + image_cols, + image_rows, + ) + + return *rect_start_point, *rect_end_point + diff --git a/face_detection/detection/sfd/sfd_detector.py b/face_detection/detection/sfd/sfd_detector.py index 8fbce1525..d1776e4bf 100644 --- a/face_detection/detection/sfd/sfd_detector.py +++ b/face_detection/detection/sfd/sfd_detector.py @@ -14,8 +14,9 @@ class SFDDetector(FaceDetector): - def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False): - super(SFDDetector, self).__init__(device, verbose) + @classmethod + def load_model(cls, device): + path_to_detector = os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth') # Initialise the face detector if not os.path.isfile(path_to_detector): @@ -23,10 +24,10 @@ def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path else: model_weights = torch.load(path_to_detector) - self.face_detector = s3fd() - self.face_detector.load_state_dict(model_weights) - self.face_detector.to(device) - self.face_detector.eval() + cls.face_detector = s3fd() + cls.face_detector.load_state_dict(model_weights) + cls.face_detector.to(device) + cls.face_detector.eval() def detect_from_image(self, tensor_or_path): image = self.tensor_or_path_to_ndarray(tensor_or_path) diff --git a/inference.py b/inference.py index 90692521e..5e1522d25 100644 --- a/inference.py +++ b/inference.py @@ -1,280 +1,327 @@ -from os import listdir, path +import argparse +import math +import os +import platform +import subprocess + +import cv2 import numpy as np -import scipy, cv2, os, sys, argparse, audio -import json, subprocess, random, string +import torch from tqdm import tqdm -from glob import glob -import torch, face_detection + +import audio +# from face_detect import face_rect from models import Wav2Lip -import platform + +from batch_face import RetinaFace +from time import time parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models') parser.add_argument('--checkpoint_path', type=str, - help='Name of saved checkpoint to load weights from', required=True) + help='Name of saved checkpoint to load weights from', required=True) parser.add_argument('--face', type=str, - help='Filepath of video/image that contains faces to use', required=True) + help='Filepath of video/image that contains faces to use', required=True) parser.add_argument('--audio', type=str, - help='Filepath of video/audio file to use as raw audio source', required=True) + help='Filepath of video/audio file to use as raw audio source', required=True) parser.add_argument('--outfile', type=str, help='Video path to save result. 
See default for an e.g.', - default='results/result_voice.mp4') + default='results/result_voice.mp4') parser.add_argument('--static', type=bool, - help='If True, then use only first video frame for inference', default=False) + help='If True, then use only first video frame for inference', default=False) parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)', - default=25., required=False) + default=25., required=False) parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0], - help='Padding (top, bottom, left, right). Please adjust to include chin at least') + help='Padding (top, bottom, left, right). Please adjust to include chin at least') -parser.add_argument('--face_det_batch_size', type=int, - help='Batch size for face detection', default=16) parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip model(s)', default=128) -parser.add_argument('--resize_factor', default=1, type=int, - help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p') +parser.add_argument('--resize_factor', default=1, type=int, + help='Reduce the resolution by this factor. Sometimes, best results are obtained at 480p or 720p') + +parser.add_argument('--out_height', default=480, type=int, + help='Output video height. Best results are obtained at 480 or 720') -parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1], - help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. ' - 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width') +parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1], + help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. ' + 'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width') parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1], - help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.' - 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).') + help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected.' + 'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).') parser.add_argument('--rotate', default=False, action='store_true', - help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.' - 'Use if you get a flipped result, despite feeding a normal looking video') + help='Sometimes videos taken from a phone can be flipped 90deg. If true, will flip video right by 90deg.' 
+ 'Use if you get a flipped result, despite feeding a normal looking video') parser.add_argument('--nosmooth', default=False, action='store_true', - help='Prevent smoothing face detections over a short temporal window') + help='Prevent smoothing face detections over a short temporal window') -args = parser.parse_args() -args.img_size = 96 - -if os.path.isfile(args.face) and args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: - args.static = True def get_smoothened_boxes(boxes, T): - for i in range(len(boxes)): - if i + T > len(boxes): - window = boxes[len(boxes) - T:] - else: - window = boxes[i : i + T] - boxes[i] = np.mean(window, axis=0) - return boxes + for i in range(len(boxes)): + if i + T > len(boxes): + window = boxes[len(boxes) - T:] + else: + window = boxes[i : i + T] + boxes[i] = np.mean(window, axis=0) + return boxes def face_detect(images): - detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, - flip_input=False, device=device) - - batch_size = args.face_det_batch_size - - while 1: - predictions = [] - try: - for i in tqdm(range(0, len(images), batch_size)): - predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size]))) - except RuntimeError: - if batch_size == 1: - raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument') - batch_size //= 2 - print('Recovering from OOM error; New batch size: {}'.format(batch_size)) - continue - break - - results = [] - pady1, pady2, padx1, padx2 = args.pads - for rect, image in zip(predictions, images): - if rect is None: - cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected. - raise ValueError('Face not detected! Ensure the video contains a face in all the frames.') - - y1 = max(0, rect[1] - pady1) - y2 = min(image.shape[0], rect[3] + pady2) - x1 = max(0, rect[0] - padx1) - x2 = min(image.shape[1], rect[2] + padx2) - - results.append([x1, y1, x2, y2]) - - boxes = np.array(results) - if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5) - results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)] - - del detector - return results + results = [] + pady1, pady2, padx1, padx2 = args.pads + + s = time() + + for image, rect in zip(images, face_rect(images)): + if rect is None: + cv2.imwrite('temp/faulty_frame.jpg', image) # check this frame where the face was not detected. + raise ValueError('Face not detected! 
Ensure the video contains a face in all the frames.') + + y1 = max(0, rect[1] - pady1) + y2 = min(image.shape[0], rect[3] + pady2) + x1 = max(0, rect[0] - padx1) + x2 = min(image.shape[1], rect[2] + padx2) + + results.append([x1, y1, x2, y2]) + + print('face detect time:', time() - s) + + boxes = np.array(results) + if not args.nosmooth: boxes = get_smoothened_boxes(boxes, T=5) + results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)] + + return results + def datagen(frames, mels): - img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] + img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] - if args.box[0] == -1: - if not args.static: - face_det_results = face_detect(frames) # BGR2RGB for CNN face detection - else: - face_det_results = face_detect([frames[0]]) - else: - print('Using the specified bounding box instead of face detection...') - y1, y2, x1, x2 = args.box - face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames] + if args.box[0] == -1: + if not args.static: + face_det_results = face_detect(frames) # BGR2RGB for CNN face detection + else: + face_det_results = face_detect([frames[0]]) + else: + print('Using the specified bounding box instead of face detection...') + y1, y2, x1, x2 = args.box + face_det_results = [[f[y1: y2, x1:x2], (y1, y2, x1, x2)] for f in frames] - for i, m in enumerate(mels): - idx = 0 if args.static else i%len(frames) - frame_to_save = frames[idx].copy() - face, coords = face_det_results[idx].copy() + for i, m in enumerate(mels): + idx = 0 if args.static else i%len(frames) + frame_to_save = frames[idx].copy() + face, coords = face_det_results[idx].copy() - face = cv2.resize(face, (args.img_size, args.img_size)) - - img_batch.append(face) - mel_batch.append(m) - frame_batch.append(frame_to_save) - coords_batch.append(coords) + face = cv2.resize(face, (args.img_size, args.img_size)) - if len(img_batch) >= args.wav2lip_batch_size: - img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) + img_batch.append(face) + mel_batch.append(m) + frame_batch.append(frame_to_save) + coords_batch.append(coords) - img_masked = img_batch.copy() - img_masked[:, args.img_size//2:] = 0 + if len(img_batch) >= args.wav2lip_batch_size: + img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) - img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. - mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) + img_masked = img_batch.copy() + img_masked[:, args.img_size//2:] = 0 - yield img_batch, mel_batch, frame_batch, coords_batch - img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] + img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. + mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) - if len(img_batch) > 0: - img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) + yield img_batch, mel_batch, frame_batch, coords_batch + img_batch, mel_batch, frame_batch, coords_batch = [], [], [], [] - img_masked = img_batch.copy() - img_masked[:, args.img_size//2:] = 0 + if len(img_batch) > 0: + img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch) - img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. 
- mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) + img_masked = img_batch.copy() + img_masked[:, args.img_size//2:] = 0 - yield img_batch, mel_batch, frame_batch, coords_batch + img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255. + mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1]) + + yield img_batch, mel_batch, frame_batch, coords_batch mel_step_size = 16 device = 'cuda' if torch.cuda.is_available() else 'cpu' print('Using {} for inference.'.format(device)) def _load(checkpoint_path): - if device == 'cuda': - checkpoint = torch.load(checkpoint_path) - else: - checkpoint = torch.load(checkpoint_path, - map_location=lambda storage, loc: storage) - return checkpoint + if device == 'cuda': + checkpoint = torch.load(checkpoint_path) + else: + checkpoint = torch.load(checkpoint_path, + map_location=lambda storage, loc: storage) + return checkpoint def load_model(path): - model = Wav2Lip() - print("Load checkpoint from: {}".format(path)) - checkpoint = _load(path) - s = checkpoint["state_dict"] - new_s = {} - for k, v in s.items(): - new_s[k.replace('module.', '')] = v - model.load_state_dict(new_s) - - model = model.to(device) - return model.eval() + model = Wav2Lip() + print("Load checkpoint from: {}".format(path)) + checkpoint = _load(path) + s = checkpoint["state_dict"] + new_s = {} + for k, v in s.items(): + new_s[k.replace('module.', '')] = v + model.load_state_dict(new_s) + + model = model.to(device) + return model.eval() def main(): - if not os.path.isfile(args.face): - raise ValueError('--face argument must be a valid path to video/image file') + args.img_size = 96 + + if os.path.isfile(args.face) and args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: + args.static = True + + if not os.path.isfile(args.face): + raise ValueError('--face argument must be a valid path to video/image file') + + elif args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: + full_frames = [cv2.imread(args.face)] + fps = args.fps + + else: + video_stream = cv2.VideoCapture(args.face) + fps = video_stream.get(cv2.CAP_PROP_FPS) + + print('Reading video frames...') + + full_frames = [] + while 1: + still_reading, frame = video_stream.read() + if not still_reading: + video_stream.release() + break + + aspect_ratio = frame.shape[1] / frame.shape[0] + frame = cv2.resize(frame, (int(args.out_height * aspect_ratio), args.out_height)) + # if args.resize_factor > 1: + # frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor)) + + if args.rotate: + frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE) + + y1, y2, x1, x2 = args.crop + if x2 == -1: x2 = frame.shape[1] + if y2 == -1: y2 = frame.shape[0] + + frame = frame[y1:y2, x1:x2] + + full_frames.append(frame) + + print ("Number of frames available for inference: "+str(len(full_frames))) + + if not args.audio.endswith('.wav'): + print('Extracting raw audio...') + # command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav') + # subprocess.call(command, shell=True) + subprocess.check_call([ + "ffmpeg", "-y", + "-i", args.audio, + "temp/temp.wav", + ]) + args.audio = 'temp/temp.wav' - elif args.face.split('.')[1] in ['jpg', 'png', 'jpeg']: - full_frames = [cv2.imread(args.face)] - fps = args.fps + wav = audio.load_wav(args.audio, 16000) + mel = audio.melspectrogram(wav) + print(mel.shape) - else: - video_stream = cv2.VideoCapture(args.face) - fps = video_stream.get(cv2.CAP_PROP_FPS) + if 
np.isnan(mel.reshape(-1)).sum() > 0: + raise ValueError('Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') - print('Reading video frames...') + mel_chunks = [] + mel_idx_multiplier = 80./fps + i = 0 + while 1: + start_idx = int(i * mel_idx_multiplier) + if start_idx + mel_step_size > len(mel[0]): + mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:]) + break + mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size]) + i += 1 - full_frames = [] - while 1: - still_reading, frame = video_stream.read() - if not still_reading: - video_stream.release() - break - if args.resize_factor > 1: - frame = cv2.resize(frame, (frame.shape[1]//args.resize_factor, frame.shape[0]//args.resize_factor)) + print("Length of mel chunks: {}".format(len(mel_chunks))) - if args.rotate: - frame = cv2.rotate(frame, cv2.cv2.ROTATE_90_CLOCKWISE) + full_frames = full_frames[:len(mel_chunks)] - y1, y2, x1, x2 = args.crop - if x2 == -1: x2 = frame.shape[1] - if y2 == -1: y2 = frame.shape[0] + batch_size = args.wav2lip_batch_size + gen = datagen(full_frames.copy(), mel_chunks) - frame = frame[y1:y2, x1:x2] + s = time() - full_frames.append(frame) + for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, + total=int(np.ceil(float(len(mel_chunks))/batch_size)))): + if i == 0: + frame_h, frame_w = full_frames[0].shape[:-1] + out = cv2.VideoWriter('temp/result.avi', + cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h)) - print ("Number of frames available for inference: "+str(len(full_frames))) + img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device) + mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device) - if not args.audio.endswith('.wav'): - print('Extracting raw audio...') - command = 'ffmpeg -y -i {} -strict -2 {}'.format(args.audio, 'temp/temp.wav') + with torch.no_grad(): + pred = model(mel_batch, img_batch) - subprocess.call(command, shell=True) - args.audio = 'temp/temp.wav' + pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255. - wav = audio.load_wav(args.audio, 16000) - mel = audio.melspectrogram(wav) - print(mel.shape) + for p, f, c in zip(pred, frames, coords): + y1, y2, x1, x2 = c + p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) - if np.isnan(mel.reshape(-1)).sum() > 0: - raise ValueError('Mel contains nan! Using a TTS voice? 
Add a small epsilon noise to the wav file and try again') + f[y1:y2, x1:x2] = p + out.write(f) - mel_chunks = [] - mel_idx_multiplier = 80./fps - i = 0 - while 1: - start_idx = int(i * mel_idx_multiplier) - if start_idx + mel_step_size > len(mel[0]): - mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:]) - break - mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size]) - i += 1 + out.release() - print("Length of mel chunks: {}".format(len(mel_chunks))) + print("wav2lip prediction time:", time() - s) - full_frames = full_frames[:len(mel_chunks)] + subprocess.check_call([ + "ffmpeg", "-y", + # "-vsync", "0", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda", + "-i", "temp/result.avi", + "-i", args.audio, + # "-c:v", "h264_nvenc", + args.outfile, + ]) - batch_size = args.wav2lip_batch_size - gen = datagen(full_frames.copy(), mel_chunks) +model = detector = detector_model = None - for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(gen, - total=int(np.ceil(float(len(mel_chunks))/batch_size)))): - if i == 0: - model = load_model(args.checkpoint_path) - print ("Model loaded") +def do_load(checkpoint_path): + global model, detector, detector_model - frame_h, frame_w = full_frames[0].shape[:-1] - out = cv2.VideoWriter('temp/result.avi', - cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h)) + model = load_model(checkpoint_path) - img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device) - mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device) + # SFDDetector.load_model(device) + # detector = RetinaFace(gpu_id=0, model_path="checkpoints/mobilenet.pth", network="mobilenet") + # detector = RetinaFace(gpu_id=0, model_path="checkpoints/resnet50.pth", network="resnet50") + if torch.cuda.is_available(): + detector = RetinaFace(gpu_id=0, model_path="checkpoints/mobilenet.pth", network="mobilenet") + else: + detector = RetinaFace( model_path="checkpoints/mobilenet.pth", network="mobilenet") + + detector_model = detector.model - with torch.no_grad(): - pred = model(mel_batch, img_batch) + print("Models loaded") - pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255. 
- - for p, f, c in zip(pred, frames, coords): - y1, y2, x1, x2 = c - p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1)) - f[y1:y2, x1:x2] = p - out.write(f) +face_batch_size = 64 * 8 - out.release() +def face_rect(images): + num_batches = math.ceil(len(images) / face_batch_size) + prev_ret = None + for i in range(num_batches): + batch = images[i * face_batch_size: (i + 1) * face_batch_size] + all_faces = detector(batch) # return faces list of all images + for faces in all_faces: + if faces: + box, landmarks, score = faces[0] + prev_ret = tuple(map(int, box)) + yield prev_ret - command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(args.audio, 'temp/result.avi', args.outfile) - subprocess.call(command, shell=platform.system() != 'Windows') if __name__ == '__main__': - main() + args = parser.parse_args() + do_load(args.checkpoint_path) + main() diff --git a/predict.py b/predict.py new file mode 100644 index 000000000..7fbc7eba6 --- /dev/null +++ b/predict.py @@ -0,0 +1,144 @@ +# Prediction interface for Cog ⚙️ +# https://github.com/replicate/cog/blob/main/docs/python.md +import os +import subprocess + +from cog import BasePredictor, Input, Path + +import inference + +from time import time + +from functools import wraps +import torch + + +def make_mem_efficient(cls: BasePredictor): + if not torch.cuda.is_available(): + return cls + + old_setup = cls.setup + old_predict = cls.predict + + @wraps(old_setup) + def new_setup(self, *args, **kwargs): + ret = old_setup(self, *args, **kwargs) + _move_to(self, "cpu") + return ret + + @wraps(old_predict) + def new_predict(self, *args, **kwargs): + _move_to(self, "cuda") + try: + ret = old_predict(self, *args, **kwargs) + finally: + _move_to(self, "cpu") + return ret + + cls.setup = new_setup + cls.predict = new_predict + + return cls + + +def _move_to(self, device): + try: + self = self.cached_models + except AttributeError: + pass + for attr, value in vars(self).items(): + try: + value = value.to(device) + except AttributeError: + pass + else: + print(f"Moving {self.__name__}.{attr} to {device}") + setattr(self, attr, value) + torch.cuda.empty_cache() + + +@make_mem_efficient +class Predictor(BasePredictor): + cached_models = inference + + def setup(self): + inference.do_load("checkpoints/wav2lip_gan.pth") + + def predict( + self, + face: Path = Input(description="video/image that contains faces to use"), + audio: Path = Input(description="video/audio file to use as raw audio source"), + pads: str = Input( + description="Padding for the detected face bounding box.\n" + "Please adjust to include chin at least\n" + 'Format: "top bottom left right"', + default="0 10 0 0", + ), + smooth: bool = Input( + description="Smooth face detections over a short temporal window", + default=True, + ), + fps: float = Input( + description="Can be specified only if input is a static image", + default=25.0, + ), + out_height: int = Input( + description="Output video height. 
Best results are obtained at 480 or 720", + default=480, + ), + ) -> Path: + try: + os.remove("results/result_voice.mp4") + except FileNotFoundError: + pass + + face_ext = os.path.splitext(face)[-1] + if face_ext not in [".mp4", ".mov", ".png" , ".jpg" , ".jpeg" , ".gif", ".mkv", ".webp"]: + raise ValueError(f'Unsupported face format {face_ext!r}') + + audio_ext = os.path.splitext(audio)[-1] + if audio_ext not in [".wav", ".mp3"]: + raise ValueError(f'Unsupported audio format {audio_ext!r}') + + args = [ + "--checkpoint_path", "checkpoints/wav2lip_gan.pth", + "--face", str(face), + "--audio", str(audio), + "--pads", *pads.split(" "), + "--fps", str(fps), + "--out_height", str(out_height), + ] + if not smooth: + args += ["--nosmooth"] + + print("-> run:", " ".join(args)) + inference.args = inference.parser.parse_args(args) + + s = time() + + try: + inference.main() + except ValueError as e: + print('-> Encountered error, skipping lipsync:', e) + + args = [ + "ffmpeg", "-y", + # "-vsync", "0", "-hwaccel", "cuda", "-hwaccel_output_format", "cuda", + "-stream_loop", "-1", + "-i", str(face), + "-i", str(audio), + "-shortest", + "-fflags", "+shortest", + "-max_interleave_delta", "100M", + "-map", "0:v:0", + "-map", "1:a:0", + # "-c", "copy", + # "-c:v", "h264_nvenc", + "results/result_voice.mp4", + ] + print("-> run:", " ".join(args)) + print(subprocess.check_output(args, encoding="utf-8")) + + print(time() - s) + + return Path("results/result_voice.mp4") diff --git a/requirements.txt b/requirements.txt index bfd428ab9..b16f1dabf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,73 @@ -librosa==0.7.0 -numpy==1.17.1 -opencv-contrib-python>=4.2.0.34 -opencv-python==4.1.0.25 -torch==1.1.0 -torchvision==0.3.0 -tqdm==4.45.0 -numba==0.48 +absl-py==2.1.0 +attrs==24.2.0 +audioread==3.0.1 +batch-face==1.5.0.dev0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.4.0 +colorama==0.4.6 +contourpy==1.3.1 +cycler==0.12.1 +decorator==5.1.1 +ffmpeg-python==0.2.0 +filelock==3.16.1 +flatbuffers==24.3.25 +fonttools==4.55.2 +fsspec==2024.10.0 +future==1.0.0 +ghc==1.0 +idna==3.10 +intel-openmp==2021.4.0 +jax==0.4.36 +jaxlib==0.4.36 +Jinja2==3.1.4 +joblib==1.4.2 +kiwisolver==1.4.7 +lazy_loader==0.4 +librosa==0.10.2.post1 +llvmlite==0.43.0 +MarkupSafe==3.0.2 +matplotlib==3.9.3 +mediapipe==0.10.18 +mkl==2021.4.0 +ml_dtypes==0.5.0 +mpmath==1.3.0 +msgpack==1.1.0 +networkx==3.4.2 +numba==0.60.0 +numpy==1.26.4 +opencv-contrib-python==4.10.0.84 +opencv-python==4.10.0.84 +opencv-transforms==0.0.6 +opt_einsum==3.4.0 +packaging==24.2 +pandas==2.2.3 +pillow==11.0.0 +pip==24.2 +platformdirs==4.3.6 +pooch==1.8.2 +protobuf==4.25.5 +pycparser==2.22 +pyparsing==3.2.0 +python-dateutil==2.9.0.post0 +pytz==2024.2 +requests==2.32.3 +scikit-learn==1.5.2 +scipy==1.14.1 +sentencepiece==0.2.0 +setuptools==75.1.0 +six==1.17.0 +sixdrepnet==0.1.6 +sounddevice==0.5.1 +soundfile==0.12.1 +soxr==0.5.0.post1 +sympy==1.13.3 +tbb==2021.13.1 +threadpoolctl==3.5.0 +torch==2.3.0+cu118 +torchvision==0.18.0+cu118 +tqdm==4.67.1 +typing_extensions==4.12.2 +tzdata==2024.2 +urllib3==2.2.3 +wheel==0.44.0 diff --git a/requirementsCPU.txt b/requirementsCPU.txt new file mode 100644 index 000000000..ac7cef623 --- /dev/null +++ b/requirementsCPU.txt @@ -0,0 +1,13 @@ +librosa +numpy +opencv-contrib-python +opencv-python +-f https://download.pytorch.org/whl/torch_stable.html +torch +torchvision +tqdm +numba +mediapipe +https://raw.githubusercontent.com/AwaleSajil/ghc/master/ghc-1.0-py3-none-any.whl 
+git+https://github.com/elliottzheng/batch-face.git@master
+ffmpeg-python
diff --git a/requirements_colab.txt b/requirements_colab.txt
new file mode 100644
index 000000000..c5f75e7b4
--- /dev/null
+++ b/requirements_colab.txt
@@ -0,0 +1,7 @@
+numpy==1.23.4
+librosa
+opencv-python
+torch
+torchvision
+tqdm
+numba
diff --git a/scripts/download_models.sh b/scripts/download_models.sh
new file mode 100644
index 000000000..93049e873
--- /dev/null
+++ b/scripts/download_models.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+set -ex
+
+wget -c -O checkpoints/wav2lip_gan.pth 'https://iiitaphyd-my.sharepoint.com/personal/radrabha_m_research_iiit_ac_in/_layouts/15/download.aspx?share=EdjI7bZlgApMqsVoEUUXpLsBxqXbn5z8VTmoxp55YNDcIA'
+wget -c -O checkpoints/mobilenet.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/mobilenet0.25_Final.pth'
+wget -c -O checkpoints/resnet50.pth 'https://github.com/elliottzheng/face-detection/releases/download/0.0.1/Resnet50_Final.pth'
diff --git a/scripts/run-dev.sh b/scripts/run-dev.sh
new file mode 100644
index 000000000..becde83e1
--- /dev/null
+++ b/scripts/run-dev.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+
+NAME=wav2lip-dev
+
+set -ex
+
+docker build . -t $NAME
+docker run -it --rm \
+    --name $NAME \
+    -v $PWD/checkpoints:/src/checkpoints \
+    -p 6001:5000 \
+    --gpus all \
+    $NAME
diff --git a/scripts/run-prod.sh b/scripts/run-prod.sh
new file mode 100644
index 000000000..08f378a48
--- /dev/null
+++ b/scripts/run-prod.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+NAME=wav2lip
+
+set -x
+
+docker rm -f $NAME
+
+docker build . -t $NAME
+docker run -d --restart always \
+    --name $NAME \
+    -v $PWD/checkpoints:/src/checkpoints \
+    -p 5001:5000 \
+    --gpus all \
+    $NAME
+
+docker logs -f $NAME
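
Usage note (editor's sketch, not part of the diff): a minimal example of exercising the changed pipeline locally, assuming the weights have been fetched by scripts/download_models.sh above. The input filenames are placeholders; every flag used here appears in the new inference.py.

# download wav2lip_gan.pth plus the RetinaFace weights into checkpoints/
bash scripts/download_models.sh

# run lip-sync directly; do_load() in inference.py loads both the Wav2Lip
# checkpoint and the mobilenet RetinaFace detector before main() runs.
# --out_height controls the working resolution (the --resize_factor resize
# path is commented out in this diff).
python inference.py \
    --checkpoint_path checkpoints/wav2lip_gan.pth \
    --face input_video.mp4 \
    --audio input_audio.wav \
    --pads 0 20 0 0 \
    --out_height 480 \
    --outfile results/result_voice.mp4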