ManimCommunity · hdeep03 · Jan 8, 2024 · Feb 25, 2024
@@ -38,6 +38,10 @@ Speech services
    :members:
    :show-inheritance:
 
+.. automodule:: manim_voiceover.services.openaitts
+   :members:
+   :show-inheritance:
+
 .. automodule:: manim_voiceover.services.pyttsx3
    :members:
    :show-inheritance:

@@ -47,6 +47,11 @@ Manim Voiceover defines the :py:class:`~~base.SpeechService` class for adding ne
      - No
      - No
      - It's a free API subsidized by Google, so there is a likelihood it may stop working in the future.
+   * - :py:class:`~openaitts.OpenAIService`
+     - Very good, human-like
+     - No
+     - Yes
+     - Requires an OpenAI developer account. See the `platform <https://platform.openai.com/signup>`__ to sign up, and the `pricing page <https://openai.com/pricing#:~:text=%24-,0.030,-/%201K%20characters>`__ for more details.
    * - :py:class:`~pyttsx3.PyTTSX3Service`
      - Bad
      - Yes
@@ -136,6 +141,33 @@ Install Manim Voiceover with the ``gtts`` extra in order to use :py:class:`~gtts
 
 Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/gtts-example.py>`__ to get started.
 
+:py:class:`~openaitts.OpenAIService`
+*************************************
+`OpenAI <https://platform.openai.com/docs/api-reference/audio/createSpeech/>`__ provides a text-to-speech service.
+Since it is accessed through an API request, it requires an internet connection and an API key. Register for a key `here <https://platform.openai.com/>`__.
+
+Install Manim Voiceover with the ``openai`` extra in order to use :py:class:`~openaitts.OpenAIService`:
+
+.. code:: sh
+
+   pip install "manim-voiceover[openai]"
+
+Then, you need to obtain your API key:
+
+- Sign in to the `OpenAI platform <https://platform.openai.com/>`__ and open the API keys page from the left panel.
+- Create a new secret key and copy it.
+
+Create a file called ``.env`` that contains your authentication
+information in the same directory where you call Manim.
+
+.. code:: sh
+
+   OPENAI_API_KEY="..." # insert the secret key here. It should start with "sk-"
+
+Check out `OpenAI docs <https://platform.openai.com/docs/guides/text-to-speech/>`__ for more details.
+
+Refer to the `example usage <https://github.com/ManimCommunity/manim-voiceover/blob/main/examples/openaitts-example.py>`__ to get started.
+
 :py:class:`~pyttsx3.PyTTSX3Service`
 ***********************************
 

@@ -0,0 +1,30 @@
+from manim import *
+from manim_voiceover import VoiceoverScene
+from manim_voiceover.services.openaitts import OpenAIService
+
+
+class OpenAIExample(VoiceoverScene):
+    def construct(self):
+        self.set_speech_service(
+            OpenAIService(
+                voice="fable",
+                model="tts-1-hd",
+            )
+        )
+
+        circle = Circle()
+        square = Square().shift(2 * RIGHT)
+
+        with self.voiceover(text="This circle is drawn as I speak.") as tracker:
+            self.play(Create(circle), run_time=tracker.duration)
+
+        with self.voiceover(text="Let's shift it to the left 2 units.") as tracker:
+            self.play(circle.animate.shift(2 * LEFT), run_time=tracker.duration)
+
+        with self.voiceover(text="Now, let's transform it into a square.") as tracker:
+            self.play(Transform(circle, square), run_time=tracker.duration)
+
+        with self.voiceover(text="Thank you for watching.", speed=0.75): # You can also change the audio speed by specifying the speed argument.
+            self.play(Uncreate(circle))
+
+        self.wait()
@@ -0,0 +1,92 @@
+import os
+import sys
+from pathlib import Path
+from manim import logger
+from dotenv import load_dotenv, find_dotenv
+
+from manim_voiceover.helper import create_dotenv_file, prompt_ask_missing_extras
+
+# ``openai`` is an optional extra: when it is not installed, log an
+# installation hint instead of letting the ImportError propagate.
+# NOTE(review): in that case the name ``openai`` stays undefined in this module.
+try:
+    import openai
+except ImportError:
+    logger.error(
+        'Missing packages. Run `pip install "manim-voiceover[openai]"` to use OpenAIService.'
+    )
+
+from manim_voiceover.services.base import SpeechService
+
+# Load variables such as OPENAI_API_KEY from a .env file located by
+# find_dotenv(usecwd=True) -- presumably searched for starting at the current
+# working directory; confirm against the python-dotenv documentation.
+load_dotenv(find_dotenv(usecwd=True))
+
+
+def create_dotenv_openai():
+    logger.info(
+        "Check out https://voiceover.manim.community/en/stable/services.html to learn how to create an account and get your subscription key."
+    )
+    if not create_dotenv_file(["OPENAI_API_KEY"]):
+        raise ValueError(
+            "The environment variable OPENAI_API_KEY is not set. Please set it or create a .env file with the variables."
+        )
+    logger.info("The .env file has been created. Please run Manim again.")
+    sys.exit()
+
+
+class OpenAIService(SpeechService):
+    """Speech service class for OpenAI TTS Service. See the `OpenAI API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__ for more information about voices and models."""
+
+    def __init__(self, voice: str = "alloy", model: str = "tts-1-hd", **kwargs):
+        """
+        Args:
+            voice (str, optional): The voice to use. See the `API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__ for all the available options. Defaults to ``"alloy"``.
+            model (str, optional): The TTS model to use. See the `API page <https://platform.openai.com/docs/api-reference/audio/createSpeech>`__ for all the available options. Defaults to ``"tts-1-hd"``.
+        """
+        prompt_ask_missing_extras("openai", "openai", "OpenAIService")
+        self.voice = voice
+        self.model = model
+
+        SpeechService.__init__(self, **kwargs)
+
+    def generate_from_text(
+        self, text: str, cache_dir: str = None, path: str = None, **kwargs
+    ) -> dict:
+        """"""
+        if cache_dir is None:
+            cache_dir = self.cache_dir
+
+        speed = kwargs.get("speed", 1.0)
+
+        if not (0.25 <= speed <= 4.0):
+            raise ValueError("The speed must be between 0.25 and 4.0.")
+
+        input_data = {
+            "input_text": text,
+            "service": "openai",
+            "config": {"voice": self.voice, "model": self.model, "speed": speed},
+        }
+
+        cached_result = self.get_cached_result(input_data, cache_dir)
+        if cached_result is not None:
+            return cached_result
+
+        if path is None:
+            audio_path = self.get_audio_basename(input_data) + ".mp3"
+        else:
+            audio_path = path
+
+        if os.getenv("OPENAI_API_KEY") is None:
+            create_dotenv_openai()
+
+        response = openai.audio.speech.create(
+            model=self.model,
+            voice=self.voice,
+            input=text,
+            speed=speed,
+        )
+        response.stream_to_file(str(Path(cache_dir) / audio_path))
+
+        json_dict = {
+            "input_text": text,
+            "input_data": input_data,
+            "original_audio": audio_path,
+        }
+
+        return json_dict
@@ -51,6 +51,7 @@ azure-cognitiveservices-speech = { version = "^1.24.0", optional = true }
 PyAudio = { version = "^0.2.12", optional = true }
 gTTS = { version = "^2.2.4", optional = true }
 pyttsx3 = { version = "^2.90", optional = true }
+openai = { version = "^1.6.1", optional = true }
 # torch = { version = "*", optional = true }
 # TTS = { version = "*", optional = true }
 pynput = { version = "^1.7.6", optional = true }
@@ -65,6 +66,7 @@ elevenlabs = {version = "^0.2.27", optional = true}
 [tool.poetry.extras]
 azure = ["azure-cognitiveservices-speech"]
 gtts = ["gTTS"]
+openai = ["openai"]
 pyttsx3 = ["pyttsx3"]
 # coqui = ["torch", "TTS"]
 coqui = [] # Removed TTS as deps for now
@@ -80,6 +82,7 @@ all = [
     "TTS",
     "PyAudio",
     "pynput",
+    "openai",
     "deepl",
     "openai-whisper",
     "stable-ts",