From 3d419a1cfb1be8d86c95c261eb7e37bf6543094e Mon Sep 17 00:00:00 2001
From: Simon Sardorf <ssardorf@gmail.com>
Date: Wed, 18 Dec 2024 16:05:57 +0100
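Subject: [PATCH] Enable DeepSpeed on Apple Silicon

Stop forcing args.use_deepspeed to False when torch.backends.mps is
available in do_tts.py, read.py, read_fast.py and tts_stream.py, so the
parsed value is passed through to TextToSpeech unchanged.

In setup.py, replace the commented-out 'deepspeed==0.8.3' entry with an
unpinned 'deepspeed' requirement, add 'py-cpuinfo', and drop the version
pin on 'tokenizers'. In do_tts.py, declare --use_deepspeed with
type=bool instead of type=str.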

---
 setup.py               | 7 ++++---
 tortoise/do_tts.py     | 4 +---
 tortoise/read.py       | 2 --
 tortoise/read_fast.py  | 2 --
 tortoise/tts_stream.py | 2 --
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/setup.py b/setup.py
index 807670a4..890b58b5 100644
--- a/setup.py
+++ b/setup.py
@@ -28,9 +28,10 @@
         'scipy',
         'librosa',
         'transformers==4.31.0',
-        'tokenizers==0.14.0',
-        'scipy==1.13.1'
-        # 'deepspeed==0.8.3',
+        'tokenizers',
+        'scipy==1.13.1',
+        'deepspeed',
+        'py-cpuinfo'
     ],
     classifiers=[
         "Programming Language :: Python :: 3",
diff --git a/tortoise/do_tts.py b/tortoise/do_tts.py
index c6e2b17d..00ed2109 100644
--- a/tortoise/do_tts.py
+++ b/tortoise/do_tts.py
@@ -13,7 +13,7 @@
     parser.add_argument('--voice', type=str, help='Selects the voice to use for generation. See options in voices/ directory (and add your own!) '
                                                  'Use the & character to join two voices together. Use a comma to perform inference on multiple voices.', default='random')
     parser.add_argument('--preset', type=str, help='Which voice preset to use.', default='fast')
-    parser.add_argument('--use_deepspeed', type=str, help='Use deepspeed for speed bump.', default=False)
+    parser.add_argument('--use_deepspeed', type=bool, help='Use deepspeed for speed bump.', default=False)
     parser.add_argument('--kv_cache', type=bool, help='If you disable this please wait for a long a time to get the output', default=True)
     parser.add_argument('--half', type=bool, help="float16(half) precision inference if True it's faster and take less vram and ram", default=True)
     parser.add_argument('--output_path', type=str, help='Where to store outputs.', default='results/')
@@ -25,8 +25,6 @@
     parser.add_argument('--cvvp_amount', type=float, help='How much the CVVP model should influence the output.'
                                                           'Increasing this can in some cases reduce the likelihood of multiple speakers. Defaults to 0 (disabled)', default=.0)
     args = parser.parse_args()
-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     os.makedirs(args.output_path, exist_ok=True)
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
diff --git a/tortoise/read.py b/tortoise/read.py
index e5839aa8..ab31bae1 100644
--- a/tortoise/read.py
+++ b/tortoise/read.py
@@ -30,8 +30,6 @@
 
 
     args = parser.parse_args()
-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
     outpath = args.output_path
diff --git a/tortoise/read_fast.py b/tortoise/read_fast.py
index f2778d4a..8a23e65f 100644
--- a/tortoise/read_fast.py
+++ b/tortoise/read_fast.py
@@ -28,8 +28,6 @@
 
 
     args = parser.parse_args()
-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
     outpath = args.output_path
diff --git a/tortoise/tts_stream.py b/tortoise/tts_stream.py
index 94eaff57..f3266920 100644
--- a/tortoise/tts_stream.py
+++ b/tortoise/tts_stream.py
@@ -37,8 +37,6 @@ def play_audio(audio_queue):
 
 
     args = parser.parse_args()
-    if torch.backends.mps.is_available():
-        args.use_deepspeed = False
     tts = TextToSpeech(models_dir=args.model_dir, use_deepspeed=args.use_deepspeed, kv_cache=args.kv_cache, half=args.half)
 
     outpath = args.output_path