diff --git a/unsloth/models/llama.py b/unsloth/models/llama.py
index 1bffb0cb..da30d839 100644
--- a/unsloth/models/llama.py
+++ b/unsloth/models/llama.py
@@ -1684,11 +1684,11 @@ def from_pretrained(
             else:
                 inner_training_loop = Trainer._original_training_loop
         except:
-            raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+            raise RuntimeError('llama.py:1687 Unsloth currently does not support multi GPU setups - but we are working on it!')
         pass
 
         if ((post_check - pre_check) >= 1).sum() > 1:
-            raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+            raise RuntimeError('llama.py:1691 Unsloth currently does not support multi GPU setups - but we are working on it!')
 
         import transformers.trainer
         items_in_trainer = dir(transformers.trainer)
@@ -1715,17 +1715,23 @@ def from_pretrained(
         f"{chr(92)}        /    Total batch size = {total_train_batch_size:,} | Total steps = {max_steps:,}\\n"\\
         f' "-____-"     Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}'
         logger.warning(debug_info)
-        import subprocess, re, gc, numpy as np
+        import subprocess, os, re, gc, numpy as np
+        index_for_cuda = os.environ.get("CUDA_VISIBLE_DEVICES", "-1")
+        if "," in index_for_cuda:
+            raise RuntimeError("llama.py:1681 Unsloth currently does not support multi GPU setups - but we are working on it!")
+        index_for_cuda = int(index_for_cuda)
         a = np.array([0,])
         try:
             a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)
             a = re.findall(rb'([\\d]{1,})[\\s]{1,}M', a)
             a = np.array([int(x.decode('utf-8'))/1024 for x in a])
+            if index_for_cuda != -1:
+                a = np.array([a[index_for_cuda],])
         except:
             if not torch.cuda.is_available():
                 raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')
         if ((a - PRE_CHECK) >= 1).sum() > 1:
-            raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+            raise RuntimeError('llama.py:1694 Unsloth currently does not support multi GPU setups - but we are working on it!')
         for _ in range(3):
             gc.collect()
             torch.cuda.empty_cache()"""
@@ -1786,7 +1792,7 @@ def from_pretrained(
             "False",
         )
         if "n_total_devices >" not in inner_training_loop:
-            raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')
+            raise RuntimeError('llama.py:1795 Unsloth currently does not support multi GPU setups - but we are working on it!')
         pass
         inner_training_loop = inner_training_loop.replace(
             "is_sagemaker_mp_enabled()",
diff --git a/unsloth/tokenizer_utils.py b/unsloth/tokenizer_utils.py
index 302017d5..ff78d4b9 100644
--- a/unsloth/tokenizer_utils.py
+++ b/unsloth/tokenizer_utils.py
@@ -830,12 +830,18 @@ def check_tokenizer(
 
 def check_nvidia():
+    index_for_cuda = os.environ.get("CUDA_VISIBLE_DEVICES", "-1")
+    if "," in index_for_cuda:
+        raise RuntimeError("Unsloth currently does not support multi GPU setups - but we are working on it!")
+    index_for_cuda = int(index_for_cuda)
     # Unsloth doesn't work yet on AMD devices - we're working on it!
     output = np.array([0,])
     try:
         output = subprocess.check_output("nvidia-smi --query-gpu=memory.used --format=csv", shell = True)
         output = re.findall(rb'([\d]{1,})[\s]{1,}M', output)
         output = np.array([int(x.decode('utf-8'))/1024 for x in output])
+        if index_for_cuda != -1:
+            output = np.array([output[index_for_cuda],])
     except:
         if not torch.cuda.is_available():
             raise RuntimeError("Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!")
     return output
@@ -958,7 +964,11 @@ def patch_sft_trainer_tokenizer():
 
     check_text = \
     "\n"\
-    "import subprocess, re, gc, numpy as np\n"\
+    "import subprocess, os, re, gc, numpy as np\n"\
+    "index_for_cuda = os.environ.get(\"CUDA_VISIBLE_DEVICES\", \"-1\")\n"\
+    "if \",\" in index_for_cuda:\n"\
+    "    raise RuntimeError(\"tokenizer_utils.py:970 Unsloth currently does not support multi GPU setups - but we are working on it!\")\n"\
+    "index_for_cuda = int(index_for_cuda)\n"\
     "a = np.array([0,])\n"\
     "try:\n"\
     "    a = subprocess.check_output('nvidia-smi --query-gpu=memory.used --format=csv', shell = True)\n"\
@@ -968,7 +978,7 @@ def patch_sft_trainer_tokenizer():
     "    if not torch.cuda.is_available():\n"\
     "        raise RuntimeError('Unsloth: We do not support AMD / Intel machines yet - it is a work in progress!')\n"\
     "if ((a - PRE_CHECK) >= 1).sum() > 1:\n"\
-    "    raise RuntimeError('Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
+    "    raise RuntimeError('tokenizer_utils.py:981 Unsloth currently does not support multi GPU setups - but we are working on it!')\n"\
     "for _ in range(3):\n"\
     "    gc.collect()\n"\
     "    torch.cuda.empty_cache()\n"\
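
For reference, the single-GPU check that this patch adds to both files boils down to the standalone sketch below. The helper name get_visible_gpu_memory is illustrative only, not part of Unsloth's API. The idea: treat an unset CUDA_VISIBLE_DEVICES as "all GPUs visible", reject comma-separated device lists outright, and otherwise narrow the nvidia-smi memory reading to the one selected device so that memory activity on other GPUs no longer trips the multi-GPU error.

import os, re, subprocess
import numpy as np

def get_visible_gpu_memory():
    # Illustrative helper, not part of Unsloth's API.
    # Unset CUDA_VISIBLE_DEVICES -> "-1" sentinel, meaning "consider all GPUs".
    index_for_cuda = os.environ.get("CUDA_VISIBLE_DEVICES", "-1")
    if "," in index_for_cuda:
        # Several visible devices: the same condition the patch rejects.
        raise RuntimeError("Unsloth currently does not support multi GPU setups - but we are working on it!")
    # Assumes a numeric device index (not a GPU UUID).
    index_for_cuda = int(index_for_cuda)

    # nvidia-smi reports used memory for every physical GPU, one row per device,
    # e.g. b"memory.used [MiB]\n511 MiB\n22 MiB\n".
    raw = subprocess.check_output(
        "nvidia-smi --query-gpu=memory.used --format=csv", shell = True
    )
    used_mib = re.findall(rb'([\d]{1,})[\s]{1,}M', raw)
    used_gib = np.array([int(x.decode('utf-8'))/1024 for x in used_mib])

    # Keep only the selected device so that other processes' GPUs
    # do not show up as "extra" memory growth.
    if index_for_cuda != -1:
        used_gib = np.array([used_gib[index_for_cuda],])
    return used_gib

Note that nvidia-smi always reports every physical GPU regardless of CUDA_VISIBLE_DEVICES, which is why the explicit index selection at the end is needed at all.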