@@ -48,15 +48,16 @@ class ChatModel:
     human_id = "<human>"
     bot_id = "<bot>"
 
-    def __init__(self, model_name, gpu_id, max_memory):
+    def __init__(self, model_name, gpu_id, max_memory, load_in_8bit):
         device = torch.device('cuda', gpu_id)  # TODO: allow sending to cpu
 
         # recommended default for devices with > 40 GB VRAM
         # load model onto one device
         if max_memory is None:
             self._model = AutoModelForCausalLM.from_pretrained(
-                model_name, torch_dtype=torch.float16, device_map="auto")
-            self._model.to(device)
+                model_name, torch_dtype=torch.float16, device_map="auto", load_in_8bit=load_in_8bit)
+            if not load_in_8bit:
+                self._model.to(device)  # not supported by load_in_8bit
         # load the model with the given max_memory config (for devices with insufficient VRAM or multi-gpu)
         else:
             config = AutoConfig.from_pretrained(model_name)
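For context, a minimal standalone sketch of the 8-bit path this patch enables (the checkpoint name is illustrative; assumes `pip install bitsandbytes accelerate` and a transformers version that, like the patch, still accepts `load_in_8bit` directly as a `from_pretrained` kwarg):

import torch
from transformers import AutoModelForCausalLM

# Illustrative checkpoint; any causal LM the bot supports would do.
model_name = "togethercomputer/GPT-NeoXT-Chat-Base-20B"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",   # accelerate dispatches weights across devices at load time
    load_in_8bit=True,   # bitsandbytes quantizes linear layers to int8
)

# device_map has already placed the weights; calling .to(device) on an
# 8-bit model raises an error, hence the `if not load_in_8bit:` guard above.
print(f"footprint: {model.get_memory_footprint() / 1e9:.1f} GB")

Roughly halving the fp16 footprint is the point of the flag: int8 weights take one byte per parameter instead of two.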
@@ -110,7 +111,7 @@ class OpenChatKitShell(cmd.Cmd):
     intro = "Welcome to OpenChatKit shell. Type /help or /? to list commands.\n"
     prompt = ">>> "
 
-    def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, temperature, top_k, retrieval, max_memory, do_stream):
+    def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, temperature, top_k, retrieval, max_memory, do_stream, load_in_8bit):
         super().__init__()
         self._gpu_id = int(gpu_id)
         self._model_name_or_path = model_name_or_path
@@ -121,10 +122,11 @@ def __init__(self, gpu_id, model_name_or_path, max_tokens, sample, temperature,
         self._retrieval = retrieval
         self._max_memory = max_memory
         self._do_stream = do_stream
+        self._load_in_8bit = load_in_8bit
 
     def preloop(self):
         print(f"Loading {self._model_name_or_path} to cuda:{self._gpu_id}...")
-        self._model = ChatModel(self._model_name_or_path, self._gpu_id, self._max_memory)
+        self._model = ChatModel(self._model_name_or_path, self._gpu_id, self._max_memory, self._load_in_8bit)
 
         if self._retrieval:
             print(f"Loading retrieval index...")
@@ -253,6 +255,13 @@ def main():
         help='max CPU RAM to allocate',
         required=False
     )
+    # `pip install bitsandbytes` to use. No effect when used with -g or -r.
+    parser.add_argument(
+        '--load-in-8bit',
+        default=False,
+        action='store_true',
+        help='indicates whether to load model in 8 bit'
+    )
     args = parser.parse_args()
 
     # set max_memory dictionary if given
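The flag follows standard argparse semantics: `action='store_true'` makes it default to False and flip to True when present, and argparse converts the dashes so it surfaces as `args.load_in_8bit`. A self-contained check, mirroring the patch's definition:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    '--load-in-8bit',
    default=False,
    action='store_true',
    help='indicates whether to load model in 8 bit'
)

print(parser.parse_args([]).load_in_8bit)                  # False
print(parser.parse_args(['--load-in-8bit']).load_in_8bit)  # True

The bot would then be launched with the new switch appended to its usual invocation, e.g. `python inference/bot.py --load-in-8bit` (exact script path per the repo layout).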
@@ -278,6 +287,7 @@ def main():
         args.retrieval,
         max_memory,
         not args.no_stream,
+        args.load_in_8bit,
     ).cmdloop()
 
 
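One caveat carried over from the comment above the new argument: the flag is only honored on the single-device path. The patch threads `load_in_8bit` only into the `max_memory is None` branch of `ChatModel.__init__`, so runs that set `-g` or `-r` (which populate `max_memory`) silently ignore it. A defensive warning could make that explicit; the helper below is a hypothetical addition, not part of the patch:

import warnings

def check_8bit_flags(max_memory, load_in_8bit):
    """Hypothetical helper (not in the patch): warn when --load-in-8bit is
    silently ignored because a max_memory config (-g/-r) takes the other path."""
    if max_memory is not None and load_in_8bit:
        warnings.warn("--load-in-8bit has no effect when max_memory is set (-g/-r)")

check_8bit_flags(max_memory={0: "20GiB"}, load_in_8bit=True)  # warns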