@@ -11,7 +11,7 @@
 import argparse
 import conversation as convo
 import retrieval.wikipedia as wp
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, StoppingCriteria, StoppingCriteriaList
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, StoppingCriteria, StoppingCriteriaList, BitsAndBytesConfig
 from accelerate import infer_auto_device_map, init_empty_weights

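The import change is the heart of this commit: instead of passing load_in_8bit straight to from_pretrained, the loader now builds a transformers BitsAndBytesConfig and hands it in as quantization_config, which is the style newer transformers releases expect. A minimal sketch of the new-style call, using a placeholder checkpoint name rather than anything this repo ships:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Quantization settings mirroring the ones this commit adds.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,                      # store Linear weights as int8 via bitsandbytes
    llm_int8_enable_fp32_cpu_offload=True,  # modules offloaded to CPU stay in fp32
)
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/pythia-160m",  # placeholder checkpoint, for illustration only
    device_map="auto",
    quantization_config=quant_config,
)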
@@ -51,14 +51,16 @@ class ChatModel:
     def __init__(self, model_name, gpu_id, max_memory, load_in_8bit):
         device = torch.device('cuda', gpu_id)  # TODO: allow sending to cpu

+        quantization_config = BitsAndBytesConfig(
+            load_in_8bit=load_in_8bit,
+            llm_int8_enable_fp32_cpu_offload=True,
+        )  # config to load in 8-bit if load_in_8bit
+
         # recommended default for devices with > 40 GB VRAM
         # load model onto one device
         if max_memory is None:
-            self._model = AutoModelForCausalLM.from_pretrained(
-                model_name, torch_dtype=torch.float16, device_map="auto", load_in_8bit=load_in_8bit)
-            if not load_in_8bit:
-                self._model.to(device)  # not supported by load_in_8bit
-        # load the model with the given max_memory config (for devices with insufficient VRAM or multi-gpu)
+            device_map = "auto"
+
         else:
             config = AutoConfig.from_pretrained(model_name)
             # load empty weights
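Between the two branches, max_memory is the knob that matters: accelerate expects it as a mapping from device identifier to a memory budget, and infer_auto_device_map fills devices in order until each budget is exhausted. The budgets below are made-up illustrations, not values taken from this repo:

# hypothetical per-device budgets, in the format accelerate accepts
max_memory = {0: "10GiB", "cpu": "30GiB"}              # one small GPU, spill to CPU RAM
max_memory = {0: "20GiB", 1: "20GiB", "cpu": "60GiB"}  # two GPUs plus CPU overflow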
@@ -67,21 +69,24 @@ def __init__(self, model_name, gpu_id, max_memory, load_in_8bit):

             model_from_conf.tie_weights()

-            # create a device_map from max_memory
+            #create a device_map from max_memory
             device_map = infer_auto_device_map(
                 model_from_conf,
                 max_memory=max_memory,
                 no_split_module_classes=["GPTNeoXLayer"],
-                dtype="float16"
-            )
-            # load the model with the above device_map
-            self._model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map=device_map,
-                offload_folder="offload",  # optional offload-to-disk overflow directory (auto-created)
-                offload_state_dict=True,
-                torch_dtype=torch.float16
+                dtype="float16",
             )
+
+        self._model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            device_map=device_map,
+            offload_folder="offload",
+            quantization_config=quantization_config,
+        )
+        if not load_in_8bit:
+            self._model.to(device)  # not supported by load_in_8bit
+
         self._tokenizer = AutoTokenizer.from_pretrained(model_name)

     def do_inference(self, prompt, max_new_tokens, do_sample, temperature, top_k, stream_callback=None):
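With both branches now producing only a device_map, a single from_pretrained call covers every configuration, and 8-bit loading is controlled entirely by the quantization_config (which stays inert when load_in_8bit is False, at least on the transformers versions this commit targets). A hedged sketch of calling the refactored constructor; the checkpoint name and memory figures are placeholders:

# single-device default: max_memory=None routes through device_map="auto"
bot = ChatModel("EleutherAI/pythia-160m", gpu_id=0, max_memory=None, load_in_8bit=True)

# constrained or multi-GPU setup: an explicit budget drives infer_auto_device_map
bot = ChatModel(
    "EleutherAI/pythia-160m",                 # placeholder checkpoint
    gpu_id=0,
    max_memory={0: "12GiB", "cpu": "30GiB"},  # hypothetical budget
    load_in_8bit=False,
)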