Add -e llm.gpu boolean config variable
radare committed May 9, 2024
1 parent 9caaea7 commit 66e72b0
Showing 2 changed files with 18 additions and 16 deletions.
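
In short: the commit adds a boolean config variable, llm.gpu (default "true"), that decides whether llama.cpp offloads model layers to the GPU. A new gpulayers() helper in r2ai/models.py translates the string value into llama.cpp's n_gpu_layers argument: -1 offloads every layer to the GPU, 0 keeps inference on the CPU. Judging by the commit title, the variable is meant to be set with the usual -e syntax (e.g. -e llm.gpu=false); that syntax itself is not shown in this diff. Below is a minimal standalone sketch of the mapping, reusing the names from the diff; DummyAI and the model path are placeholders for illustration, not part of the commit.

    # Minimal sketch: how the llm.gpu string maps to llama.cpp's n_gpu_layers.
    # Assumes only that config values are stored as strings in an .env dict,
    # as the diff below does. Requires the llama-cpp-python package.
    import llama_cpp

    def gpulayers(ai):
        # "true" -> -1 (offload all layers to the GPU); anything else -> 0 (CPU only)
        if ai.env.get("llm.gpu") == "true":
            return -1
        return 0

    class DummyAI:
        # Hypothetical stand-in for the Interpreter instance; only .env is read here.
        def __init__(self, use_gpu="true"):
            self.env = {"llm.gpu": use_gpu}

    llm = llama_cpp.Llama(model_path="/path/to/model.gguf",           # placeholder path
                          n_gpu_layers=gpulayers(DummyAI("false")),   # force a CPU-only run
                          n_ctx=8096)                                 # matches the llm.window default
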
7 changes: 4 additions & 3 deletions r2ai/interpreter.py
@@ -518,6 +518,7 @@ def __init__(self):
         self.system_message = ""
         self.env["debug"] = "false"
         self.env["llm.model"] = self.model ## TODO: dup. must get rid of self.model
+        self.env["llm.gpu"] = "true"
         self.env["llm.window"] = "8096" # "4096" # context_window
         self.env["llm.maxtokens"] = "4096" # "1750"
         self.env["llm.maxmsglen"] = "8096" # "1750"
@@ -623,7 +624,7 @@ def keywords_ai(self, text):
         words = []
         mmname = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
         ctxwindow = int(self.env["llm.window"])
-        mm = new_get_hf_llm(mmname, False, ctxwindow)
+        mm = new_get_hf_llm(self, mmname, False, ctxwindow)
         msg = f"Considering the sentence \"{text}\" as input, Take the KEYWORDS or combination of TWO words from the given text and respond ONLY a comma separated list of the most relevant words. DO NOT introduce your response, ONLY show the words"
         msg = f"Take \"{text}\" as input, and extract the keywords and combination of keywords to make a search online, the output must be a comma separated list" #Take the KEYWORDS or combination of TWO words from the given text and respond ONLY a comma separated list of the most relevant words. DO NOT introduce your response, ONLY show the words"
         response = mm(msg, stream=False, temperature=0.1, stop="</s>", max_tokens=1750)
@@ -682,7 +683,7 @@ def chat(self, message=None):
             try:
                 ctxwindow = int(self.env["llm.window"])
                 debug_mode = False # maybe true when debuglevel=2 ?
-                self.llama_instance = new_get_hf_llm(self.model, debug_mode, ctxwindow)
+                self.llama_instance = new_get_hf_llm(self, self.model, debug_mode, ctxwindow)
                 if self.llama_instance == None:
                     builtins.print("Cannot find the model")
                     return
@@ -767,7 +768,7 @@ def trimsource_ai(self, msg):
         mmname = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
         mmname = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
         ctxwindow = int(self.env["llm.window"])
-        self.mistral = new_get_hf_llm(mmname, False, ctxwindow)
+        self.mistral = new_get_hf_llm(self, mmname, False, ctxwindow)
         # q = f"Rewrite this code into shorter pseudocode (less than 500 tokens). keep the comments and essential logic:\n```\n{msg}\n```\n"
         q = f"Rewrite this code into shorter pseudocode (less than 200 tokens). keep the relevant comments and essential logic:\n```\n{msg}\n```\n"
         response = self.mistral(q, stream=False, temperature=0.1, stop="</s>", max_tokens=4096)
27 changes: 14 additions & 13 deletions r2ai/models.py
@@ -122,8 +122,15 @@ def models():
 -m Undi95/UtopiaXL-13B-GGUF
 """

-def get_hf_llm(repo_id, debug_mode, context_window):
-    n_gpu_layers = -1
+def gpulayers(ai):
+    if "llm.gpu" in ai.env:
+        if ai.env["llm.gpu"] == "true":
+            print("[r2ai] Using GPU")
+            return -1
+    print("[r2ai] Using CPU")
+    return 0
+
+def get_hf_llm(ai, repo_id, debug_mode, context_window):
     usermodels = None
     try:
         try:
@@ -138,7 +145,7 @@ def get_hf_llm(repo_id, debug_mode, context_window):
         if usermodels is not None and repo_id in usermodels:
             model_path = usermodels[repo_id]
             # print(f"[r2ai] Using {r2ai_model_json} {model_path}")
-            return llama_cpp.Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=debug_mode, n_ctx=context_window)
+            return llama_cpp.Llama(model_path=model_path, n_gpu_layers=gpulayers(ai), verbose=debug_mode, n_ctx=context_window)
     except:
         traceback.print_exc()
     print(f"Select {repo_id} model. See -M and -m flags", file=sys.stderr)
@@ -188,11 +195,6 @@ def get_hf_llm(repo_id, debug_mode, context_window):
            print("No model selected")
            return
        answers = inquirer.prompt([inquirer.List("default", message="Use this model by default? ~/.r2ai.model", choices=["Yes", "No"])])
-       # Third stage: GPU confirm
-       #if confirm_action("Use GPU? (Large models might crash on GPU, but will run more quickly)"):
-       ## n_gpu_layers = -1
-       # else:
-       #     n_gpu_layers = 0

        # Get user data directory
        user_data_dir = appdirs.user_data_dir("r2ai")
@@ -348,7 +350,7 @@ def supports_metal():
        json.dump(usermodels, fd)
        fd.close()
        print("Saved")
-    return llama_cpp.Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=debug_mode, n_ctx=context_window)
+    return llama_cpp.Llama(model_path=model_path, n_gpu_layers=gpulayers(ai), verbose=debug_mode, n_ctx=context_window)

 def set_default_model(repo_id):
     usermodels = {"default": repo_id}
@@ -474,13 +476,12 @@ def enough_disk_space(size, path) -> bool:

     return False

-def new_get_hf_llm(repo_id, debug_mode, context_window):
+def new_get_hf_llm(ai, repo_id, debug_mode, context_window):
     if repo_id.startswith("openai:") or repo_id.startswith("anthropic:") or repo_id.startswith("groq:") or repo_id.startswith("google:"):
         return repo_id
     if not os.path.exists(repo_id):
-        return get_hf_llm(repo_id, debug_mode, context_window)
+        return get_hf_llm(ai, repo_id, debug_mode, context_window)
     # print(f"LOADING FILE: {repo_id}")
-    n_gpu_layers = -1 # = 0 to use cpu
     user_data_dir = appdirs.user_data_dir("Open Interpreter")
     default_path = os.path.join(user_data_dir, "models")

@@ -555,4 +556,4 @@ def supports_metal():
     # Initialize and return Code-Llama
     if not os.path.isfile(model_path):
         print("Model is not a file")
-    return llama_cpp.Llama(model_path=model_path, n_gpu_layers=n_gpu_layers, verbose=debug_mode, n_ctx=context_window)
+    return llama_cpp.Llama(model_path=model_path, n_gpu_layers=gpulayers(ai), verbose=debug_mode, n_ctx=context_window)

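For code that calls into r2ai/models.py, the visible API change is an extra leading argument: get_hf_llm() and new_get_hf_llm() now receive the interpreter object first so they can read env["llm.gpu"] (inside r2ai the Interpreter simply passes self, as the interpreter.py hunks above show). A hedged call-site sketch; the import path and DummyAI are assumptions for illustration:

    # Assumes the module is importable as r2ai.models and that any object
    # exposing an .env dict of string values is accepted in place of the
    # Interpreter. On first use get_hf_llm() may interactively prompt to
    # select and download a model file.
    from r2ai.models import new_get_hf_llm

    class DummyAI:
        def __init__(self):
            # mirrors the defaults set in Interpreter.__init__ in the diff above
            self.env = {"llm.gpu": "true", "llm.window": "8096"}

    ai = DummyAI()
    ctxwindow = int(ai.env["llm.window"])
    # old shape: new_get_hf_llm(repo_id, debug_mode, ctxwindow)
    # new shape: the ai object goes first
    llm = new_get_hf_llm(ai, "TheBloke/Mistral-7B-Instruct-v0.2-GGUF", False, ctxwindow)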