From 94bd5078e6c6cf5acb5010043466cd071514fde3 Mon Sep 17 00:00:00 2001
From: JasonOA888
Date: Mon, 9 Mar 2026 15:42:09 +0800
Subject: [PATCH] feat(llm): add QMD_FORCE_CUDA env var to disable Vulkan
 offloading

On Windows VMs with para-virtualized GPUs, QMD may use Vulkan offloading
instead of pure CUDA mode even when CUDA is available. This adds
QMD_FORCE_CUDA env var to force CUDA and disable Vulkan:

    export QMD_FORCE_CUDA=1
    qmd query "test"

Fixes #278
---
 src/llm.ts | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/llm.ts b/src/llm.ts
index 100a1ec7..4a9d3628 100644
--- a/src/llm.ts
+++ b/src/llm.ts
@@ -545,10 +545,23 @@ export class LlamaCpp implements LLM {
    */
   private async ensureLlama(): Promise<void> {
     if (!this.llama) {
+      // QMD_FORCE_CPU=1 forces CPU-only mode (useful for older GPUs like Pascal)
+      // QMD_FORCE_CUDA=1 forces CUDA and disables Vulkan offloading
+      const forceCpu = process.env.QMD_FORCE_CPU === "1" || process.env.QMD_FORCE_CPU === "true";
+      const forceCuda = process.env.QMD_FORCE_CUDA === "1" || process.env.QMD_FORCE_CUDA === "true";
+
+      let gpuSetting: "auto" | "cuda" | false = "auto";
+      if (forceCpu) {
+        gpuSetting = false;
+      } else if (forceCuda) {
+        gpuSetting = "cuda";
+      }
+
       const llama = await getLlama({
         // attempt to build
-        build: "autoAttempt",
-        logLevel: LlamaLogLevel.error
+        build: "autoAttempt" as any,
+        logLevel: LlamaLogLevel.error,
+        gpu: gpuSetting,
       });
       if (llama.gpu === false) {