
Commit a905cff

Merge pull request #37 from turboderp-org/dev
Merge Dev to master
2 parents 81a0a7d + 70056fe commit a905cff


44 files changed: +990 −32 lines

eval/humaneval.py

Lines changed: 11 additions & 7 deletions

@@ -14,12 +14,12 @@
         " "
     ),
     "granite": (
-        "Question:\nComplete the following Python function:\n\n{{problem}}\n\nAnswer:\n"
+        "<|endoftext|>Question:\nComplete the following Python function:\n\n{{problem}}\n\nAnswer:\n"
         "Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "
     ),
     "llama": (
-        "[INST] <<SYS>>\n"
+        "<s>[INST] <<SYS>>\n"
         "You are a helpful AI coding assistant.\n"
         "<</SYS>>\n\n"
         "Complete the following Python function:\n\n"

@@ -28,7 +28,7 @@
         " "
     ),
     "llama3": (
-        "<|start_header_id|>system<|end_header_id|>\n\n"
+        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
         "You are a helpful AI coding assistant.<|eot_id|>"
         "<|start_header_id|>user<|end_header_id|>\n\n"
         "Complete the following Python function:\n\n{{problem}}<|eot_id|>"

@@ -37,7 +37,7 @@
         " "
     ),
     "mistral": (
-        "[INST] You are a helpful AI coding assistant.\n\n"
+        "<s>[INST] You are a helpful AI coding assistant.\n\n"
         "Complete the following Python function:\n\n"
         "{{problem}}[/INST]"
         " Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",

@@ -51,7 +51,7 @@
         " "
     ),
     "reka": (
-        "human: Complete the following Python function."
+        "<|endoftext|>human: Complete the following Python function."
         " Provide your reasoning in comments, but be concise and don't second-guess."
         "\n\n{{problem}}"
         " <sep> assistant: ```python\n{{problem}}",

@@ -76,7 +76,7 @@
         " "
     ),
     "deepseek": (
-        "You are a helpful AI coding assistant.\n"
+        "<|begin▁of▁sentence|>You are a helpful AI coding assistant.\n"
        "<|User|>Complete the following Python function:\n\n{{problem}}"
         "<|Assistant|>Sure! Here is how you might implement the function:\n\n```python\n{{problem}}",
         " "

@@ -124,7 +124,11 @@ def main(args):
     for idx, (problem_id, problem) in enumerate(problems.items()):
         b_problem = problem["prompt"]
         f_problem = prompt_format.replace("{{problem}}", b_problem)
-        input_ids = tokenizer.encode(f_problem, encode_special_tokens = True, add_bos = True)
+        input_ids = tokenizer.encode(
+            f_problem,
+            encode_special_tokens = True,
+            add_bos = (args.prompt_format == "raw")
+        )
         for s in range(num_samples_per_task):
             job = Job(
                 input_ids = input_ids,
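
With every template above now carrying its model's BOS token explicitly (<s>, <|begin_of_text|>, <|endoftext|>, <|begin▁of▁sentence|>), the tokenizer only prepends its own BOS for the "raw" format; otherwise the prompt would begin with two BOS tokens. A toy sketch of the failure mode this avoids (illustrative tokenizer, not the library's API):

def encode(text, add_bos):
    # Toy whitespace tokenizer that optionally prepends a BOS marker.
    tokens = text.split()
    return (["<s>"] if add_bos else []) + tokens

template = "<s> [INST] Complete the function [/INST]"  # BOS baked into the template

print(encode(template, add_bos = True))   # ['<s>', '<s>', ...] -- duplicated BOS
print(encode(template, add_bos = False))  # ['<s>', ...] -- correct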

examples/chat.py

Lines changed: 25 additions & 2 deletions

@@ -5,6 +5,7 @@
 from exllamav3 import Generator, Job, model_init
 from exllamav3.generator.sampler import ComboSampler
 from chat_templates import *
+from chat_util import *
 import torch
 from chat_console import *

@@ -61,15 +62,37 @@ def main(args):
     # Main loop
     print("\n" + col_sysprompt + system_prompt.strip() + col_default)
     context = []
+    response = ""

     while True:

         # Amnesia mode
         if args.amnesia:
             context = []

-        # Get user prompt and add to context
+        # Get user prompt
         user_prompt = read_input_fn(args, user_name)
+
+        # Intercept commands
+        if user_prompt.startswith("/"):
+            c = user_prompt.strip()
+            match c:
+                case "/x":
+                    print_info("Exiting")
+                    break
+                case "/cc":
+                    snippet = copy_last_codeblock(response)
+                    if not snippet:
+                        print_error("No code block found in last response")
+                    else:
+                        num_lines = len(snippet.split("\n"))
+                        print_info(f"Copied {num_lines} line{'s' if num_lines > 1 else ''} to the clipboard")
+                    continue
+                case _:
+                    print_error(f"Unknown command: {c}")
+                    continue
+
+        # Add to context
         context.append((user_prompt, None))

         # Tokenize context and trim from head if too long

@@ -141,7 +164,7 @@ def get_input_ids():
     parser.add_argument("-freqp", "--frequency_penalty", type = float, help = "Frequency penalty, 0 to disable (default: disabled)", default = 0.0)
     parser.add_argument("-penr", "--penalty_range", type = int, help = "Range for penalties, in tokens (default: 1024)", default = 1024)
     parser.add_argument("-minp", "--min_p", type = float, help = "Min-P truncation, 0 to disable (default: 0.08)", default = 0.08)
-    parser.add_argument("-topk", "--top_k", type = float, help = "Top-K truncation, 0 to disable (default: disabled)", default = 0)
+    parser.add_argument("-topk", "--top_k", type = int, help = "Top-K truncation, 0 to disable (default: disabled)", default = 0)
     parser.add_argument("-topp", "--top_p", type = float, help = "Top-P truncation, 1 to disable (default: disabled)", default = 1.0)
     _args = parser.parse_args()
     main(_args)
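
Two user-visible changes land here: the chat loop now intercepts slash commands before anything reaches the model (/x exits, /cc copies the last code block of the previous response to the clipboard via chat_util), and --top_k is parsed as an int rather than a float. The type fix lines up with what PyTorch's top-k operation expects; a sketch of the failure an integer-valued float can cause (assumed motivation, not stated in the commit):

import torch

logits = torch.randn(1, 32000)
k = 50.0  # what argparse produced with type = float

try:
    torch.topk(logits, k)  # PyTorch requires an integer k
except TypeError as e:
    print("rejected:", e)

print(torch.topk(logits, int(k)).values.shape)  # torch.Size([1, 50])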

examples/chat_console.py

Lines changed: 7 additions & 0 deletions

@@ -15,8 +15,15 @@
 col_think1 = "\u001b[35;1m"  # Bright magenta
 col_think2 = "\u001b[35m"  # Magenta
 col_error = "\u001b[31;1m"  # Bright red
+col_info = "\u001b[32;1m"  # Bright green
 col_sysprompt = "\u001b[37;1m"  # Grey

+def print_error(text):
+    print(col_error + "\nError: " + col_default + text)
+
+def print_info(text):
+    print(col_info + "\nInfo: " + col_default + text)
+
 def read_input_console(args, user_name):
     print("\n" + col_user + user_name + ": " + col_default, end = '', flush = True)
     if args.multiline:
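
These helpers wrap standard ANSI SGR escape sequences, nothing repo-specific: 31 is red, 32 is green, 35 is magenta, a trailing ;1 selects the bright variant, and \u001b[0m resets (so the new code 32;1 is bright green, not red as the committed comment said). A standalone line using the same convention:

col_info = "\u001b[32;1m"   # bright green, as used by print_info above
col_default = "\u001b[0m"   # reset to the terminal's default color
print(col_info + "\nInfo: " + col_default + "Copied 12 lines to the clipboard")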

examples/chat_util.py

Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
+import re
+import sys
+import pyperclip
+
+def copy_last_codeblock(text: str) -> str | None:
+    pattern = re.compile(r"```[^\n`]*\n(.*?)```", re.DOTALL)
+    matches = pattern.findall(text)
+    if not matches:
+        return None
+    snippet = matches[-1].strip()
+    pyperclip.copy(snippet)
+    return snippet
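
A brief usage sketch for the new helper (requires the pyperclip package and a working clipboard backend, e.g. xclip or wl-clipboard on Linux):

from chat_util import copy_last_codeblock

response = (
    "Here is one option:\n"
    "```python\n"
    "def add(a, b):\n"
    "    return a + b\n"
    "```\n"
)
snippet = copy_last_codeblock(response)
print(snippet)  # the code between the fences, now also on the clipboard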

exllamav3/conversion/allocation.py

Lines changed: 2 additions & 2 deletions

@@ -44,7 +44,7 @@ def allocate_transformer(
         assert d
         if isinstance(g, list):
             for m in (g, u, d):
-                key_ = m[0].key.replace(".slice.0", ".slice.*")
+                key_ = m[0].key.replace(".slice.0", ".slice.*").replace(".experts.0.", ".experts.*.")
                 keys += [key_]
                 numels += [sum(mm.weights_numel() for mm in m)]
                 for mm in m:

@@ -65,7 +65,7 @@
         assert d
         if isinstance(u, list):
             for m in (u, d):
-                key_ = m[0].key.replace(".slice.0", ".slice.*")
+                key_ = m[0].key.replace(".slice.0", ".slice.*").replace(".experts.0.", ".experts.*.")
                 keys += [m]
                 numels += [sum(mm.weights_numel() for mm in m)]
                 for mm in m:
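
The added .replace() collapses per-expert parameter keys into one wildcard entry, so a mixture-of-experts layer is budgeted as a single group in the allocation tables instead of once per expert. A toy illustration (the key string is invented for the example; real keys come from the model's modules):

key = "model.layers.0.mlp.experts.0.down.slice.0"
key_ = key.replace(".slice.0", ".slice.*").replace(".experts.0.", ".experts.*.")
print(key_)  # model.layers.0.mlp.experts.*.down.slice.*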

exllamav3/conversion/convert_model.py

Lines changed: 11 additions & 1 deletion

@@ -278,7 +278,9 @@ def main(args, job_state):
             qmaps = module.get_qmaps()
             if len(qmaps) > 0:

-                # Capture calibration input states during forward pass
+                # Capture calibration input states during forward pass. For block-sparse models, all expert layers
+                # are activated to ensure all down projections capture at least some calibration data. When the
+                # state is advanced later, only selected experts will be used.
                 with ProgressBar(f" -- Capturing: {module.key}" + slice_str, len(state)) as progress:
                     capture_H = {}
                     ref_states = []

@@ -287,12 +289,20 @@
                     params = {
                         "attn_mode": "flash_attn_nc",
                         "capture": capture_H,
+                        "activate_all_experts": model.calibration_all_experts,
                     }
                     if slicing:
                         params["q_mlp_slice"] = current_slice
                     rs = module.prepare_for_device(state[i], params)
                     rs = module.forward(rs, params)
                     if i < num_ref_states:
+                        if model.calibration_all_experts:
+                            # Reference states for measuring error must be captured with only the selected experts
+                            params = { "attn_mode": "flash_attn_nc" }
+                            if slicing:
+                                params["q_mlp_slice"] = current_slice
+                            rs = module.prepare_for_device(state[i], params)
+                            rs = module.forward(rs, params)
                         ref_states.append(rs.cpu())
                     rs = None
                 print(f" -- Captured: {module.key}" + slice_str)
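
The scheme in miniature: the capture pass forces every expert to process the calibration batch so no expert's input statistics are empty, while the reference output used to score quantization error comes from a normal routed pass. A toy stand-in for that behavior (assumed semantics of activate_all_experts, not the library's actual modules):

import torch

def moe_forward(x, experts, router, activate_all_experts = False):
    if activate_all_experts:
        # Calibration mode: every expert sees the batch and accumulates
        # statistics, even ones the router would never select.
        return sum(e(x) for e in experts) / len(experts)
    # Normal routed pass: only the top-1 expert handles each row.
    idx = router(x).argmax(dim = -1)
    return torch.stack([experts[i](x[n]) for n, i in enumerate(idx)])

experts = [torch.nn.Linear(8, 8) for _ in range(4)]
router = torch.nn.Linear(8, 4)
x = torch.randn(2, 8)

capture_out = moe_forward(x, experts, router, activate_all_experts = True)
ref_out = moe_forward(x, experts, router)  # reference for error measurement
print(capture_out.shape, ref_out.shape)    # torch.Size([2, 8]) twice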

exllamav3/exllamav3_ext/bindings.cpp

Lines changed: 1 addition & 0 deletions

@@ -55,6 +55,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
     m.def("exl3_gemm", &exl3_gemm, "exl3_gemm");
     m.def("exl3_gemm_num_kernel_shapes", &exl3_gemm_num_kernel_shapes, "exl3_gemm_num_kernel_shapes");
     m.def("exl3_gemm_shape_compat", &exl3_gemm_shape_compat, "exl3_gemm_shape_compat");
+    m.def("exl3_mgemm", &exl3_mgemm, "exl3_mgemm");
     m.def("hgemm", &hgemm, "hgemm");
     m.def("rope", &rope, "rope");
     m.def("silu_mul", &silu_mul, "silu_mul");

exllamav3/exllamav3_ext/generator/rep_pen.cu

Lines changed: 2 additions & 1 deletion

@@ -75,7 +75,8 @@ void apply_rep_pens_kernel

         float w = v > 0.0f ? v / rep_p : v * rep_p;
         float f = factors[i] + 1e-30;
-        float o = v * (1.0f - f) + w * f;
+        float f1 = (1.0f - f) + 1e-30;
+        float o = v * f1 + w * f;
         out_logits[i + range_min] = o;
     }
 }
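
A plausible reading of the extra epsilon (an assumption; the commit carries no rationale): when a penalty factor saturates at exactly 1.0, the old blend multiplied the raw logit v by zero, and for masked logits at -inf that yields 0 * inf = NaN. Keeping the weight strictly positive preserves -inf instead. In Python terms:

import math

v = -math.inf   # a masked logit
w = v           # the penalized value; v * rep_p is still -inf
f = 1.0         # fully saturated penalty factor

old = v * (1.0 - f) + w * f             # -inf * 0.0 -> nan
new = v * ((1.0 - f) + 1e-30) + w * f   # -inf * tiny -> -inf
print(old, new)                         # nan -inf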

exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_1.cu

Lines changed: 6 additions & 0 deletions

@@ -17,4 +17,10 @@ fp_exl3_gemm_kernel tfp_exl3_gemm_kernel_fp16_b1[] = {
     EXL3_GEMM_KERNEL_INSTANCES(1, false)
 };

+fp_exl3_mgemm_kernel tfp_exl3_mgemm_kernel_fp32_b1[] = {
+    EXL3_MGEMM_KERNEL_INSTANCES(1, true)
+};
+
+fp_exl3_mgemm_kernel tfp_exl3_mgemm_kernel_fp16_b1[] = {
+    EXL3_MGEMM_KERNEL_INSTANCES(1, false)
+};

exllamav3/exllamav3_ext/quant/comp_units/exl3_comp_unit_1.cuh

Lines changed: 2 additions & 0 deletions

@@ -2,3 +2,5 @@

 extern fp_exl3_gemm_kernel tfp_exl3_gemm_kernel_fp32_b1[];
 extern fp_exl3_gemm_kernel tfp_exl3_gemm_kernel_fp16_b1[];
+extern fp_exl3_mgemm_kernel tfp_exl3_mgemm_kernel_fp32_b1[];
+extern fp_exl3_mgemm_kernel tfp_exl3_mgemm_kernel_fp16_b1[];
