Commit 350713a

Merge branch 'dev'
2 parents 5cada8c + 41bcd48 commit 350713a


110 files changed, +7126 -782 lines changed

README.md

Lines changed: 4 additions & 10 deletions
```diff
@@ -1,13 +1,6 @@
 
 # <img src="doc/cat.png" width="40"> ExLlamaV3
 
-ExLlamaV3 is still in development. Please note: ↙
-
-- The framework <u>is not yet fully optimized</u>. Performance is lacking, especially on Ampere, and there may be a significant CPU bottleneck on slower processors until the extension functions are fully built out.
-- AMD GPUs (ROCm) are not yet supported.
-- [FlashAttention-2](https://github.com/Dao-AILab/flash-attention) is currently required. I hope to switch over to [FlashInfer](https://github.com/flashinfer-ai/flashinfer/tree/main) in time, but there are some obstacles to overcome first.
-- A number of important features are yet to be added, such as tensor parallelism.
-
 ## Why?
 
 As the name implies, the original intention for ExLlama was to run inference on quantized Llama models. ExLlamaV2 was able to support a number of other architectures by treating every new model as (more or less) a Llama variant with optional features. However, as new models are increasingly moving away from the basic transformer template, this approach is no longer sustainable.
@@ -18,12 +11,13 @@ Aside from lifting a few of the most successful features from V2 (such as the ge
 
 ## What's missing?
 
-There's much that still needs to be added and/or ported over from ExLlamaV2. I've decided to release ExLlamaV3 in its current state to invite testing, feedback and contributions, but please be aware that it's not yet a viable replacement for ExLlamaV2. Currently on the to-do list:
+Currently on the to-do list:
 
+- Lots of optimization
 - LoRA support
 - ROCm support
-- Tensor-parallel inference
-- Lots of optimization
+- More sampling functions
+- More quantization modes (FP4 etc.)
 
 As for what is implemented, expect that some things may be a little broken at first. Please be patient and/or contribute. 👉👈
 
```

examples/chat.py

Lines changed: 16 additions & 2 deletions
```diff
@@ -62,6 +62,9 @@ def main(args):
         temp_last = not args.temperature_first,
     )
 
+    # Single prompt mode
+    single_prompt = args.prompt
+
     # Main loop
     print("\n" + col_sysprompt + system_prompt.strip() + col_default)
     context = []
@@ -76,8 +79,18 @@ def main(args):
             context = []
 
         # Get user prompt
-        user_prompt = read_input_fn(args, user_name, multiline)
-        prefix = ""
+        if single_prompt is not None:
+            # This round, use provided prompt from cmdline
+            user_prompt = single_prompt
+            prefix = ""
+            # Next round, exit
+            single_prompt = "/x"
+        else:
+            try:
+                user_prompt = read_input_fn(args, user_name, multiline)
+                prefix = ""
+            except KeyboardInterrupt:
+                user_prompt = "/x"
 
         # Intercept commands
         if user_prompt.startswith("/"):
@@ -282,5 +295,6 @@ def get_input_ids(_prefix):
     parser.add_argument("-topk", "--top_k", type = int, help = "Top-K truncation, 0 to disable (default: disabled)", default = 0)
     parser.add_argument("-topp", "--top_p", type = float, help = "Top-P truncation, 1 to disable (default: disabled)", default = 1.0)
     parser.add_argument("-tps", "--show_tps", action = "store_true", help = "Show tokens/second after every reply")
+    parser.add_argument("-prompt", "--prompt", type = str, help = "Run single prompt, then exit")
     _args = parser.parse_args()
     main(_args)
```
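Net effect of the chat.py changes: passing `-prompt "..."` on the command line feeds a single prompt through the normal chat loop and then exits via the existing `/x` command, and Ctrl+C at the input prompt now exits cleanly instead of raising. A self-contained sketch of that sentinel-command pattern (a hypothetical standalone loop, not the actual chat.py code):

```python
# Hypothetical sketch of the single-prompt pattern added above: a one-shot prompt is fed
# through the normal REPL path, then the exit command "/x" is injected on the next pass.
def run(single_prompt: str | None = None):
    while True:
        if single_prompt is not None:
            user_prompt = single_prompt      # this round, use the command-line prompt
            single_prompt = "/x"             # next round, trigger the exit command
        else:
            try:
                user_prompt = input("User: ")
            except KeyboardInterrupt:
                user_prompt = "/x"           # Ctrl+C also exits cleanly

        if user_prompt.startswith("/"):      # intercept commands, as chat.py does
            if user_prompt.strip() == "/x":
                break
            continue

        print(f"(model reply to {user_prompt!r} would go here)")

run("Why is the sky blue?")   # single-prompt mode: one reply, then exit
run()                         # interactive mode
```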

examples/multimodal.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@
         model_dir = "/mnt/str/models/gemma3-4b-it/exl3/5.0bpw/"
     case "mistral3":
         prompt_format = "mistral"
-        model_dir = "/mnt/str/models/mistral-small-3.1-24b-instruct/exl3/8.0bpw_H8"
+        model_dir = "/mnt/str/models/mistral-small-3.1-24b-instruct-2503/exl3/4.0bpw/"
 
 images = [
     # Cat
```

exllamav3/__init__.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,5 +1,5 @@
-from .models.config import Config
-from .models.model import Model
+from .model.config import Config
+from .model.model import Model
 from .tokenizer import Tokenizer, MMEmbedding
 from .cache import Cache, CacheLayer_fp16, CacheLayer_quant
 from .generator import Generator, Job, AsyncGenerator, AsyncJob, Filter, FormatronFilter
```
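The rename from `exllamav3.models` to `exllamav3.model` only touches internal import paths; the names re-exported here are unchanged. For orientation, a minimal sketch of how these exports are typically wired together — the constructor and method names (`Config.from_directory`, `Model.from_config`, `model.load`, `Tokenizer.from_config`, `generator.generate`) follow the bundled examples and are assumptions, not something this diff establishes:

```python
# Hypothetical usage sketch of the top-level exports above; method names are assumptions.
from exllamav3 import Config, Model, Cache, Tokenizer, Generator

config = Config.from_directory("/path/to/exl3/model")   # assumed loader helper
model = Model.from_config(config)                       # concrete class picked via the architecture registry
cache = Cache(model, max_num_tokens = 8192)             # assumed cache constructor
model.load()
tokenizer = Tokenizer.from_config(config)

generator = Generator(model = model, cache = cache, tokenizer = tokenizer)
print(generator.generate(prompt = "Hello, my name is", max_new_tokens = 32))
```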

exllamav3/architecture/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+from __future__ import annotations
```

exllamav3/architecture/arcee.py

Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@ (new file)

```python
from typing_extensions import override
import torch
from ..modules import RMSNorm, Embedding, TransformerBlock, Attention, MLP, Linear
from ..model.config import Config, no_default
from ..model.model import Model
from ..util.rope import RopeStyle
from ..modules.attn import prepare_for_attn

class ArceeConfig(Config):
    arch_string = "ArceeForCausalLM"

    def __init__(
        self,
        directory: str,
        derived_model: dict | None = None,
        **kwargs,
    ):
        super().__init__(
            directory,
            derived_model if derived_model else {"text": ArceeModel},
            **kwargs
        )

        # Attention params
        self.head_dim = self.read_cfg(int, "head_dim", None)
        self.hidden_size = self.read_cfg(int, "hidden_size", no_default)
        self.num_q_heads = self.read_cfg(int, "num_attention_heads", no_default)
        self.num_kv_heads = self.read_cfg(int, "num_key_value_heads", self.num_q_heads)

        if not self.head_dim:
            self.head_dim = self.hidden_size // self.num_q_heads

        # MLP params
        self.assert_cfg(str, "hidden_act", "relu2", True)
        self.intermediate_size = self.read_cfg(int, "intermediate_size", no_default)

        # Norms
        self.rms_norm_eps = self.read_cfg(float, "rms_norm_eps", no_default)

        # Layers
        self.num_hidden_layers = self.read_cfg(int, "num_hidden_layers", no_default)
        self.tie_word_embeddings = self.read_cfg(bool, "tie_word_embeddings", False)

        # RoPE
        self.rope_settings = self.read_rope_settings_default(RopeStyle.NEOX)


class ArceeModel(Model):
    config_class = ArceeConfig

    def __init__(
        self,
        config: ArceeConfig,
        **kwargs
    ):
        super().__init__(config, **kwargs)

        self.modules += [
            Embedding(
                config = config,
                key = "model.embed_tokens",
                vocab_size = config.vocab_size,
                hidden_size = config.hidden_size,
            )
        ]

        self.first_block_idx = len(self.modules)

        self.modules += [
            TransformerBlock(
                config = config,
                key = f"model.layers.{idx}",
                attn_norm = RMSNorm(
                    config = config,
                    key = f"model.layers.{idx}.input_layernorm",
                    rms_norm_eps = config.rms_norm_eps,
                ),
                attn = Attention(
                    config = config,
                    key = f"model.layers.{idx}.self_attn",
                    layer_idx = idx,
                    hidden_size = config.hidden_size,
                    head_dim = config.head_dim,
                    num_q_heads = config.num_q_heads,
                    num_kv_heads = config.num_kv_heads,
                    rope_settings = config.rope_settings,
                    sm_scale = None,
                    key_q = "q_proj",
                    key_k = "k_proj",
                    key_v = "v_proj",
                    key_o = "o_proj",
                    qmap = "block.attn",
                ),
                mlp_norm = RMSNorm(
                    config = config,
                    key = f"model.layers.{idx}.post_attention_layernorm",
                    rms_norm_eps = config.rms_norm_eps,
                ),
                mlp = MLP(
                    config = config,
                    key = f"model.layers.{idx}.mlp",
                    hidden_size = config.hidden_size,
                    intermediate_size = config.intermediate_size,
                    key_up = "up_proj",
                    key_down = "down_proj",
                    qmap = "block.mlp",
                    activation_fn = "relu2",
                    out_dtype = torch.float,
                ),
            )
            for idx in range(config.num_hidden_layers)
        ]

        self.last_kv_module_idx = len(self.modules) - 1

        head_alt_key = None
        if config.tie_word_embeddings and not self.config.stc.has_tensor("lm_head"):
            head_alt_key = "model.embed_tokens"

        self.modules += [
            RMSNorm(
                config = config,
                key = "model.norm",
                rms_norm_eps = config.rms_norm_eps,
                out_dtype = torch.half,
            ),
            Linear(
                config = config,
                key = "lm_head",
                qbits_key = "head_bits",
                alt_key = head_alt_key,
                in_features = config.hidden_size,
                out_features = config.vocab_size,
                qmap = "block",
                caps = {"logits_output": True}
            )
        ]

        self.logit_layer_idx = len(self.modules) - 1

    @override
    def prepare_inputs(self, input_ids: torch.Tensor, params: dict) -> torch.Tensor:
        params["input_ids"] = input_ids
        input_ids = prepare_for_attn(input_ids, params)
        return input_ids

    @override
    def default_chat_prompt(self, prompt: str, system_prompt: str | None = None) -> str:
        p = "<|begin_of_text|>"
        if system_prompt:
            p += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
        p += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
        return p
```
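One detail worth calling out in the new Arcee support: `ArceeConfig` asserts `hidden_act == "relu2"` and the blocks use a plain `MLP` (up/down projections only, no gate) with `activation_fn = "relu2"`, rather than the gated SiLU MLP of most Llama-style models. As a reference point, a small sketch of the squared-ReLU activation that "relu2" conventionally denotes — an illustration of the definition only, not ExLlamaV3's actual kernel:

```python
# Sketch of relu2 (squared ReLU), the activation ArceeConfig asserts and ArceeModel's MLP uses:
# relu2(x) = relu(x) ** 2. Definition only; not ExLlamaV3's fused implementation.
import torch
import torch.nn.functional as F

def relu2(x: torch.Tensor) -> torch.Tensor:
    r = F.relu(x)
    return r * r

x = torch.tensor([-2.0, -0.5, 0.0, 1.0, 3.0])
print(relu2(x))  # tensor([0., 0., 0., 1., 9.])
```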

exllamav3/models/architectures.py renamed to exllamav3/architecture/architectures.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -1,3 +1,4 @@
+from .arcee import ArceeModel
 from .cohere import CohereModel
 from .cohere2 import Cohere2Model
 from .decilm import DeciLMModel
@@ -8,6 +9,7 @@
 from .gemma2 import Gemma2Model
 from .gemma3 import Gemma3Model, Gemma3TextModel
 from .glm4 import Glm4Model
+from .glm4_moe import Glm4MoeModel
 from .llama import LlamaModel
 from .mimo import MiMoModel
 from .mistral import MistralModel
@@ -25,6 +27,7 @@
         "config_class": m.config_class,
         "model_class": m,
     } for m in [
+        ArceeModel,
         CohereModel,
         Cohere2Model,
         DeciLMModel,
@@ -36,6 +39,7 @@
         Gemma3Model,
         Gemma3TextModel,
         Glm4Model,
+        Glm4MoeModel,
         LlamaModel,
         MiMoModel,
         MistralModel,
@@ -50,4 +54,4 @@
 }
 
 def get_architectures():
-    return ARCHITECTURES
+    return ARCHITECTURES
```
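These registry entries are what make the new models loadable: `get_architectures()` exposes a dict of per-architecture entries carrying a `config_class` and a `model_class`, with Arcee registering under the `arch_string` `"ArceeForCausalLM"` defined in arcee.py above. A rough sketch of how such a registry is consumed when resolving a checkpoint directory — the keying by `arch_string` and the `config.json` lookup are illustrative assumptions, not code from this commit:

```python
# Illustrative sketch: resolve a checkpoint's architecture string against the registry
# built in architectures.py. The keying and helper below are assumptions, not commit code.
import json, os
from exllamav3.architecture.architectures import get_architectures

def resolve_architecture(model_dir: str):
    with open(os.path.join(model_dir, "config.json")) as f:
        cfg = json.load(f)
    arch_string = cfg["architectures"][0]          # e.g. "ArceeForCausalLM"
    entry = get_architectures().get(arch_string)   # assumes the dict is keyed by arch_string
    if entry is None:
        raise ValueError(f"Unsupported architecture: {arch_string}")
    return entry["config_class"], entry["model_class"]
```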

exllamav3/models/cohere.py renamed to exllamav3/architecture/cohere.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -1,9 +1,9 @@
 from __future__ import annotations
 from typing_extensions import override
 import torch
-from .config import Config, no_default
-from .model import Model
-from ..util.rope import RopeSettings, RopeStyle
+from ..model.config import Config, no_default
+from ..model.model import Model
+from ..util.rope import RopeStyle
 from ..modules import LayerNorm, Embedding, ParallelDecoderBlock, Attention, GatedMLP, Linear
 from ..modules.attn import prepare_for_attn
 
@@ -152,7 +152,6 @@ def __init__(
 
     @override
     def prepare_inputs(self, input_ids: torch.Tensor, params: dict) -> torch.Tensor:
-        params["input_ids"] = input_ids
         input_ids = prepare_for_attn(input_ids, params)
         return input_ids
 
```

exllamav3/models/cohere2.py renamed to exllamav3/architecture/cohere2.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -1,9 +1,9 @@
 from __future__ import annotations
 from typing_extensions import override
 import torch
-from .config import Config, no_default
-from .model import Model
-from ..util.rope import RopeSettings, RopeStyle
+from ..model.config import Config, no_default
+from ..model.model import Model
+from ..util.rope import RopeStyle
 from ..modules import LayerNorm, Embedding, ParallelDecoderBlock, Attention, GatedMLP, Linear
 from ..modules.attn import prepare_for_attn
 
@@ -151,7 +151,6 @@ def __init__(
 
     @override
     def prepare_inputs(self, input_ids: torch.Tensor, params: dict) -> torch.Tensor:
-        params["input_ids"] = input_ids
         input_ids = prepare_for_attn(input_ids, params)
         return input_ids
 
```

exllamav3/models/decilm.py renamed to exllamav3/architecture/decilm.py

Lines changed: 3 additions & 4 deletions
```diff
@@ -1,9 +1,9 @@
 from __future__ import annotations
 from typing_extensions import override
 import torch
-from .config import Config, no_default
-from .model import Model
-from ..util.rope import RopeSettings, RopeStyle
+from ..model.config import Config, no_default
+from ..model.model import Model
+from ..util.rope import RopeStyle
 from ..modules import RMSNorm, Embedding, TransformerBlock, Attention, GatedMLP, Linear
 from ..modules.attn import prepare_for_attn
 
@@ -172,7 +172,6 @@ def __init__(
 
     @override
     def prepare_inputs(self, input_ids: torch.Tensor, params: dict) -> torch.Tensor:
-        params["input_ids"] = input_ids
         input_ids = prepare_for_attn(input_ids, params)
         return input_ids
 
```
