
Commit 281e101: Qwen3 next (#4039)

1 parent: 29db947

Commit message:

* WIP
* wip
* WIP
* first
* fix chat
* add env check
* add comment
* fix pad
* cudagraph
* init cache
* fix
* mem pool
* fix state allocate
* add skip warmup flag
* update conv state
File tree: 26 files changed, +1517 −27 lines

README.md (1 addition, 0 deletions)

@@ -130,6 +130,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
   <li>Qwen2-MoE (57BA14B)</li>
   <li>Qwen2.5 (0.5B - 32B)</li>
   <li>Qwen3, Qwen3-MoE</li>
+  <li>Qwen3-Next(80B)</li>
   <li>Baichuan (7B)</li>
   <li>Baichuan2 (7B-13B)</li>
   <li>Code Llama (7B - 34B)</li>

README_ja.md (1 addition, 0 deletions)

@@ -117,6 +117,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
   <li>Qwen2-MoE (57BA14B)</li>
   <li>Qwen2.5 (0.5B - 32B)</li>
   <li>Qwen3, Qwen3-MoE</li>
+  <li>Qwen3-Next(80B)</li>
   <li>Baichuan (7B)</li>
   <li>Baichuan2 (7B-13B)</li>
   <li>Code Llama (7B - 34B)</li>

README_zh-CN.md (1 addition, 0 deletions)

@@ -131,6 +131,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力，在各种规模的模型
   <li>Qwen2-MoE (57BA14B)</li>
   <li>Qwen2.5 (0.5B - 32B)</li>
   <li>Qwen3, Qwen3-MoE</li>
+  <li>Qwen3-Next(80B)</li>
   <li>Baichuan (7B)</li>
   <li>Baichuan2 (7B-13B)</li>
   <li>Code Llama (7B - 34B)</li>

docs/en/supported_models/supported_models.md (1 addition, 0 deletions)

@@ -85,6 +85,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
 | QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
 | Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
 | Qwen3 | 0.6B - 235B | LLM | Yes | Yes | Yes\* | - | Yes\* |
+| QWen3-Next | 80B | LLM | Yes | No | No | No | No |
 | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | Yes |
 | QWen2.5-VL | 3B - 72B | MLLM | Yes | No | No | No | No |
 | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |

docs/zh_cn/supported_models/supported_models.md (1 addition, 0 deletions)

@@ -85,6 +85,7 @@
 | QWen2 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
 | Qwen2.5 | 0.5B - 72B | LLM | Yes | Yes | No | Yes | Yes |
 | Qwen3 | 0.6B - 235B | LLM | Yes | Yes | Yes\* | - | Yes |
+| QWen3-Next | 80B | LLM | Yes | No | No | No | No |
 | QWen2-VL | 2B, 7B | MLLM | Yes | Yes | No | No | Yes |
 | QWen2.5-VL | 3B - 72B | MLLM | Yes | No | No | No | No |
 | DeepSeek-MoE | 16B | LLM | Yes | No | No | No | No |

lmdeploy/pytorch/backends/cuda/graph_runner.py (21 additions, 7 deletions)

@@ -91,6 +91,16 @@ def __init__(
         self.pool = pool
         self._graph: torch.cuda.CUDAGraph = None
 
+    def make_output_buffers(self, output):
+        """Make output buffers."""
+        output_buffers = dict(logits=output)
+        return output_buffers
+
+    def slice_output(self, output_buffers: Dict[str, Any], inputs: Dict[str, Any]):
+        """Slice output."""
+        num_tokens = inputs['input_ids'].size(-1)
+        return output_buffers['logits'][:, :num_tokens]
+
     @record_function('capture_cudagraph')
     def capture(self, **kwargs):
         """Capture graph."""
@@ -102,29 +112,31 @@ def capture(self, **kwargs):
         current_stream = torch.cuda.current_stream()
 
         # warmup
-        self.model(**padded_kwargs)
+        warmup_output = self.model(**padded_kwargs)
+        warmup_buffers = self.make_output_buffers(warmup_output)
 
         self._graph = torch.cuda.CUDAGraph()
         # unsafe kernel call in another thread might invalidate the capture,
         # so we set thread_safe capture mode here.
         with torch.cuda.graph(self._graph, pool=self.pool, stream=current_stream, capture_error_mode='thread_local'):
             output = self.model(**padded_kwargs)
 
-        output_buffers = dict(logits=output)
+        output_buffers = self.make_output_buffers(output)
         self.meta.output_buffers = output_buffers
+        output = self.slice_output(warmup_buffers, kwargs)
         return output
 
     @record_function('forward_cudagraph')
     def forward(self, **kwargs):
         """forward."""
-        num_tokens = kwargs['input_ids'].size(-1)
         assert self._graph is not None
         self.model.fill_buffers_cudagraph(self.meta, **kwargs)
         context = self.ctx_mgr.current_context()
         self.model.update_context_cudagraph(self.meta, context)
         self._graph.replay()
 
-        output = self.meta.output_buffers['logits'][:, :num_tokens]
+        output_buffers = self.meta.output_buffers
+        output = self.slice_output(output_buffers, kwargs)
         return output
 
     def __del__(self):
@@ -223,12 +235,14 @@ def __call__(self, **kwargs):
                                            pool=self.graph_pool_handle,
                                            model_config=self.model_config,
                                            device=self.device)
-            runner.capture(**kwargs)
+            output = runner.capture(**kwargs)
             self._runner_map[graph_key] = runner
+            # SSM layers update their state during the capture warmup; replaying the graph here would lead to an unexpected second state update.
+            return output
         else:
             runner = self._runner_map[graph_key]
-        output = runner.forward(**kwargs)
-        return output
+            output = runner.forward(**kwargs)
+            return output
 
     @record_function('prepare_inputs_for_generation')
     def prepare_inputs_for_generation(
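The reason capture() now returns the warmup output is that stateful (SSM) layers already advance their recurrent state during the warmup pass, so replaying the freshly captured graph for the same request would apply that update twice. Below is a minimal sketch of this capture/replay pattern with static output buffers, using a hypothetical TinyGraphRunner and a stand-in model rather than the LMDeploy classes; it assumes a CUDA device and a model whose forward takes one padded input_ids tensor.

```python
# Minimal sketch of the capture/replay pattern above; TinyGraphRunner and the
# model are stand-ins, not LMDeploy code. Assumes a CUDA device.
import torch


class TinyGraphRunner:

    def __init__(self, model, max_tokens: int):
        self.model = model
        self.max_tokens = max_tokens
        self._graph = None
        self.input_buffer = torch.zeros(1, max_tokens, dtype=torch.long, device='cuda')
        self.output_buffers = None  # plays the role of make_output_buffers above

    def capture(self, input_ids: torch.Tensor):
        num_tokens = input_ids.size(-1)
        self.input_buffer.zero_()
        self.input_buffer[:, :num_tokens].copy_(input_ids)

        # warmup runs outside the graph; a stateful (SSM) layer already advances
        # its state here, so the caller reuses this output instead of replaying
        warmup_output = self.model(self.input_buffer)

        self._graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(self._graph):
            output = self.model(self.input_buffer)
        self.output_buffers = {'logits': output}  # static buffer reused by every replay

        return warmup_output[:, :num_tokens]  # slice_output equivalent: drop the padded tail

    def forward(self, input_ids: torch.Tensor):
        num_tokens = input_ids.size(-1)
        self.input_buffer.zero_()
        self.input_buffer[:, :num_tokens].copy_(input_ids)
        self._graph.replay()  # writes into the captured static output tensor
        return self.output_buffers['logits'][:, :num_tokens]
```

The static output_buffers dict mirrors make_output_buffers above, and the [:, :num_tokens] slice mirrors slice_output, which trims the padding added for the fixed graph shape.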

lmdeploy/pytorch/check_env/model.py (7 additions, 1 deletion)

@@ -57,7 +57,13 @@ def check_dtype(self, config):
             if not is_bf16_supported(device_type):
                 logger.warning('Device does not support bfloat16.')
         except Exception as e:
-            message = (f'Checking failed with error {e}', 'Please send issue to LMDeploy with error logs.')
+            message = (f'Checking failed with error {e}. Please send issue to LMDeploy with error logs.')
+            self.log_and_exit(e, 'Model', message=message)
+
+        try:
+            model_config.check_env_func(device_type)
+        except Exception as e:
+            message = (f'Checking failed with error {e}.')
             self.log_and_exit(e, 'Model', message=message)
 
     def check(self):
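Besides wiring model-specific checks through model_config.check_env_func, this hunk fixes a subtle bug in the existing error path: the old message was a 2-tuple of strings (a comma inside parentheses), not a single string, so the log carried a tuple repr. A quick plain-Python illustration of the difference, not LMDeploy code:

```python
# Illustration of the message bug fixed above (plain Python, not LMDeploy code).
err = RuntimeError('boom')

# old: a comma inside parentheses builds a 2-tuple of strings, not one string
old_message = (f'Checking failed with error {err}', 'Please send issue to LMDeploy with error logs.')
# new: a single formatted string
new_message = f'Checking failed with error {err}. Please send issue to LMDeploy with error logs.'

print(type(old_message))  # <class 'tuple'>
print(new_message)        # Checking failed with error boom. Please send issue to LMDeploy with error logs.
```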

lmdeploy/pytorch/config.py (15 additions, 2 deletions)

@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import enum
-from dataclasses import dataclass
-from typing import Any, Dict, List, Literal
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Literal, Tuple
 
 import torch
 
@@ -86,6 +86,8 @@ class CacheConfig:
     enable_prefix_caching: bool = False
     quant_policy: Literal[0, 4, 8] = 0
     device_type: str = 'cuda'
+    num_state_caches: int = None
+    states_shapes: List[Tuple] = field(default_factory=list)
 
     # For PD Disaggregation
     role: EngineRole = EngineRole.Hybrid
@@ -183,6 +185,10 @@ def override_hf_config(hf_config: Any, hf_overrides: Dict[str, Any]):
         _override_hf_config(hf_config, k, v)
 
 
+def _default_check_env(device: str):
+    pass
+
+
 @dataclass
 class ModelConfig:
     """Config of model."""
@@ -208,6 +214,13 @@ class ModelConfig:
     dllm_mask_token: int = 0
     dllm_block_length: int = None
 
+    # added for qwen3_next
+    # could be used for any SSM model.
+    states_shapes: List[Tuple[Tuple[int], torch.dtype]] = field(default_factory=list)
+
+    # check env for the model-device combination
+    check_env_func: Callable = _default_check_env
+
     def get_head_size(self):
         """Get head size."""
         return self.head_dim
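ModelConfig.states_shapes records one (shape, dtype) pair per extra state tensor an SSM-style model needs, and CacheConfig mirrors it alongside num_state_caches so the engine knows how many state slots to reserve. As a rough sketch of how such fields could be consumed, here is a hypothetical allocator (not the engine's actual cache manager) that creates one buffer per registered state with a leading cache-slot dimension:

```python
# Hypothetical allocator sketch (not LMDeploy's cache manager) showing how
# states_shapes / num_state_caches could be consumed.
from typing import List, Tuple

import torch


def allocate_state_caches(states_shapes: List[Tuple[Tuple[int, ...], torch.dtype]],
                          num_state_caches: int,
                          device: str = 'cuda') -> List[torch.Tensor]:
    """Allocate one buffer per registered state, with a leading cache-slot dim."""
    caches = []
    for shape, dtype in states_shapes:
        caches.append(torch.zeros((num_state_caches, *shape), dtype=dtype, device=device))
    return caches
```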

lmdeploy/pytorch/configurations/default.py (2 additions, 0 deletions)

@@ -37,6 +37,8 @@ def build(cls, hf_config, model_path: str = None, **kwargs):
             eos_token_id=hf_config.eos_token_id,
             sliding_window=sliding_window,
             head_dim=head_dim,
+            k_head_dim=head_dim,
+            v_head_dim=head_dim,
             vocab_size=hf_config.vocab_size,
             llm_config=hf_config,
         )
New file: Qwen3-Next model config builder (58 additions, 0 deletions)

@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .builder import AutoModelConfigBuilder
+from .default import DefaultModelConfigBuilder
+
+
+def _check_env_qwen3_next(device: str):
+    """Check env for qwen3 next."""
+    if device != 'cuda':
+        return
+
+    # check cuda
+    try:
+        import causal_conv1d  # noqa: F401
+    except ImportError:
+        raise ImportError('Qwen3-Next cuda support requires https://github.com/Dao-AILab/causal-conv1d.')
+
+    try:
+        import fla  # noqa: F401
+    except ImportError:
+        raise ImportError('Qwen3-Next cuda support requires https://github.com/fla-org/flash-linear-attention.')
+
+
+class Qwen3NextModelConfigBuilder(AutoModelConfigBuilder):
+
+    @classmethod
+    def condition(cls, hf_config):
+        """config."""
+        return hf_config.model_type == 'qwen3_next'
+
+    @classmethod
+    def build(cls, hf_config, model_path: str = None, tp: int = 1, **kwargs):
+        """build."""
+        cfg = DefaultModelConfigBuilder.build(hf_config, model_path, tp=tp, **kwargs)
+
+        # update num layers
+        num_layers = cfg.num_layers
+        num_full_layers = num_layers // hf_config.full_attention_interval
+        num_delta_layers = num_full_layers * (hf_config.full_attention_interval - 1)
+        cfg.num_layers = num_full_layers
+
+        # set state shapes
+        head_k_dim = hf_config.linear_key_head_dim
+        head_v_dim = hf_config.linear_value_head_dim
+        num_v_heads = hf_config.linear_num_value_heads // tp
+        num_k_heads = hf_config.linear_num_key_heads // tp
+        key_dim = head_k_dim * num_k_heads
+        value_dim = head_v_dim * num_v_heads
+        conv_dim = key_dim * 2 + value_dim
+        conv_kernel_size = hf_config.linear_conv_kernel_dim
+
+        conv_state_shape = (num_delta_layers, conv_dim, conv_kernel_size)
+        recurrent_state_shape = (num_delta_layers, num_v_heads, head_k_dim, head_v_dim)
+        dtype = torch.bfloat16
+        cfg.states_shapes = [(conv_state_shape, dtype), (recurrent_state_shape, dtype)]
+        cfg.check_env_func = _check_env_qwen3_next
+        return cfg
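To make the shape math concrete, here is a worked example under assumed Qwen3-Next-80B-A3B hyperparameters (48 layers, full_attention_interval=4, 16 linear key heads, 32 linear value heads, 128-dim linear heads, conv kernel 4, tp=1); the real values should be read from the model's config.json, so treat the numbers as illustrative only:

```python
# Worked example of the shape math above, under assumed Qwen3-Next-80B-A3B
# hyperparameters (read the real values from the model's config.json).
num_layers, full_attention_interval = 48, 4
num_full_layers = num_layers // full_attention_interval              # 12 full-attention layers
num_delta_layers = num_full_layers * (full_attention_interval - 1)   # 36 linear-attention layers

head_k_dim = head_v_dim = 128
num_k_heads, num_v_heads = 16, 32
key_dim = head_k_dim * num_k_heads    # 2048
value_dim = head_v_dim * num_v_heads  # 4096
conv_dim = key_dim * 2 + value_dim    # 8192

conv_state_shape = (num_delta_layers, conv_dim, 4)                               # (36, 8192, 4)
recurrent_state_shape = (num_delta_layers, num_v_heads, head_k_dim, head_v_dim)  # (36, 32, 128, 128)
```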
