diff --git a/docs/base_config_usage.md b/docs/base_config_usage.md new file mode 100644 index 00000000..53d05c06 --- /dev/null +++ b/docs/base_config_usage.md @@ -0,0 +1,135 @@ +# BaseTestConfig 使用文档 + +## 概述 + +`BaseTestConfig` 是 InfiniLM 项目的统一配置基类,为各个测试脚本提供通用的命令行参数解析和配置管理功能。 + +## 特性 + +- **统一参数管理**: 提供所有测试脚本共用的基础参数 +- **设备类型映射**: 支持多种硬件设备的自动类型转换 +- **灵活扩展**: 使用 `parse_known_args()` 容错处理,允许脚本添加特定参数 +- **类型安全**: 自动将设备字符串转换为对应的 `DeviceType` 枚举 + +## 支持的设备类型 + +| 设备名称 | DeviceType 枚举值 | +|---------|-----------------| +| `cpu` | `DEVICE_TYPE_CPU` | +| `nvidia` | `DEVICE_TYPE_NVIDIA` | +| `qy` | `DEVICE_TYPE_QY` | +| `cambricon` | `DEVICE_TYPE_CAMBRICON` | +| `ascend` | `DEVICE_TYPE_ASCEND` | +| `metax` | `DEVICE_TYPE_METAX` | +| `moore` | `DEVICE_TYPE_MOORE` | +| `iluvatar` | `DEVICE_TYPE_ILUVATAR` | +| `kunlun` | `DEVICE_TYPE_KUNLUN` | +| `hygon` | `DEVICE_TYPE_HYGON` | + +## 通用参数说明 + +| 参数 | 类型 | 是否必需 | 默认值 | 说明 | +|------|------|---------|--------|------| +| `--model_path` | str | ✓ 是 | - | 模型文件路径 | +| `--device` | str | 否 | `cpu` | 目标设备类型(见上表) | +| `--ndev` | int | 否 | `1` | 使用的设备数量 | +| `--verbose` | flag | 否 | `False` | 启用详细输出模式 | + +## 基本使用 + +### 1. 直接使用(测试) + +```bash +python scripts/base_config.py --model_path /path/to/model +``` + +### 2. 
在脚本中继承使用 + +```python +from base_config import BaseTestConfig + +class MyTestConfig(BaseTestConfig): + def __init__(self): + super().__init__() + # 添加脚本特定参数 + self.parser.add_argument("--my_param", type=int, default=10) + self.my_param = self.args.my_param + +# 使用 +cfg = MyTestConfig() +print(f"模型路径: {cfg.model_path}") +print(f"设备类型: {cfg.device_type}") +print(f"自定义参数: {cfg.my_param}") +``` + +## 命令行示例 + +### 基础用法 +```bash +python your_script.py --model_path ./models/llama2 +``` + +### 使用 NVIDIA GPU +```bash +python your_script.py --model_path ./models/llama2 --device nvidia --ndev 2 +``` + +### 使用 QY 设备并启用详细输出 +```bash +python your_script.py --model_path ./models/llama2 --device qy --verbose +``` + +### 结合自定义参数 +```bash +python your_script.py --model_path ./models/llama2 --device nvidia --ndev 4 --batch_size 32 +``` + +## 类属性说明 + +初始化后可访问的属性: + +- `model_path` (str): 模型路径 +- `ndev` (int): 设备数量 +- `verbose` (bool): 详细输出开关 +- `device_name` (str): 设备名称(原始输入) +- `device_type` (DeviceType): 设备类型枚举值 +- `args` (Namespace): 解析后的参数命名空间 +- `extra` (list): 未解析的额外参数 + + +## 扩展指南 + +### 添加新的设备类型 + +修改 `_get_device_type` 方法中的 `DEVICE_TYPE_MAP`: + +```python +DEVICE_TYPE_MAP = { + # ... 现有映射 ... + "new_device": DeviceType.DEVICE_TYPE_NEW, +} +``` + +### 添加新的通用参数 + +修改 `_add_common_args` 方法: + +```python +def _add_common_args(self): + # ... 现有参数 ... + self.parser.add_argument("--new_param", type=str, default="default") +``` + +## 注意事项 + +1. **参数顺序**: 命令行参数顺序不影响解析结果 +2. **类型转换**: `--ndev` 等整数参数会自动验证类型 +3. **参数覆盖**: 后出现的参数会覆盖前面的同名参数 +4. 
**帮助信息**: 使用 `--help` 查看所有可用参数 + +## 相关文件 + +- `scripts/jiuge_config.py`: 九格评测配置 +- `scripts/jiuge_ppl_config.py`: 九格 PPL 配置 +- `scripts/jiuge.py`: 九格评测主脚本 + diff --git a/jiuge.sh b/jiuge.sh new file mode 100644 index 00000000..c186fc7c --- /dev/null +++ b/jiuge.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Jiuge模型运行脚本 +# 使用NVIDIA显卡运行9G4B模型 + +set -e # 遇到错误立即退出 + +echo "==========================================" +echo "🚀 启动 Jiuge 模型 (9G4B) - NVIDIA版本" +echo "==========================================" +export INFINI_ROOT=/home/featurize/.infini +# 设置参数 +MODEL_DIR="/home/featurize/work/InfiniFamily/9G4B" +DEVICE="--nvidia" +N_DEVICE=1 +SCRIPT_PATH="python scripts/jiuge.py" + +# 检查模型目录是否存在 +if [ ! -d "$MODEL_DIR" ]; then + echo "❌ 错误: 模型目录不存在: $MODEL_DIR" + echo "请检查路径是否正确" + exit 1 +fi + +# 检查Python脚本是否存在 +if [ ! -f "scripts/jiuge.py" ]; then + echo "❌ 错误: 未找到jiuge.py脚本: scripts/jiuge.py" + echo "请确保在当前目录下运行此脚本" + exit 1 +fi + +echo "📁 模型路径: $MODEL_DIR" +echo "🎯 设备类型: NVIDIA GPU" +echo "💻 设备数量: $N_DEVICE" +echo "" + +# 运行模型 +echo "🔄 启动模型..." 
+$SCRIPT_PATH $DEVICE $MODEL_DIR $N_DEVICE + +echo "" +echo "==========================================" +echo "✅ 模型运行完成" +echo "==========================================" \ No newline at end of file diff --git a/scripts/base_config.py b/scripts/base_config.py new file mode 100644 index 00000000..ab100211 --- /dev/null +++ b/scripts/base_config.py @@ -0,0 +1,53 @@ +import argparse +import sys +from libinfinicore_infer import DeviceType + + + + +class BaseTestConfig: + def __init__(self): + self.parser = argparse.ArgumentParser(description="InfiniLM Unified Config") + self._add_common_args() + + # 核心:使用 parse_known_args() 容忍脚本特有参数 + # args 存储解析好的命名空间,extra 存储未识别的参数 + self.args, self.extra = self.parser.parse_known_args() + + self.model_path = self.args.model_path + self.ndev = self.args.ndev + self.verbose = self.args.verbose + + self.device_name = self.args.device + self.device_type = self._get_device_type(self.args.device) + + + + def _add_common_args(self): + + self.parser.add_argument("--device", type=str, default="cpu") + self.parser.add_argument("--model_path", type=str, required=True) + self.parser.add_argument("--ndev", type=int, default=1) + self.parser.add_argument("--verbose", action="store_true") + + + def _get_device_type(self, dev_str): + DEVICE_TYPE_MAP = { + "cpu": DeviceType.DEVICE_TYPE_CPU, + "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + "qy": DeviceType.DEVICE_TYPE_QY, + "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + "ascend": DeviceType.DEVICE_TYPE_ASCEND, + "metax": DeviceType.DEVICE_TYPE_METAX, + "moore": DeviceType.DEVICE_TYPE_MOORE, + "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + "hygon": DeviceType.DEVICE_TYPE_HYGON, + } + + return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + +if __name__ == '__main__': + cfg = BaseTestConfig() + print(cfg.model_path) + print(cfg.ndev) \ No newline at end of file diff --git a/scripts/jiuge.py b/scripts/jiuge.py index 35b2c8ca..f73fd199 100644 
--- a/scripts/jiuge.py +++ b/scripts/jiuge.py @@ -871,7 +871,8 @@ def test(): # Find n_device argument (skip --verbose) ndev_args = [arg for arg in sys.argv[3:] if arg != "--verbose"] ndev = int(ndev_args[0]) if ndev_args else 1 - + print("type is") + print(type(device_type)) model = JiugeForCauslLM(model_path, device_type, ndev) model.generate("山东最高的山是?", 500, verbose=verbose) model.destroy_model_instance() diff --git a/scripts/jiuge_config.py b/scripts/jiuge_config.py new file mode 100644 index 00000000..7f2708f2 --- /dev/null +++ b/scripts/jiuge_config.py @@ -0,0 +1,849 @@ +from typing import List, Sequence +import math +import os +from pathlib import Path +import safetensors +import sys +import time +import json +import torch +import transformers +from base_config import BaseTestConfig +from libinfinicore_infer import ( + JiugeModel, + JiugeMetaCStruct, + JiugeWeightsCStruct, + DataType, + DeviceType, + KVCacheCStruct, +) +from infer_task import InferTask, KVCache + +from ctypes import POINTER, c_float, c_int, c_uint, c_void_p, byref + +torch.set_default_device("cpu") + + +class LlamaWeightsNaming: + def input_embd(self): + return "model.embed_tokens.weight" + + def output_norm(self): + return "model.norm.weight" + + def output_embd(self): + return "lm_head.weight" + + def attn_norm(self, i): + return f"model.layers.{i}.input_layernorm.weight" + + def attn_q(self, i): + return f"model.layers.{i}.self_attn.q_proj.weight" + + def attn_k(self, i): + return f"model.layers.{i}.self_attn.k_proj.weight" + + def attn_v(self, i): + return f"model.layers.{i}.self_attn.v_proj.weight" + + def attn_o(self, i): + return f"model.layers.{i}.self_attn.o_proj.weight" + + def attn_q_b(self, i): + return f"model.layers.{i}.self_attn.q_proj.bias" + + def attn_k_b(self, i): + return f"model.layers.{i}.self_attn.k_proj.bias" + + def attn_v_b(self, i): + return f"model.layers.{i}.self_attn.v_proj.bias" + + def attn_q_norm(self, i): + return f"model.layers.{i}.self_attn.q_norm.weight" 
+ + def attn_k_norm(self, i): + return f"model.layers.{i}.self_attn.k_norm.weight" + + def ffn_norm(self, i): + return f"model.layers.{i}.post_attention_layernorm.weight" + + def gate(self, i): + return f"model.layers.{i}.mlp.gate_proj.weight" + + def up(self, i): + return f"model.layers.{i}.mlp.up_proj.weight" + + def down(self, i): + return f"model.layers.{i}.mlp.down_proj.weight" + + def match(state_dict): + return ( + "model.norm.weight" in state_dict + and "model.layers.0.self_attn.q_proj.weight" in state_dict + ) + + +class JiugeMetaFromLlama(JiugeMetaCStruct): + def __init__(self, config, dtype=torch.float16, max_tokens=None): + if dtype == torch.float16: + dt_ = DataType.INFINI_DTYPE_F16 + elif dtype == torch.float32: + dt_ = DataType.INFINI_DTYPE_F32 + elif dtype == torch.bfloat16: + dt_ = DataType.INFINI_DTYPE_BF16 + else: + dt_ = DataType.INFINI_DTYPE_F16 + + self.scale_input = 1.0 + self.scale_output = 1.0 + self.scale_o = 1.0 + self.scale_down = 1.0 + if ( + config["model_type"] in ["fm9g", "minicpm"] + and "scale_emb" in config + and "scale_depth" in config + and "dim_model_base" in config + ): + self.scale_input = config["scale_emb"] + self.scale_output = config["hidden_size"] // config["dim_model_base"] + self.scale_o = config["scale_depth"] / math.sqrt( + config["num_hidden_layers"] + ) + self.scale_down = config["scale_depth"] / math.sqrt( + config["num_hidden_layers"] + ) + + super().__init__( + dt_logits=dt_, + nlayer=config["num_hidden_layers"], + d=config["hidden_size"], + nh=config["num_attention_heads"], + nkvh=( + config["num_key_value_heads"] + if "num_key_value_heads" in config + else config["num_attention_heads"] + ), + dh=( + config["head_dim"] + if "head_dim" in config + else config["hidden_size"] // config["num_attention_heads"] + ), + di=config["intermediate_size"], + dctx=( + config["max_position_embeddings"] if max_tokens is None else max_tokens + ), + dvoc=config["vocab_size"], + epsilon=config["rms_norm_eps"], + 
theta=(config["rope_theta"] if "rope_theta" in config else 100000.0), + end_token=2, + ) + self.torch_dtype_logits = dtype + + +class JiugeWeightsImpl(JiugeWeightsCStruct): + def __init__( + self, + meta, + naming, + state_dict, + torch_dt_mat=torch.float16, + torch_dt_norm=torch.float32, + ndev=1, + transpose_weight=True, + ): + nlayer = meta.nlayer + nh = meta.nh + nkvh = meta.nkvh + dh = meta.dh + d = meta.d + di = meta.di + scale_input = meta.scale_input + scale_output = meta.scale_output + scale_o = meta.scale_o + scale_down = meta.scale_down + assert nh % nkvh == 0 + assert nh % ndev == 0 + assert nkvh % ndev == 0 + assert di % ndev == 0 + torch_dt_logits = meta.torch_dtype_logits + if torch_dt_mat == torch.float16: + self.dt_mat = DataType.INFINI_DTYPE_F16 + elif torch_dt_mat == torch.float32: + self.dt_mat = DataType.INFINI_DTYPE_F32 + elif torch_dt_mat == torch.bfloat16: + self.dt_mat = DataType.INFINI_DTYPE_BF16 + else: + raise ValueError("Unsupported proj weight data type") + if torch_dt_norm == torch.float16: + self.dt_norm = DataType.INFINI_DTYPE_F16 + elif torch_dt_norm == torch.float32: + self.dt_norm = DataType.INFINI_DTYPE_F32 + elif torch_dt_norm == torch.bfloat16: + self.dt_norm = DataType.INFINI_DTYPE_BF16 + else: + raise ValueError("Unsupported norm weight data type") + + input_embd_naming = ( + naming.input_embd() + if naming.input_embd() in state_dict + else naming.output_embd() + ) + output_embd_naming = ( + naming.output_embd() + if naming.output_embd() in state_dict + else naming.input_embd() + ) + self.transpose_linear_weights = 1 if transpose_weight else 0 + self.nlayer = nlayer + self.input_embd_tensor = ( + state_dict[input_embd_naming].to(torch_dt_logits) * scale_input + ) + self.input_embd = self.input_embd_tensor.data_ptr() + self.output_norm_tensor = ( + state_dict[naming.output_norm()].to(torch_dt_norm) * scale_output + ) + self.output_norm = self.output_norm_tensor.data_ptr() + self.output_embd_tensor = 
state_dict[output_embd_naming].to(torch_dt_mat) + if not transpose_weight: + self.output_embd_tensor = self.output_embd_tensor.transpose( + 0, 1 + ).contiguous() + self.output_embd = self.output_embd_tensor.data_ptr() + + self.attn_norm_tensors = [ + state_dict[naming.attn_norm(i)].to(torch_dt_norm) for i in range(nlayer) + ] + self.attn_norm_ptrs = [ + self.attn_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_norm = (c_void_p * nlayer)(*self.attn_norm_ptrs) + + def qkv_slices(_i): + _Q = ( + state_dict[naming.attn_q(_i)] + .reshape([nh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _K = ( + state_dict[naming.attn_k(_i)] + .reshape([nkvh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _V = state_dict[naming.attn_v(_i)].reshape([nkvh, dh // 2, 2, d]) + _result = [] + _nh = nh // ndev + _nkvh = nkvh // ndev + for _idev in range(ndev): + _result.append(_Q[_idev * _nh : (_idev + 1) * _nh, :, :, :]) + _result.append(_K[_idev * _nkvh : (_idev + 1) * _nkvh, :, :, :]) + _result.append(_V[_idev * _nkvh : (_idev + 1) * _nkvh, :, :]) + return _result + + self.qkv_tensor = [ + torch.concat(qkv_slices(i)).to(torch_dt_mat) for i in range(nlayer) + ] + if not transpose_weight: + for i in range(nlayer): + self.qkv_tensor[i] = ( + self.qkv_tensor[i] + .reshape(ndev, (nh + 2 * nkvh) // ndev * dh, d) + .transpose(1, 2) + .contiguous() + ) + self.qkv_tensor_ptrs = [self.qkv_tensor[i].data_ptr() for i in range(nlayer)] + self.attn_qkv = (c_void_p * nlayer)(*self.qkv_tensor_ptrs) + + def qkv_b_slices(_i): + _QB = ( + state_dict[naming.attn_q_b(_i)] + .reshape([nh, 2, dh // 2]) + .transpose(1, 2) + ) + _KB = ( + state_dict[naming.attn_k_b(_i)] + .reshape([nkvh, 2, dh // 2]) + .transpose(1, 2) + ) + _VB = state_dict[naming.attn_v_b(_i)].reshape([nkvh, dh // 2, 2]) + _result = [] + _nh = nh // ndev + _nkvh = nkvh // ndev + for _idev in range(ndev): + _result.append(_QB[_idev * _nh : (_idev + 1) * _nh, :, :].flatten()) + _result.append(_KB[_idev * _nkvh : (_idev + 1) * _nkvh, :, 
:].flatten()) + _result.append(_VB[_idev * _nkvh : (_idev + 1) * _nkvh, :, :].flatten()) + return _result + + if naming.attn_q_b(0) in state_dict: + self.qkv_b_tensors = [ + torch.concat(qkv_b_slices(i)).to(torch_dt_logits) for i in range(nlayer) + ] + self.qkv_b_tensor_ptrs = [ + self.qkv_b_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_qkv_b = (c_void_p * nlayer)(*self.qkv_b_tensor_ptrs) + else: + self.attn_qkv_b = None + + if naming.attn_q_norm(0) in state_dict: + self.attn_q_norm_tensors = [ + state_dict[naming.attn_q_norm(i)] + .reshape([2, dh // 2]) + .transpose(0, 1) + .contiguous() + .to(torch_dt_norm) + for i in range(nlayer) + ] + self.attn_q_norm_ptrs = [ + self.attn_q_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_q_norm = (c_void_p * nlayer)(*self.attn_q_norm_ptrs) + self.attn_k_norm_tensors = [ + state_dict[naming.attn_k_norm(i)] + .reshape([2, dh // 2]) + .transpose(0, 1) + .contiguous() + .to(torch_dt_norm) + for i in range(nlayer) + ] + self.attn_k_norm_ptrs = [ + self.attn_k_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_k_norm = (c_void_p * nlayer)(*self.attn_k_norm_ptrs) + else: + self.attn_q_norm = None + self.attn_k_norm = None + + self.attn_o_tensor = [ + ( + state_dict[naming.attn_o(i)] + .to(torch_dt_mat) + .reshape([d, ndev, nh // ndev * dh]) + .transpose(0, 1) + .contiguous() + if transpose_weight + else state_dict[naming.attn_o(i)] + .transpose(0, 1) + .to(torch_dt_mat) + .contiguous() + ) + * scale_o + for i in range(nlayer) + ] + self.attn_o_ptrs = [self.attn_o_tensor[i].data_ptr() for i in range(nlayer)] + self.attn_o = (c_void_p * nlayer)(*self.attn_o_ptrs) + + self.ffn_norm_tensors = [ + state_dict[naming.ffn_norm(i)].to(torch_dt_norm) for i in range(nlayer) + ] + self.ffn_norm_ptrs = [ + self.ffn_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.ffn_norm = (c_void_p * nlayer)(*self.ffn_norm_ptrs) + + def gate_up_slices(_i): + _result = [] + _di = di // ndev + for _idev 
in range(ndev): + _start = _idev * _di + _end = (_idev + 1) * _di + _result.append(state_dict[naming.gate(_i)][_start:_end, :]) + _result.append(state_dict[naming.up(_i)][_start:_end, :]) + return _result + + self.gate_up_tensors = [ + torch.concat(gate_up_slices(i)).to(torch_dt_mat) for i in range(nlayer) + ] + if not transpose_weight: + for i in range(nlayer): + self.gate_up_tensors[i] = ( + self.gate_up_tensors[i] + .reshape(ndev, 2 * di // ndev, d) + .transpose(1, 2) + .contiguous() + ) + self.gate_up_ptrs = [self.gate_up_tensors[i].data_ptr() for i in range(nlayer)] + self.ffn_gate_up = (c_void_p * nlayer)(*self.gate_up_ptrs) + + self.ffn_down_tensor = [ + ( + state_dict[naming.down(i)] + .to(torch_dt_mat) + .reshape([d, ndev, di // ndev]) + .transpose(0, 1) + .contiguous() + if transpose_weight + else state_dict[naming.down(i)] + .transpose(0, 1) + .to(torch_dt_mat) + .contiguous() + ) + * scale_down + for i in range(nlayer) + ] + self.ffn_down_ptrs = [self.ffn_down_tensor[i].data_ptr() for i in range(nlayer)] + self.ffn_down = (c_void_p * nlayer)(*self.ffn_down_ptrs) + + +class JiugeBatchedTask: + def __init__(self, tasks: List[InferTask]): + self.tasks = tasks + self.nreq = len(tasks) + + # Precompute fields + token_lists = [t.tokens for t in tasks] + self.req_lens_list = [len(toks) for toks in token_lists] + self.req_pos_list = [t.pos for t in tasks] + self.kv_cache_ptrs = [t.kvcache().data() for t in tasks] + self.temperaturas_list = [t.temperature for t in tasks] + self.topks_list = [t.topk for t in tasks] + self.topps_list = [t.topp for t in tasks] + + # Flatten token lists + flat_tokens = [tok for toks in token_lists for tok in toks] + self.ntok = len(flat_tokens) + + # Convert to ctypes arrays in one pass + self.tokens = (c_uint * self.ntok)(*flat_tokens) + self.req_lens = (c_uint * self.nreq)(*self.req_lens_list) + self.req_pos = (c_uint * self.nreq)(*self.req_pos_list) + self.kv_caches = (POINTER(KVCacheCStruct) * self.nreq)(*self.kv_cache_ptrs) + 
self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list) + self.topks = (c_uint * self.nreq)(*self.topks_list) + self.topps = (c_float * self.nreq)(*self.topps_list) + + def input_args(self): + return ( + self.tokens, + self.ntok, + self.req_lens, + self.nreq, + self.req_pos, + self.kv_caches, + self.temperaturas, + self.topks, + self.topps, + ) + + +class JiugeForCauslLM: + def __init__( + self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None + ): + def load_all_safetensors_from_dir(dir_path_: str): + tensors_ = {} + dir_path_ = Path(dir_path_) + for file in sorted(dir_path_.glob("*.safetensors")): + data_ = safetensors.safe_open(file, "pt") + for name_ in data_.keys(): + tensors_[name_] = data_.get_tensor(name_) + return tensors_ + + print("Loading model weights to host...") + load_start_time = time.time() + + with open(os.path.join(model_dir_path, "config.json"), "r") as f: + config = json.load(f) + self.config = config + eos_token_id = self.config["eos_token_id"] + self.eos_token_id = ( + [eos_token_id] if type(eos_token_id) == int else eos_token_id + ) + transpose_weight = ( + device != DeviceType.DEVICE_TYPE_ASCEND + ) # y = xW is faster than y=xW^T on Ascend + + self.jiuge_model = JiugeModel() + + if "llama" == config["model_type"]: + model = ( + transformers.LlamaForCausalLM.from_pretrained(model_dir_path) + .cpu() + .half() + ) + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + model.state_dict(), + ndev=ndev, + transpose_weight=transpose_weight, + ) + elif "fm9g" == config["model_type"] or "minicpm" == config["model_type"]: + if any( + file.suffix == ".safetensors" for file in Path(model_dir_path).iterdir() + ): + state_dict = load_all_safetensors_from_dir(model_dir_path) + else: + state_dict = torch.load( + os.path.join(model_dir_path, 
"pytorch_model.bin"), + weights_only=True, + map_location="cpu", + ) + if LlamaWeightsNaming.match(state_dict): + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + else: + raise ValueError("Unsupported weight naming") + elif "fm9g7b" == config["model_type"]: + if any( + file.suffix == ".safetensors" for file in Path(model_dir_path).iterdir() + ): + state_dict = load_all_safetensors_from_dir(model_dir_path) + else: + state_dict = torch.load( + os.path.join(model_dir_path, "pytorch_model.bin"), + weights_only=True, + map_location="cpu", + ) + if LlamaWeightsNaming.match(state_dict): + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + else: + raise ValueError("Unsupported weight naming") + elif "qwen2" == config["model_type"] or "qwen3" == config["model_type"]: + state_dict = load_all_safetensors_from_dir(model_dir_path) + if LlamaWeightsNaming.match(state_dict): + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path + ) + else: + raise ValueError("Unsupported model architecture") + + if "llama" == config["model_type"]: + from tokenizers import decoders as _dec + + backend = getattr(self.tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = 
getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + print(f"Creating model on {ndev} devices...") + load_start_time = time.time() + self.dev_ids = (c_int * ndev)(*[i for i in range(ndev)]) + self.ndev = ndev + self.device = device + + self.model_instance = self.jiuge_model.create_model( + byref(self.meta), + byref(self.weights), + device, + ndev, + self.dev_ids, + ) + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + def max_context_len(self): + return self.meta.dctx + + def create_kv_cache(self): + return self.jiuge_model.create_kv_cache( + self.meta.nlayer, + self.meta.dctx, + self.meta.nkvh, + self.meta.dh, + self.meta.dh, + self.meta.dt_logits, + self.device, + self.dev_ids, + self.ndev, + ) + + def drop_kv_cache(self, kv_cache): + self.jiuge_model.drop_kv_cache(kv_cache) + + def batch_infer_one_round(self, tasks: List[InferTask]): + output = (c_uint * len(tasks))() + batch_inputs = JiugeBatchedTask(tasks) + self.jiuge_model.infer_batch( + self.model_instance, + *(batch_inputs.input_args()), + output, + ) + return list(output) + + def generate( + self, + input_content, + max_steps, + topp_=1.0, + topk_=1, + temperature_=1.0, + verbose=False, + ): + input_content = self.tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": input_content}], + add_generation_prompt=True, + tokenize=False, + ) + print(input_content, end="", flush=True) + tokens = self.tokenizer.encode(input_content) + infer_task = InferTask( + 0, + tokens, + self.max_context_len(), + temperature_, + topk_, + topp_, + self.eos_token_id, + ) + 
infer_task.bind_kvcache(KVCache(self)) + + steps = 0 + total_time = 0 + prefill_time = 0 + decode_time = 0 + output_content = "" + + # Prefill phase - process initial prompt + prefill_start_time = time.time() + output_tokens = self.batch_infer_one_round([infer_task]) + prefill_end_time = time.time() + prefill_time = prefill_end_time - prefill_start_time + steps += 1 + + output_str = self.tokenizer.decode(output_tokens[0]) + output_content += output_str + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + # If generation ends after prefill, calculate metrics + total_time = prefill_time + total_tokens = len(tokens) + 1 # input tokens + first output token + + print("\n") + print(f"Time per step: {total_time * 1000:.3f}ms") + + if verbose: + overall_throughput = total_tokens / total_time + prefill_throughput = len(tokens) / prefill_time + decode_throughput = 1 / 0.001 # Avoid division by zero, use small value + + print("=" * 50) + print("PERFORMANCE METRICS") + print("=" * 50) + print(f"Input tokens: {len(tokens)}") + print(f"Generated tokens: 1") + print(f"Total tokens: {total_tokens}") + print(f"Total time: {total_time * 1000:.3f}ms") + print(f"Prefill time: {prefill_time * 1000:.3f}ms") + print(f"Decode time: 0.000ms") + print("-" * 50) + print(f"Time per step: {total_time * 1000:.3f}ms") + print( + f"Avg prefill time per token: {prefill_time * 1000 / len(tokens):.3f}ms" + ) + print(f"Avg decode time per token: N/A") + print("-" * 50) + print(f"Overall throughput: {overall_throughput:.2f} tokens/s") + print(f"Prefill throughput: {prefill_throughput:.2f} tokens/s") + print(f"Decode throughput: N/A") + print("=" * 50) + + return output_content, total_time * 1000 + + infer_task.next(output_tokens[0]) + + # Decode phase - generate subsequent tokens + decode_start_time = time.time() + for step_i in range(1, max_steps): + start_time = time.time() + output_tokens = self.batch_infer_one_round([infer_task]) + end_time = time.time() + steps += 
1 + output_str = self.tokenizer.decode(output_tokens[0]) + + output_content += output_str + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + break + infer_task.next(output_tokens[0]) + + if step_i > 0: + total_time += end_time - start_time + + decode_end_time = time.time() + decode_time = decode_end_time - decode_start_time + + print("\n") + + # Calculate performance metrics + total_time = prefill_time + decode_time + input_tokens = len(tokens) + generated_tokens = steps # including first token from prefill + + # Time per token calculations + avg_time_per_step = ( + total_time * 1000 / (steps - 1) if steps > 1 else total_time * 1000 + ) + + print(f"Time per step: {avg_time_per_step:.3f}ms") + + # Only print detailed metrics if verbose flag is set + if verbose: + total_tokens = input_tokens + generated_tokens + + # Throughput calculations + overall_throughput = total_tokens / total_time # tokens per second + prefill_throughput = input_tokens / prefill_time if prefill_time > 0 else 0 + decode_throughput = ( + (generated_tokens - 1) / decode_time if decode_time > 0 else 0 + ) # exclude first token from prefill + + # Time per token calculations + avg_prefill_time_per_token = ( + prefill_time * 1000 / input_tokens if input_tokens > 0 else 0 + ) + avg_decode_time_per_token = ( + decode_time * 1000 / (generated_tokens - 1) + if generated_tokens > 1 + else 0 + ) + + print("=" * 50) + print("PERFORMANCE METRICS") + print("=" * 50) + print(f"Input tokens: {input_tokens}") + print(f"Generated tokens: {generated_tokens}") + print(f"Total tokens: {total_tokens}") + print(f"Total time: {total_time * 1000:.3f}ms") + print(f"Prefill time: {prefill_time * 1000:.3f}ms") + print(f"Decode time: {decode_time * 1000:.3f}ms") + print("-" * 50) + print(f"Time per step: {avg_time_per_step:.3f}ms") + print(f"Avg prefill time per token: {avg_prefill_time_per_token:.3f}ms") + print(f"Avg decode time per token: {avg_decode_time_per_token:.3f}ms") + print("-" 
* 50) + print(f"Overall throughput: {overall_throughput:.2f} tokens/s") + print(f"Prefill throughput: {prefill_throughput:.2f} tokens/s") + print(f"Decode throughput: {decode_throughput:.2f} tokens/s") + print("=" * 50) + + infer_task._kv_cache.drop(self) + return output_content, avg_time_per_step + + def perplexity(self, test_sequences: List[Sequence[int]], batch_size=10): + tasks = [ + InferTask(i, [], self.max_context_len(), 1.0, 1, 1.0, self.eos_token_id) + for i in range(batch_size) + ] + kv_caches = [KVCache(self) for _ in range(batch_size)] + + nll = 0.0 + total_len = 0 + + for i in range(0, len(test_sequences), batch_size): + batch_id = 0 + true_tokens = [] + while batch_id < batch_size and batch_id + i < len(test_sequences): + input_tokens = test_sequences[i + batch_id][:-1] + true_tokens.extend(test_sequences[i + batch_id][1:]) + tasks[batch_id].tokens = input_tokens + tasks[batch_id].bind_kvcache(kv_caches[batch_id]) + batch_id += 1 + + batch_inputs = JiugeBatchedTask(tasks[:batch_id]) + logits = torch.zeros( + (batch_inputs.ntok, self.meta.dvoc), dtype=self.meta.torch_dtype_logits + ) + self.jiuge_model.forward_batch( + self.model_instance, + batch_inputs.tokens, + batch_inputs.ntok, + batch_inputs.req_lens, + batch_inputs.nreq, + batch_inputs.req_pos, + batch_inputs.kv_caches, + logits.data_ptr(), + ) + + logits = logits.float() + token_ids = torch.tensor(true_tokens, dtype=torch.int64) # [ntok,] + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) # (ntok, vocab) + token_logprobs = log_probs[ + torch.arange(batch_inputs.ntok), token_ids + ] # (ntok,) + + start = 0 + for l in batch_inputs.req_lens_list: + nll += -token_logprobs[start : start + l].sum().item() + start += l + total_len += token_logprobs.numel() + + for task in tasks: + task.release_kvcache() + + return math.exp(nll / total_len) + + def destroy_model_instance(self): + self.jiuge_model.destroy_model(self.model_instance) + print("Model destroyed") + + +def test(): + cfg = 
BaseTestConfig() + + # 2. 【关键】统一从 cfg 对象中提取属性 + model_path = cfg.model_path + device_type = cfg.device_type + ndev = cfg.ndev + verbose = cfg.verbose + + # 打印出来确认一下,确保输出逻辑和变量对应 + print(f"DEBUG: path={model_path}") + print(f"DEBUG: device={device_type}") # 这里应该打印出类似 DeviceType.DEVICE_TYPE_NVIDIA + print(f"DEBUG: ndev={ndev}") + print(f"DEBUG: verbose={verbose}") + print(type(model_path)) + print(device_type) + print(ndev) + print(verbose) + model = JiugeForCauslLM(model_path, device_type, ndev) + model.generate("山东最高的山是?", 500, verbose=verbose) + model.destroy_model_instance() + + +if __name__ == "__main__": + test() diff --git a/scripts/jiuge_ppl_config.py b/scripts/jiuge_ppl_config.py new file mode 100644 index 00000000..d93ab55f --- /dev/null +++ b/scripts/jiuge_ppl_config.py @@ -0,0 +1,122 @@ +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from datasets import load_dataset +from jiuge import JiugeForCauslLM +from libinfinicore_infer import DeviceType + + +from base_config import BaseTestConfig +cfg = BaseTestConfig() + +# DEVICE_TYPE_MAP = { +# "cpu": DeviceType.DEVICE_TYPE_CPU, +# "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, +# "qy": DeviceType.DEVICE_TYPE_QY, +# "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, +# "ascend": DeviceType.DEVICE_TYPE_ASCEND, +# "metax": DeviceType.DEVICE_TYPE_METAX, +# "moore": DeviceType.DEVICE_TYPE_MOORE, +# "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, +# "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, +# "hygon": DeviceType.DEVICE_TYPE_HYGON, +# } + +TORCH_DEVICE_TYPE_MAP = { + "cpu": "cpu", + "nvidia": "cuda", + "qy": "cuda", + "cambricon": "mlu", + "ascend": "npu", + "metax": "cuda", + "moore": "cuda", + "iluvatar": "cuda", + "kunlun": "cuda", + "hygon": "cuda", +} + + +def test_torch(input_ids_list, ): + device = TORCH_DEVICE_TYPE_MAP[cfg.device_name] + model = AutoModelForCausalLM.from_pretrained(cfg.model_path, trust_remote_code=True).to( + device + ) + model.eval() + + total_neg_log_likelihood = 0 + 
total_tokens = 0 + + with torch.no_grad(): + for input_ids in input_ids_list: + input_ids = torch.tensor(input_ids, device=device) + # shift inputs and labels + inputs = input_ids[:-1].unsqueeze(0) # [1, seq_len-1] + labels = input_ids[1:].unsqueeze(0) # [1, seq_len-1] + + outputs = model(inputs, use_cache=False) + logits = outputs.logits # [1, seq_len-1, vocab_size] + + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) + # gather log probs of true tokens + true_token_log_probs = log_probs.gather( + dim=-1, index=labels.unsqueeze(-1) + ).squeeze(-1) + + total_neg_log_likelihood += -true_token_log_probs.sum().item() + total_tokens += labels.numel() + + perplexity = torch.exp(torch.tensor(total_neg_log_likelihood / total_tokens)) + return perplexity + + +def test_infinicore(input_ids_list, device_, ndev_): + device = cfg.device_type + + model = JiugeForCauslLM( + cfg.model_path, device, max_tokens=len(input_ids_list[0]), ndev=ndev_ + ) + perplexity = model.perplexity(input_ids_list) + model.destroy_model_instance() + return perplexity + + +if __name__ == "__main__": + + # parser = argparse.ArgumentParser() + # parser.add_argument("--model-path", type=str, required=True) + # parser.add_argument( + # "--dev", type=str, default="cpu", choices=DEVICE_TYPE_MAP.keys() + # ) + # parser.add_argument( + # "--ndev", + # type=int, + # default=1, + # help="Number of devices to use (default: 1)", + # ) + # args = parser.parse_args() + + seq_len = 512 + + # model_path = args.model_path + tokenizer = AutoTokenizer.from_pretrained(cfg.model_path, trust_remote_code=True) + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + texts = dataset["text"] + texts = [t.strip() for t in texts if len(t.strip()) > 0] + + input_ids_list = [] + for text in texts: + ids = tokenizer.encode(text) + # split long sequences into chunks + for i in range(0, len(ids) - seq_len + 1, seq_len): + input_ids_list.append(ids[i : i + seq_len]) + + # perplexity = 
test_infinicore(input_ids_list, args.dev, args.ndev) + perplexity = test_infinicore(input_ids_list, cfg.device_type, cfg.ndev) + print(f"InfiniCore Perplexity: {perplexity:.2f}") + + # if args.ndev == 1: # Todo: support multi-device testing with torch + # perplexity = test_torch(input_ids_list, args.dev) + # print(f"Torch Perplexity: {perplexity.item():.2f}") + if cfg.ndev == 1: # Todo: support multi-device testing with torch + perplexity = test_torch(input_ids_list) + print(f"Torch Perplexity: {perplexity.item():.2f}") diff --git a/scripts/launch_server_config.py b/scripts/launch_server_config.py new file mode 100644 index 00000000..aac5af4f --- /dev/null +++ b/scripts/launch_server_config.py @@ -0,0 +1,302 @@ +from jiuge import JiugeForCauslLM +from jiuge_awq import JiugeAWQForCausalLM +from libinfinicore_infer import DeviceType +from infer_task import InferTask +from kvcache_pool import KVCachePool + +import argparse +import queue +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse, JSONResponse +import contextlib +import uvicorn +import time +import uuid +import json +import threading +import janus + +from base_config import BaseTestConfig + +class launch_config(BaseTestConfig): + def __init__(self): + super().__init__() + # 创建专门的 parser 解析 launch_server 特有参数 + self.launch_parser = argparse.ArgumentParser() + self._add_launch_args() + self.launch_args = self.launch_parser.parse_known_args()[0] + + # 设置 launch_server 特有参数 + self.awq = self.launch_args.awq + self.max_batch = self.launch_args.max_batch + self.max_tokens = self.launch_args.max_tokens + + def _add_launch_args(self): + """添加 launch_server 特有的参数""" + self.launch_parser.add_argument( + "--awq", + action="store_true", + default=False, + help="Whether to use AWQ quantized model (default: False)", + ) + self.launch_parser.add_argument( + "--max-batch", + type=int, + default=3, + help="Maximum number of requests that can be batched together (default: 3)", + ) + 
self.launch_parser.add_argument( + "--max-tokens", + type=int, + required=False, + default=None, + help="Max token sequence length that model will handle (follows model config if not provided)", + ) + + +cfg = launch_config() + +# 使用 cfg 对象获取配置 +device_type = cfg.device_type +model_path = cfg.model_path +ndev = cfg.ndev +max_tokens = cfg.max_tokens +USE_AWQ = cfg.awq +MAX_BATCH = cfg.max_batch +print( + f"Using MAX_BATCH={MAX_BATCH}. Try reduce this value if out of memory error occurs." +) + + +def chunk_json(id_, content=None, role=None, finish_reason=None): + delta = {} + if content: + delta["content"] = content + if role: + delta["role"] = role + return { + "id": id_, + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": "jiuge", + "system_fingerprint": None, + "choices": [ + { + "index": 0, + "text": content, + "delta": delta, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + } + + +# A wrapper for InferTask that supports async output queue +class AsyncInferTask(InferTask): + def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens): + super().__init__(id, tokens, max_tokens, temperature, topk, topp, end_tokens) + self.output_queue = janus.Queue() + print(f"[INFO] Create InferTask {self.id}") + + def output(self, out_token): + self.next(out_token) + self.output_queue.sync_q.put(out_token) + + +@contextlib.asynccontextmanager +async def lifespan(app: FastAPI): + # Startup + if USE_AWQ: + app.state.model = JiugeAWQForCausalLM( + model_path, device_type, ndev, max_tokens=max_tokens + ) + else: + app.state.model = JiugeForCauslLM( + model_path, device_type, ndev, max_tokens=max_tokens + ) + app.state.kv_cache_pool = KVCachePool(app.state.model, MAX_BATCH) + app.state.request_queue = janus.Queue() + worker_thread = threading.Thread(target=worker_loop, args=(app,), daemon=True) + worker_thread.start() + + try: + yield # The app runs here + finally: + # Shutdown + app.state.request_queue.sync_q.put(None) + 
worker_thread.join() + app.state.request_queue.shutdown() + + app.state.kv_cache_pool.finalize() + app.state.model.destroy_model_instance() + + +App = FastAPI(lifespan=lifespan) + + +# App loop: take requests from the queue, do inference, and put unfinished requests back into the queue. +def worker_loop(app): + while True: + try: + task = app.state.request_queue.sync_q.get(timeout=0.01) + except queue.Empty: + continue + + if task is None: + return + + batch = [task] + while len(batch) < MAX_BATCH: + try: + req = app.state.request_queue.sync_q.get_nowait() + if req is not None: + batch.append(req) + except queue.Empty: + break + output_tokens = app.state.model.batch_infer_one_round(batch) + for task, token in zip(batch, output_tokens): + task.output(token) + if task.finish_reason is None: + app.state.request_queue.sync_q.put(task) + else: + print(f"[INFO] Task {task.id} finished infer.") + app.state.kv_cache_pool.release_sync(task) + + +def build_task(id_, request_data, request: Request): + messages = request_data.get("messages", []) + input_content = request.app.state.model.tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=False, + ) + tokens = request.app.state.model.tokenizer.encode(input_content) + return AsyncInferTask( + id_, + tokens, + request_data.get("max_tokens", request.app.state.model.max_context_len()), + request_data.get("temperature", 1.0), + request_data.get("top_k", 1), + request_data.get("top_p", 1.0), + request.app.state.model.eos_token_id, + ) + + +async def chat_stream(id_, request_data, request: Request): + try: + infer_task = build_task(id_, request_data, request) + await request.app.state.kv_cache_pool.acquire(infer_task) + + # Initial empty content + chunk = json.dumps( + chunk_json(id_, content="", role="assistant"), ensure_ascii=False + ) + yield f"data: {chunk}\n\n" + + request.app.state.request_queue.sync_q.put(infer_task) + + while True: + if await request.is_disconnected(): + 
print("Client disconnected. Aborting stream.") + break + if ( + infer_task.finish_reason is not None + and infer_task.output_queue.async_q.empty() + ): + chunk = json.dumps( + chunk_json(id_, finish_reason=infer_task.finish_reason), + ensure_ascii=False, + ) + yield f"data: {chunk}\n\n" + break + + token = await infer_task.output_queue.async_q.get() + content = request.app.state.model.tokenizer.decode(token) + + chunk = json.dumps(chunk_json(id_, content=content), ensure_ascii=False) + yield f"data: {chunk}\n\n" + + except Exception as e: + print(f"[Error] ID : {id_} Exception: {e}") + finally: + if infer_task.finish_reason is None: + infer_task.finish_reason = "cancel" + + +async def chat(id_, request_data, request: Request): + try: + infer_task = build_task(id_, request_data, request) + await request.app.state.kv_cache_pool.acquire(infer_task) + request.app.state.request_queue.sync_q.put(infer_task) + output = [] + while True: + if ( + infer_task.finish_reason is not None + and infer_task.output_queue.async_q.empty() + ): + break + + token = await infer_task.output_queue.async_q.get() + content = request.app.state.model.tokenizer.decode(token) + output.append(content) + + output_text = "".join(output).strip() + response = chunk_json( + id_, + content=output_text, + role="assistant", + finish_reason=infer_task.finish_reason or "stop", + ) + return response + + except Exception as e: + print(f"[Error] ID: {id_} Exception: {e}") + return JSONResponse(content={"error": str(e)}, status_code=500) + finally: + if infer_task.finish_reason is None: + infer_task.finish_reason = "cancel" + + +@App.post("/chat/completions") +async def chat_completions(request: Request): + data = await request.json() + print('-----------------------------------------') + print(data) + print('-----------------------------------------') + + if not data.get("messages"): + if not data.get("prompt"): + return JSONResponse(content={"error": "No message provided"}, status_code=400) + else: + 
data['messages'] = [{"role": "user", "content": data.get("prompt")}] + + stream = data.get("stream", False) + id_ = f"cmpl-{uuid.uuid4().hex}" + if stream: + return StreamingResponse( + chat_stream(id_, data, request), media_type="text/event-stream" + ) + else: + response = await chat(id_, data, request) + return JSONResponse(content=response) + + +if __name__ == "__main__": + uvicorn.run(App, host="0.0.0.0", port=8000) + +""" +curl -N -H "Content-Type: application/json" \ + -X POST http://127.0.0.1:8000/chat/completions \ + -d '{ + "model": "jiuge", + "messages": [ + {"role": "user", "content": "山东最高的山是?"} + ], + "temperature": 1.0, + "top_k": 50, + "top_p": 0.8, + "max_tokens": 512, + "stream": true + }' +"""