diff --git a/docs/base_config_usage.md b/docs/base_config_usage.md new file mode 100644 index 00000000..53d05c06 --- /dev/null +++ b/docs/base_config_usage.md @@ -0,0 +1,135 @@ +# BaseTestConfig 使用文档 + +## 概述 + +`BaseTestConfig` 是 InfiniLM 项目的统一配置基类,为各个测试脚本提供通用的命令行参数解析和配置管理功能。 + +## 特性 + +- **统一参数管理**: 提供所有测试脚本共用的基础参数 +- **设备类型映射**: 支持多种硬件设备的自动类型转换 +- **灵活扩展**: 使用 `parse_known_args()` 容错处理,允许脚本添加特定参数 +- **类型安全**: 自动将设备字符串转换为对应的 `DeviceType` 枚举 + +## 支持的设备类型 + +| 设备名称 | DeviceType 枚举值 | +|---------|-----------------| +| `cpu` | `DEVICE_TYPE_CPU` | +| `nvidia` | `DEVICE_TYPE_NVIDIA` | +| `qy` | `DEVICE_TYPE_QY` | +| `cambricon` | `DEVICE_TYPE_CAMBRICON` | +| `ascend` | `DEVICE_TYPE_ASCEND` | +| `metax` | `DEVICE_TYPE_METAX` | +| `moore` | `DEVICE_TYPE_MOORE` | +| `iluvatar` | `DEVICE_TYPE_ILUVATAR` | +| `kunlun` | `DEVICE_TYPE_KUNLUN` | +| `hygon` | `DEVICE_TYPE_HYGON` | + +## 通用参数说明 + +| 参数 | 类型 | 是否必需 | 默认值 | 说明 | +|------|------|---------|--------|------| +| `--model_path` | str | ✓ 是 | - | 模型文件路径 | +| `--device` | str | 否 | `cpu` | 目标设备类型(见上表) | +| `--ndev` | int | 否 | `1` | 使用的设备数量 | +| `--verbose` | flag | 否 | `False` | 启用详细输出模式 | + +## 基本使用 + +### 1. 直接使用(测试) + +```bash +python scripts/base_config.py --model_path /path/to/model +``` + +### 2. 
在脚本中继承使用 + +```python +from base_config import BaseTestConfig + +class MyTestConfig(BaseTestConfig): + def __init__(self): + super().__init__() + # 添加脚本特定参数 + self.parser.add_argument("--my_param", type=int, default=10) + self.my_param = self.args.my_param + +# 使用 +cfg = MyTestConfig() +print(f"模型路径: {cfg.model_path}") +print(f"设备类型: {cfg.device_type}") +print(f"自定义参数: {cfg.my_param}") +``` + +## 命令行示例 + +### 基础用法 +```bash +python your_script.py --model_path ./models/llama2 +``` + +### 使用 NVIDIA GPU +```bash +python your_script.py --model_path ./models/llama2 --device nvidia --ndev 2 +``` + +### 使用 QY 设备并启用详细输出 +```bash +python your_script.py --model_path ./models/llama2 --device qy --verbose +``` + +### 结合自定义参数 +```bash +python your_script.py --model_path ./models/llama2 --device nvidia --ndev 4 --batch_size 32 +``` + +## 类属性说明 + +初始化后可访问的属性: + +- `model_path` (str): 模型路径 +- `ndev` (int): 设备数量 +- `verbose` (bool): 详细输出开关 +- `device_name` (str): 设备名称(原始输入) +- `device_type` (DeviceType): 设备类型枚举值 +- `args` (Namespace): 解析后的参数命名空间 +- `extra` (list): 未解析的额外参数 + + +## 扩展指南 + +### 添加新的设备类型 + +修改 `_get_device_type` 方法中的 `DEVICE_TYPE_MAP`: + +```python +DEVICE_TYPE_MAP = { + # ... 现有映射 ... + "new_device": DeviceType.DEVICE_TYPE_NEW, +} +``` + +### 添加新的通用参数 + +修改 `_add_common_args` 方法: + +```python +def _add_common_args(self): + # ... 现有参数 ... + self.parser.add_argument("--new_param", type=str, default="default") +``` + +## 注意事项 + +1. **参数顺序**: 命令行参数顺序不影响解析结果 +2. **类型转换**: `--ndev` 等整数参数会自动验证类型 +3. **参数覆盖**: 后出现的参数会覆盖前面的同名参数 +4. 
**帮助信息**: 使用 `--help` 查看所有可用参数 + +## 相关文件 + +- `scripts/jiuge_config.py`: 九格评测配置 +- `scripts/jiuge_ppl_config.py`: 九格 PPL 配置 +- `scripts/jiuge.py`: 九格评测主脚本 + diff --git a/jiuge.sh b/jiuge.sh new file mode 100644 index 00000000..c186fc7c --- /dev/null +++ b/jiuge.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Jiuge模型运行脚本 +# 使用NVIDIA显卡运行9G4B模型 + +set -e # 遇到错误立即退出 + +echo "==========================================" +echo "🚀 启动 Jiuge 模型 (9G4B) - NVIDIA版本" +echo "==========================================" +export INFINI_ROOT=/home/featurize/.infini +# 设置参数 +MODEL_DIR="/home/featurize/work/InfiniFamily/9G4B" +DEVICE="--nvidia" +N_DEVICE=1 +SCRIPT_PATH="python scripts/jiuge.py" + +# 检查模型目录是否存在 +if [ ! -d "$MODEL_DIR" ]; then + echo "❌ 错误: 模型目录不存在: $MODEL_DIR" + echo "请检查路径是否正确" + exit 1 +fi + +# 检查Python脚本是否存在 +if [ ! -f "scripts/jiuge.py" ]; then + echo "❌ 错误: 未找到jiuge.py脚本: scripts/jiuge.py" + echo "请确保在当前目录下运行此脚本" + exit 1 +fi + +echo "📁 模型路径: $MODEL_DIR" +echo "🎯 设备类型: NVIDIA GPU" +echo "💻 设备数量: $N_DEVICE" +echo "" + +# 运行模型 +echo "🔄 启动模型..." 
+$SCRIPT_PATH $DEVICE $MODEL_DIR $N_DEVICE + +echo "" +echo "==========================================" +echo "✅ 模型运行完成" +echo "==========================================" \ No newline at end of file diff --git a/scripts/base_config.py b/scripts/base_config.py new file mode 100644 index 00000000..ab100211 --- /dev/null +++ b/scripts/base_config.py @@ -0,0 +1,53 @@ +import argparse +import sys +from libinfinicore_infer import DeviceType + + + + +class BaseTestConfig: + def __init__(self): + self.parser = argparse.ArgumentParser(description="InfiniLM Unified Config") + self._add_common_args() + + # 核心:使用 parse_known_args() 容忍脚本特有参数 + # args 存储解析好的命名空间,extra 存储未识别的参数 + self.args, self.extra = self.parser.parse_known_args() + + self.model_path = self.args.model_path + self.ndev = self.args.ndev + self.verbose = self.args.verbose + + self.device_name = self.args.device + self.device_type = self._get_device_type(self.args.device) + + + + def _add_common_args(self): + + self.parser.add_argument("--device", type=str, default="cpu") + self.parser.add_argument("--model_path", type=str, required=True) + self.parser.add_argument("--ndev", type=int, default=1) + self.parser.add_argument("--verbose", action="store_true") + + + def _get_device_type(self, dev_str): + DEVICE_TYPE_MAP = { + "cpu": DeviceType.DEVICE_TYPE_CPU, + "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, + "qy": DeviceType.DEVICE_TYPE_QY, + "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, + "ascend": DeviceType.DEVICE_TYPE_ASCEND, + "metax": DeviceType.DEVICE_TYPE_METAX, + "moore": DeviceType.DEVICE_TYPE_MOORE, + "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, + "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, + "hygon": DeviceType.DEVICE_TYPE_HYGON, + } + + return DEVICE_TYPE_MAP.get(dev_str.lower(), DeviceType.DEVICE_TYPE_CPU) + +if __name__ == '__main__': + cfg = BaseTestConfig() + print(cfg.model_path) + print(cfg.ndev) \ No newline at end of file diff --git a/scripts/jiuge.py b/scripts/jiuge.py index 35b2c8ca..f73fd199 100644 
--- a/scripts/jiuge.py +++ b/scripts/jiuge.py @@ -871,7 +871,8 @@ def test(): # Find n_device argument (skip --verbose) ndev_args = [arg for arg in sys.argv[3:] if arg != "--verbose"] ndev = int(ndev_args[0]) if ndev_args else 1 - + print("type is") + print(type(device_type)) model = JiugeForCauslLM(model_path, device_type, ndev) model.generate("山东最高的山是?", 500, verbose=verbose) model.destroy_model_instance() diff --git a/scripts/jiuge_config.py b/scripts/jiuge_config.py new file mode 100644 index 00000000..7f2708f2 --- /dev/null +++ b/scripts/jiuge_config.py @@ -0,0 +1,849 @@ +from typing import List, Sequence +import math +import os +from pathlib import Path +import safetensors +import sys +import time +import json +import torch +import transformers +from base_config import BaseTestConfig +from libinfinicore_infer import ( + JiugeModel, + JiugeMetaCStruct, + JiugeWeightsCStruct, + DataType, + DeviceType, + KVCacheCStruct, +) +from infer_task import InferTask, KVCache + +from ctypes import POINTER, c_float, c_int, c_uint, c_void_p, byref + +torch.set_default_device("cpu") + + +class LlamaWeightsNaming: + def input_embd(self): + return "model.embed_tokens.weight" + + def output_norm(self): + return "model.norm.weight" + + def output_embd(self): + return "lm_head.weight" + + def attn_norm(self, i): + return f"model.layers.{i}.input_layernorm.weight" + + def attn_q(self, i): + return f"model.layers.{i}.self_attn.q_proj.weight" + + def attn_k(self, i): + return f"model.layers.{i}.self_attn.k_proj.weight" + + def attn_v(self, i): + return f"model.layers.{i}.self_attn.v_proj.weight" + + def attn_o(self, i): + return f"model.layers.{i}.self_attn.o_proj.weight" + + def attn_q_b(self, i): + return f"model.layers.{i}.self_attn.q_proj.bias" + + def attn_k_b(self, i): + return f"model.layers.{i}.self_attn.k_proj.bias" + + def attn_v_b(self, i): + return f"model.layers.{i}.self_attn.v_proj.bias" + + def attn_q_norm(self, i): + return f"model.layers.{i}.self_attn.q_norm.weight" 
+ + def attn_k_norm(self, i): + return f"model.layers.{i}.self_attn.k_norm.weight" + + def ffn_norm(self, i): + return f"model.layers.{i}.post_attention_layernorm.weight" + + def gate(self, i): + return f"model.layers.{i}.mlp.gate_proj.weight" + + def up(self, i): + return f"model.layers.{i}.mlp.up_proj.weight" + + def down(self, i): + return f"model.layers.{i}.mlp.down_proj.weight" + + def match(state_dict): + return ( + "model.norm.weight" in state_dict + and "model.layers.0.self_attn.q_proj.weight" in state_dict + ) + + +class JiugeMetaFromLlama(JiugeMetaCStruct): + def __init__(self, config, dtype=torch.float16, max_tokens=None): + if dtype == torch.float16: + dt_ = DataType.INFINI_DTYPE_F16 + elif dtype == torch.float32: + dt_ = DataType.INFINI_DTYPE_F32 + elif dtype == torch.bfloat16: + dt_ = DataType.INFINI_DTYPE_BF16 + else: + dt_ = DataType.INFINI_DTYPE_F16 + + self.scale_input = 1.0 + self.scale_output = 1.0 + self.scale_o = 1.0 + self.scale_down = 1.0 + if ( + config["model_type"] in ["fm9g", "minicpm"] + and "scale_emb" in config + and "scale_depth" in config + and "dim_model_base" in config + ): + self.scale_input = config["scale_emb"] + self.scale_output = config["hidden_size"] // config["dim_model_base"] + self.scale_o = config["scale_depth"] / math.sqrt( + config["num_hidden_layers"] + ) + self.scale_down = config["scale_depth"] / math.sqrt( + config["num_hidden_layers"] + ) + + super().__init__( + dt_logits=dt_, + nlayer=config["num_hidden_layers"], + d=config["hidden_size"], + nh=config["num_attention_heads"], + nkvh=( + config["num_key_value_heads"] + if "num_key_value_heads" in config + else config["num_attention_heads"] + ), + dh=( + config["head_dim"] + if "head_dim" in config + else config["hidden_size"] // config["num_attention_heads"] + ), + di=config["intermediate_size"], + dctx=( + config["max_position_embeddings"] if max_tokens is None else max_tokens + ), + dvoc=config["vocab_size"], + epsilon=config["rms_norm_eps"], + 
theta=(config["rope_theta"] if "rope_theta" in config else 100000.0), + end_token=2, + ) + self.torch_dtype_logits = dtype + + +class JiugeWeightsImpl(JiugeWeightsCStruct): + def __init__( + self, + meta, + naming, + state_dict, + torch_dt_mat=torch.float16, + torch_dt_norm=torch.float32, + ndev=1, + transpose_weight=True, + ): + nlayer = meta.nlayer + nh = meta.nh + nkvh = meta.nkvh + dh = meta.dh + d = meta.d + di = meta.di + scale_input = meta.scale_input + scale_output = meta.scale_output + scale_o = meta.scale_o + scale_down = meta.scale_down + assert nh % nkvh == 0 + assert nh % ndev == 0 + assert nkvh % ndev == 0 + assert di % ndev == 0 + torch_dt_logits = meta.torch_dtype_logits + if torch_dt_mat == torch.float16: + self.dt_mat = DataType.INFINI_DTYPE_F16 + elif torch_dt_mat == torch.float32: + self.dt_mat = DataType.INFINI_DTYPE_F32 + elif torch_dt_mat == torch.bfloat16: + self.dt_mat = DataType.INFINI_DTYPE_BF16 + else: + raise ValueError("Unsupported proj weight data type") + if torch_dt_norm == torch.float16: + self.dt_norm = DataType.INFINI_DTYPE_F16 + elif torch_dt_norm == torch.float32: + self.dt_norm = DataType.INFINI_DTYPE_F32 + elif torch_dt_norm == torch.bfloat16: + self.dt_norm = DataType.INFINI_DTYPE_BF16 + else: + raise ValueError("Unsupported norm weight data type") + + input_embd_naming = ( + naming.input_embd() + if naming.input_embd() in state_dict + else naming.output_embd() + ) + output_embd_naming = ( + naming.output_embd() + if naming.output_embd() in state_dict + else naming.input_embd() + ) + self.transpose_linear_weights = 1 if transpose_weight else 0 + self.nlayer = nlayer + self.input_embd_tensor = ( + state_dict[input_embd_naming].to(torch_dt_logits) * scale_input + ) + self.input_embd = self.input_embd_tensor.data_ptr() + self.output_norm_tensor = ( + state_dict[naming.output_norm()].to(torch_dt_norm) * scale_output + ) + self.output_norm = self.output_norm_tensor.data_ptr() + self.output_embd_tensor = 
state_dict[output_embd_naming].to(torch_dt_mat) + if not transpose_weight: + self.output_embd_tensor = self.output_embd_tensor.transpose( + 0, 1 + ).contiguous() + self.output_embd = self.output_embd_tensor.data_ptr() + + self.attn_norm_tensors = [ + state_dict[naming.attn_norm(i)].to(torch_dt_norm) for i in range(nlayer) + ] + self.attn_norm_ptrs = [ + self.attn_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_norm = (c_void_p * nlayer)(*self.attn_norm_ptrs) + + def qkv_slices(_i): + _Q = ( + state_dict[naming.attn_q(_i)] + .reshape([nh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _K = ( + state_dict[naming.attn_k(_i)] + .reshape([nkvh, 2, dh // 2, d]) + .transpose(1, 2) + ) + _V = state_dict[naming.attn_v(_i)].reshape([nkvh, dh // 2, 2, d]) + _result = [] + _nh = nh // ndev + _nkvh = nkvh // ndev + for _idev in range(ndev): + _result.append(_Q[_idev * _nh : (_idev + 1) * _nh, :, :, :]) + _result.append(_K[_idev * _nkvh : (_idev + 1) * _nkvh, :, :, :]) + _result.append(_V[_idev * _nkvh : (_idev + 1) * _nkvh, :, :]) + return _result + + self.qkv_tensor = [ + torch.concat(qkv_slices(i)).to(torch_dt_mat) for i in range(nlayer) + ] + if not transpose_weight: + for i in range(nlayer): + self.qkv_tensor[i] = ( + self.qkv_tensor[i] + .reshape(ndev, (nh + 2 * nkvh) // ndev * dh, d) + .transpose(1, 2) + .contiguous() + ) + self.qkv_tensor_ptrs = [self.qkv_tensor[i].data_ptr() for i in range(nlayer)] + self.attn_qkv = (c_void_p * nlayer)(*self.qkv_tensor_ptrs) + + def qkv_b_slices(_i): + _QB = ( + state_dict[naming.attn_q_b(_i)] + .reshape([nh, 2, dh // 2]) + .transpose(1, 2) + ) + _KB = ( + state_dict[naming.attn_k_b(_i)] + .reshape([nkvh, 2, dh // 2]) + .transpose(1, 2) + ) + _VB = state_dict[naming.attn_v_b(_i)].reshape([nkvh, dh // 2, 2]) + _result = [] + _nh = nh // ndev + _nkvh = nkvh // ndev + for _idev in range(ndev): + _result.append(_QB[_idev * _nh : (_idev + 1) * _nh, :, :].flatten()) + _result.append(_KB[_idev * _nkvh : (_idev + 1) * _nkvh, :, 
:].flatten()) + _result.append(_VB[_idev * _nkvh : (_idev + 1) * _nkvh, :, :].flatten()) + return _result + + if naming.attn_q_b(0) in state_dict: + self.qkv_b_tensors = [ + torch.concat(qkv_b_slices(i)).to(torch_dt_logits) for i in range(nlayer) + ] + self.qkv_b_tensor_ptrs = [ + self.qkv_b_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_qkv_b = (c_void_p * nlayer)(*self.qkv_b_tensor_ptrs) + else: + self.attn_qkv_b = None + + if naming.attn_q_norm(0) in state_dict: + self.attn_q_norm_tensors = [ + state_dict[naming.attn_q_norm(i)] + .reshape([2, dh // 2]) + .transpose(0, 1) + .contiguous() + .to(torch_dt_norm) + for i in range(nlayer) + ] + self.attn_q_norm_ptrs = [ + self.attn_q_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_q_norm = (c_void_p * nlayer)(*self.attn_q_norm_ptrs) + self.attn_k_norm_tensors = [ + state_dict[naming.attn_k_norm(i)] + .reshape([2, dh // 2]) + .transpose(0, 1) + .contiguous() + .to(torch_dt_norm) + for i in range(nlayer) + ] + self.attn_k_norm_ptrs = [ + self.attn_k_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.attn_k_norm = (c_void_p * nlayer)(*self.attn_k_norm_ptrs) + else: + self.attn_q_norm = None + self.attn_k_norm = None + + self.attn_o_tensor = [ + ( + state_dict[naming.attn_o(i)] + .to(torch_dt_mat) + .reshape([d, ndev, nh // ndev * dh]) + .transpose(0, 1) + .contiguous() + if transpose_weight + else state_dict[naming.attn_o(i)] + .transpose(0, 1) + .to(torch_dt_mat) + .contiguous() + ) + * scale_o + for i in range(nlayer) + ] + self.attn_o_ptrs = [self.attn_o_tensor[i].data_ptr() for i in range(nlayer)] + self.attn_o = (c_void_p * nlayer)(*self.attn_o_ptrs) + + self.ffn_norm_tensors = [ + state_dict[naming.ffn_norm(i)].to(torch_dt_norm) for i in range(nlayer) + ] + self.ffn_norm_ptrs = [ + self.ffn_norm_tensors[i].data_ptr() for i in range(nlayer) + ] + self.ffn_norm = (c_void_p * nlayer)(*self.ffn_norm_ptrs) + + def gate_up_slices(_i): + _result = [] + _di = di // ndev + for _idev 
in range(ndev): + _start = _idev * _di + _end = (_idev + 1) * _di + _result.append(state_dict[naming.gate(_i)][_start:_end, :]) + _result.append(state_dict[naming.up(_i)][_start:_end, :]) + return _result + + self.gate_up_tensors = [ + torch.concat(gate_up_slices(i)).to(torch_dt_mat) for i in range(nlayer) + ] + if not transpose_weight: + for i in range(nlayer): + self.gate_up_tensors[i] = ( + self.gate_up_tensors[i] + .reshape(ndev, 2 * di // ndev, d) + .transpose(1, 2) + .contiguous() + ) + self.gate_up_ptrs = [self.gate_up_tensors[i].data_ptr() for i in range(nlayer)] + self.ffn_gate_up = (c_void_p * nlayer)(*self.gate_up_ptrs) + + self.ffn_down_tensor = [ + ( + state_dict[naming.down(i)] + .to(torch_dt_mat) + .reshape([d, ndev, di // ndev]) + .transpose(0, 1) + .contiguous() + if transpose_weight + else state_dict[naming.down(i)] + .transpose(0, 1) + .to(torch_dt_mat) + .contiguous() + ) + * scale_down + for i in range(nlayer) + ] + self.ffn_down_ptrs = [self.ffn_down_tensor[i].data_ptr() for i in range(nlayer)] + self.ffn_down = (c_void_p * nlayer)(*self.ffn_down_ptrs) + + +class JiugeBatchedTask: + def __init__(self, tasks: List[InferTask]): + self.tasks = tasks + self.nreq = len(tasks) + + # Precompute fields + token_lists = [t.tokens for t in tasks] + self.req_lens_list = [len(toks) for toks in token_lists] + self.req_pos_list = [t.pos for t in tasks] + self.kv_cache_ptrs = [t.kvcache().data() for t in tasks] + self.temperaturas_list = [t.temperature for t in tasks] + self.topks_list = [t.topk for t in tasks] + self.topps_list = [t.topp for t in tasks] + + # Flatten token lists + flat_tokens = [tok for toks in token_lists for tok in toks] + self.ntok = len(flat_tokens) + + # Convert to ctypes arrays in one pass + self.tokens = (c_uint * self.ntok)(*flat_tokens) + self.req_lens = (c_uint * self.nreq)(*self.req_lens_list) + self.req_pos = (c_uint * self.nreq)(*self.req_pos_list) + self.kv_caches = (POINTER(KVCacheCStruct) * self.nreq)(*self.kv_cache_ptrs) + 
self.temperaturas = (c_float * self.nreq)(*self.temperaturas_list) + self.topks = (c_uint * self.nreq)(*self.topks_list) + self.topps = (c_float * self.nreq)(*self.topps_list) + + def input_args(self): + return ( + self.tokens, + self.ntok, + self.req_lens, + self.nreq, + self.req_pos, + self.kv_caches, + self.temperaturas, + self.topks, + self.topps, + ) + + +class JiugeForCauslLM: + def __init__( + self, model_dir_path, device=DeviceType.DEVICE_TYPE_CPU, ndev=1, max_tokens=None + ): + def load_all_safetensors_from_dir(dir_path_: str): + tensors_ = {} + dir_path_ = Path(dir_path_) + for file in sorted(dir_path_.glob("*.safetensors")): + data_ = safetensors.safe_open(file, "pt") + for name_ in data_.keys(): + tensors_[name_] = data_.get_tensor(name_) + return tensors_ + + print("Loading model weights to host...") + load_start_time = time.time() + + with open(os.path.join(model_dir_path, "config.json"), "r") as f: + config = json.load(f) + self.config = config + eos_token_id = self.config["eos_token_id"] + self.eos_token_id = ( + [eos_token_id] if type(eos_token_id) == int else eos_token_id + ) + transpose_weight = ( + device != DeviceType.DEVICE_TYPE_ASCEND + ) # y = xW is faster than y=xW^T on Ascend + + self.jiuge_model = JiugeModel() + + if "llama" == config["model_type"]: + model = ( + transformers.LlamaForCausalLM.from_pretrained(model_dir_path) + .cpu() + .half() + ) + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + model.state_dict(), + ndev=ndev, + transpose_weight=transpose_weight, + ) + elif "fm9g" == config["model_type"] or "minicpm" == config["model_type"]: + if any( + file.suffix == ".safetensors" for file in Path(model_dir_path).iterdir() + ): + state_dict = load_all_safetensors_from_dir(model_dir_path) + else: + state_dict = torch.load( + os.path.join(model_dir_path, 
"pytorch_model.bin"), + weights_only=True, + map_location="cpu", + ) + if LlamaWeightsNaming.match(state_dict): + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + else: + raise ValueError("Unsupported weight naming") + elif "fm9g7b" == config["model_type"]: + if any( + file.suffix == ".safetensors" for file in Path(model_dir_path).iterdir() + ): + state_dict = load_all_safetensors_from_dir(model_dir_path) + else: + state_dict = torch.load( + os.path.join(model_dir_path, "pytorch_model.bin"), + weights_only=True, + map_location="cpu", + ) + if LlamaWeightsNaming.match(state_dict): + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path, trust_remote_code=True + ) + else: + raise ValueError("Unsupported weight naming") + elif "qwen2" == config["model_type"] or "qwen3" == config["model_type"]: + state_dict = load_all_safetensors_from_dir(model_dir_path) + if LlamaWeightsNaming.match(state_dict): + self.meta = JiugeMetaFromLlama(config, max_tokens=max_tokens) + self.weights = JiugeWeightsImpl( + self.meta, + LlamaWeightsNaming(), + state_dict, + ndev=ndev, + transpose_weight=transpose_weight, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + model_dir_path + ) + else: + raise ValueError("Unsupported model architecture") + + if "llama" == config["model_type"]: + from tokenizers import decoders as _dec + + backend = getattr(self.tokenizer, "backend_tokenizer", None) + target = getattr(backend, "_tokenizer", backend) + norm = getattr(target, "normalizer", None) + dec = 
getattr(target, "decoder", None) + sn = repr(norm)[:800] if norm is not None else "" + sd = repr(dec)[:800] if dec is not None else "" + has_prepend = "Prepend" in sn + has_strip = "Strip" in sd + if has_prepend and has_strip: + target.decoder = _dec.Sequence( + [ + _dec.Replace("▁", " "), + _dec.ByteFallback(), + _dec.Fuse(), + ] + ) + + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + print(f"Creating model on {ndev} devices...") + load_start_time = time.time() + self.dev_ids = (c_int * ndev)(*[i for i in range(ndev)]) + self.ndev = ndev + self.device = device + + self.model_instance = self.jiuge_model.create_model( + byref(self.meta), + byref(self.weights), + device, + ndev, + self.dev_ids, + ) + load_end_time = time.time() + print(f"Time used: {load_end_time - load_start_time:.3f}s") + + def max_context_len(self): + return self.meta.dctx + + def create_kv_cache(self): + return self.jiuge_model.create_kv_cache( + self.meta.nlayer, + self.meta.dctx, + self.meta.nkvh, + self.meta.dh, + self.meta.dh, + self.meta.dt_logits, + self.device, + self.dev_ids, + self.ndev, + ) + + def drop_kv_cache(self, kv_cache): + self.jiuge_model.drop_kv_cache(kv_cache) + + def batch_infer_one_round(self, tasks: List[InferTask]): + output = (c_uint * len(tasks))() + batch_inputs = JiugeBatchedTask(tasks) + self.jiuge_model.infer_batch( + self.model_instance, + *(batch_inputs.input_args()), + output, + ) + return list(output) + + def generate( + self, + input_content, + max_steps, + topp_=1.0, + topk_=1, + temperature_=1.0, + verbose=False, + ): + input_content = self.tokenizer.apply_chat_template( + conversation=[{"role": "user", "content": input_content}], + add_generation_prompt=True, + tokenize=False, + ) + print(input_content, end="", flush=True) + tokens = self.tokenizer.encode(input_content) + infer_task = InferTask( + 0, + tokens, + self.max_context_len(), + temperature_, + topk_, + topp_, + self.eos_token_id, + ) + 
infer_task.bind_kvcache(KVCache(self)) + + steps = 0 + total_time = 0 + prefill_time = 0 + decode_time = 0 + output_content = "" + + # Prefill phase - process initial prompt + prefill_start_time = time.time() + output_tokens = self.batch_infer_one_round([infer_task]) + prefill_end_time = time.time() + prefill_time = prefill_end_time - prefill_start_time + steps += 1 + + output_str = self.tokenizer.decode(output_tokens[0]) + output_content += output_str + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + # If generation ends after prefill, calculate metrics + total_time = prefill_time + total_tokens = len(tokens) + 1 # input tokens + first output token + + print("\n") + print(f"Time per step: {total_time * 1000:.3f}ms") + + if verbose: + overall_throughput = total_tokens / total_time + prefill_throughput = len(tokens) / prefill_time + decode_throughput = 1 / 0.001 # Avoid division by zero, use small value + + print("=" * 50) + print("PERFORMANCE METRICS") + print("=" * 50) + print(f"Input tokens: {len(tokens)}") + print(f"Generated tokens: 1") + print(f"Total tokens: {total_tokens}") + print(f"Total time: {total_time * 1000:.3f}ms") + print(f"Prefill time: {prefill_time * 1000:.3f}ms") + print(f"Decode time: 0.000ms") + print("-" * 50) + print(f"Time per step: {total_time * 1000:.3f}ms") + print( + f"Avg prefill time per token: {prefill_time * 1000 / len(tokens):.3f}ms" + ) + print(f"Avg decode time per token: N/A") + print("-" * 50) + print(f"Overall throughput: {overall_throughput:.2f} tokens/s") + print(f"Prefill throughput: {prefill_throughput:.2f} tokens/s") + print(f"Decode throughput: N/A") + print("=" * 50) + + return output_content, total_time * 1000 + + infer_task.next(output_tokens[0]) + + # Decode phase - generate subsequent tokens + decode_start_time = time.time() + for step_i in range(1, max_steps): + start_time = time.time() + output_tokens = self.batch_infer_one_round([infer_task]) + end_time = time.time() + steps += 
1 + output_str = self.tokenizer.decode(output_tokens[0]) + + output_content += output_str + print(output_str, end="", flush=True) + if output_tokens[0] in self.eos_token_id: + break + infer_task.next(output_tokens[0]) + + if step_i > 0: + total_time += end_time - start_time + + decode_end_time = time.time() + decode_time = decode_end_time - decode_start_time + + print("\n") + + # Calculate performance metrics + total_time = prefill_time + decode_time + input_tokens = len(tokens) + generated_tokens = steps # including first token from prefill + + # Time per token calculations + avg_time_per_step = ( + total_time * 1000 / (steps - 1) if steps > 1 else total_time * 1000 + ) + + print(f"Time per step: {avg_time_per_step:.3f}ms") + + # Only print detailed metrics if verbose flag is set + if verbose: + total_tokens = input_tokens + generated_tokens + + # Throughput calculations + overall_throughput = total_tokens / total_time # tokens per second + prefill_throughput = input_tokens / prefill_time if prefill_time > 0 else 0 + decode_throughput = ( + (generated_tokens - 1) / decode_time if decode_time > 0 else 0 + ) # exclude first token from prefill + + # Time per token calculations + avg_prefill_time_per_token = ( + prefill_time * 1000 / input_tokens if input_tokens > 0 else 0 + ) + avg_decode_time_per_token = ( + decode_time * 1000 / (generated_tokens - 1) + if generated_tokens > 1 + else 0 + ) + + print("=" * 50) + print("PERFORMANCE METRICS") + print("=" * 50) + print(f"Input tokens: {input_tokens}") + print(f"Generated tokens: {generated_tokens}") + print(f"Total tokens: {total_tokens}") + print(f"Total time: {total_time * 1000:.3f}ms") + print(f"Prefill time: {prefill_time * 1000:.3f}ms") + print(f"Decode time: {decode_time * 1000:.3f}ms") + print("-" * 50) + print(f"Time per step: {avg_time_per_step:.3f}ms") + print(f"Avg prefill time per token: {avg_prefill_time_per_token:.3f}ms") + print(f"Avg decode time per token: {avg_decode_time_per_token:.3f}ms") + print("-" 
* 50) + print(f"Overall throughput: {overall_throughput:.2f} tokens/s") + print(f"Prefill throughput: {prefill_throughput:.2f} tokens/s") + print(f"Decode throughput: {decode_throughput:.2f} tokens/s") + print("=" * 50) + + infer_task._kv_cache.drop(self) + return output_content, avg_time_per_step + + def perplexity(self, test_sequences: List[Sequence[int]], batch_size=10): + tasks = [ + InferTask(i, [], self.max_context_len(), 1.0, 1, 1.0, self.eos_token_id) + for i in range(batch_size) + ] + kv_caches = [KVCache(self) for _ in range(batch_size)] + + nll = 0.0 + total_len = 0 + + for i in range(0, len(test_sequences), batch_size): + batch_id = 0 + true_tokens = [] + while batch_id < batch_size and batch_id + i < len(test_sequences): + input_tokens = test_sequences[i + batch_id][:-1] + true_tokens.extend(test_sequences[i + batch_id][1:]) + tasks[batch_id].tokens = input_tokens + tasks[batch_id].bind_kvcache(kv_caches[batch_id]) + batch_id += 1 + + batch_inputs = JiugeBatchedTask(tasks[:batch_id]) + logits = torch.zeros( + (batch_inputs.ntok, self.meta.dvoc), dtype=self.meta.torch_dtype_logits + ) + self.jiuge_model.forward_batch( + self.model_instance, + batch_inputs.tokens, + batch_inputs.ntok, + batch_inputs.req_lens, + batch_inputs.nreq, + batch_inputs.req_pos, + batch_inputs.kv_caches, + logits.data_ptr(), + ) + + logits = logits.float() + token_ids = torch.tensor(true_tokens, dtype=torch.int64) # [ntok,] + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) # (ntok, vocab) + token_logprobs = log_probs[ + torch.arange(batch_inputs.ntok), token_ids + ] # (ntok,) + + start = 0 + for l in batch_inputs.req_lens_list: + nll += -token_logprobs[start : start + l].sum().item() + start += l + total_len += token_logprobs.numel() + + for task in tasks: + task.release_kvcache() + + return math.exp(nll / total_len) + + def destroy_model_instance(self): + self.jiuge_model.destroy_model(self.model_instance) + print("Model destroyed") + + +def test(): + cfg = 
BaseTestConfig() + + # 2. 【关键】统一从 cfg 对象中提取属性 + model_path = cfg.model_path + device_type = cfg.device_type + ndev = cfg.ndev + verbose = cfg.verbose + + # 打印出来确认一下,确保输出逻辑和变量对应 + print(f"DEBUG: path={model_path}") + print(f"DEBUG: device={device_type}") # 这里应该打印出类似 DeviceType.DEVICE_TYPE_NVIDIA + print(f"DEBUG: ndev={ndev}") + print(f"DEBUG: verbose={verbose}") + print(type(model_path)) + print(device_type) + print(ndev) + print(verbose) + model = JiugeForCauslLM(model_path, device_type, ndev) + model.generate("山东最高的山是?", 500, verbose=verbose) + model.destroy_model_instance() + + +if __name__ == "__main__": + test() diff --git a/scripts/jiuge_ppl_config.py b/scripts/jiuge_ppl_config.py new file mode 100644 index 00000000..d93ab55f --- /dev/null +++ b/scripts/jiuge_ppl_config.py @@ -0,0 +1,122 @@ +import torch +from transformers import AutoTokenizer, AutoModelForCausalLM +from datasets import load_dataset +from jiuge import JiugeForCauslLM +from libinfinicore_infer import DeviceType + + +from base_config import BaseTestConfig +cfg = BaseTestConfig() + +# DEVICE_TYPE_MAP = { +# "cpu": DeviceType.DEVICE_TYPE_CPU, +# "nvidia": DeviceType.DEVICE_TYPE_NVIDIA, +# "qy": DeviceType.DEVICE_TYPE_QY, +# "cambricon": DeviceType.DEVICE_TYPE_CAMBRICON, +# "ascend": DeviceType.DEVICE_TYPE_ASCEND, +# "metax": DeviceType.DEVICE_TYPE_METAX, +# "moore": DeviceType.DEVICE_TYPE_MOORE, +# "iluvatar": DeviceType.DEVICE_TYPE_ILUVATAR, +# "kunlun": DeviceType.DEVICE_TYPE_KUNLUN, +# "hygon": DeviceType.DEVICE_TYPE_HYGON, +# } + +TORCH_DEVICE_TYPE_MAP = { + "cpu": "cpu", + "nvidia": "cuda", + "qy": "cuda", + "cambricon": "mlu", + "ascend": "npu", + "metax": "cuda", + "moore": "cuda", + "iluvatar": "cuda", + "kunlun": "cuda", + "hygon": "cuda", +} + + +def test_torch(input_ids_list, ): + device = TORCH_DEVICE_TYPE_MAP[cfg.device_name] + model = AutoModelForCausalLM.from_pretrained(cfg.model_path, trust_remote_code=True).to( + device + ) + model.eval() + + total_neg_log_likelihood = 0 + 
total_tokens = 0 + + with torch.no_grad(): + for input_ids in input_ids_list: + input_ids = torch.tensor(input_ids, device=device) + # shift inputs and labels + inputs = input_ids[:-1].unsqueeze(0) # [1, seq_len-1] + labels = input_ids[1:].unsqueeze(0) # [1, seq_len-1] + + outputs = model(inputs, use_cache=False) + logits = outputs.logits # [1, seq_len-1, vocab_size] + + log_probs = torch.nn.functional.log_softmax(logits, dim=-1) + # gather log probs of true tokens + true_token_log_probs = log_probs.gather( + dim=-1, index=labels.unsqueeze(-1) + ).squeeze(-1) + + total_neg_log_likelihood += -true_token_log_probs.sum().item() + total_tokens += labels.numel() + + perplexity = torch.exp(torch.tensor(total_neg_log_likelihood / total_tokens)) + return perplexity + + +def test_infinicore(input_ids_list, device_, ndev_): + device = cfg.device_type + + model = JiugeForCauslLM( + cfg.model_path, device, max_tokens=len(input_ids_list[0]), ndev=ndev_ + ) + perplexity = model.perplexity(input_ids_list) + model.destroy_model_instance() + return perplexity + + +if __name__ == "__main__": + + # parser = argparse.ArgumentParser() + # parser.add_argument("--model-path", type=str, required=True) + # parser.add_argument( + # "--dev", type=str, default="cpu", choices=DEVICE_TYPE_MAP.keys() + # ) + # parser.add_argument( + # "--ndev", + # type=int, + # default=1, + # help="Number of devices to use (default: 1)", + # ) + # args = parser.parse_args() + + seq_len = 512 + + # model_path = args.model_path + tokenizer = AutoTokenizer.from_pretrained(cfg.model_path, trust_remote_code=True) + dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") + + texts = dataset["text"] + texts = [t.strip() for t in texts if len(t.strip()) > 0] + + input_ids_list = [] + for text in texts: + ids = tokenizer.encode(text) + # split long sequences into chunks + for i in range(0, len(ids) - seq_len + 1, seq_len): + input_ids_list.append(ids[i : i + seq_len]) + + # perplexity = 
test_infinicore(input_ids_list, args.dev, args.ndev) + perplexity = test_infinicore(input_ids_list, cfg.device_type, cfg.ndev) + print(f"InfiniCore Perplexity: {perplexity:.2f}") + + # if args.ndev == 1: # Todo: support multi-device testing with torch + # perplexity = test_torch(input_ids_list, args.dev) + # print(f"Torch Perplexity: {perplexity.item():.2f}") + if cfg.ndev == 1: # Todo: support multi-device testing with torch + perplexity = test_torch(input_ids_list) + print(f"Torch Perplexity: {perplexity.item():.2f}") diff --git a/scripts/launch_server_config.py b/scripts/launch_server_config.py new file mode 100644 index 00000000..aac5af4f --- /dev/null +++ b/scripts/launch_server_config.py @@ -0,0 +1,302 @@ +from jiuge import JiugeForCauslLM +from jiuge_awq import JiugeAWQForCausalLM +from libinfinicore_infer import DeviceType +from infer_task import InferTask +from kvcache_pool import KVCachePool + +import argparse +import queue +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse, JSONResponse +import contextlib +import uvicorn +import time +import uuid +import json +import threading +import janus + +from base_config import BaseTestConfig + +class launch_config(BaseTestConfig): + def __init__(self): + super().__init__() + # 创建专门的 parser 解析 launch_server 特有参数 + self.launch_parser = argparse.ArgumentParser() + self._add_launch_args() + self.launch_args = self.launch_parser.parse_known_args()[0] + + # 设置 launch_server 特有参数 + self.awq = self.launch_args.awq + self.max_batch = self.launch_args.max_batch + self.max_tokens = self.launch_args.max_tokens + + def _add_launch_args(self): + """添加 launch_server 特有的参数""" + self.launch_parser.add_argument( + "--awq", + action="store_true", + default=False, + help="Whether to use AWQ quantized model (default: False)", + ) + self.launch_parser.add_argument( + "--max-batch", + type=int, + default=3, + help="Maximum number of requests that can be batched together (default: 3)", + ) + 
self.launch_parser.add_argument( + "--max-tokens", + type=int, + required=False, + default=None, + help="Max token sequence length that model will handle (follows model config if not provided)", + ) + + +cfg = launch_config() + +# 使用 cfg 对象获取配置 +device_type = cfg.device_type +model_path = cfg.model_path +ndev = cfg.ndev +max_tokens = cfg.max_tokens +USE_AWQ = cfg.awq +MAX_BATCH = cfg.max_batch +print( + f"Using MAX_BATCH={MAX_BATCH}. Try reduce this value if out of memory error occurs." +) + + +def chunk_json(id_, content=None, role=None, finish_reason=None): + delta = {} + if content: + delta["content"] = content + if role: + delta["role"] = role + return { + "id": id_, + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": "jiuge", + "system_fingerprint": None, + "choices": [ + { + "index": 0, + "text": content, + "delta": delta, + "logprobs": None, + "finish_reason": finish_reason, + } + ], + } + + +# A wrapper for InferTask that supports async output queue +class AsyncInferTask(InferTask): + def __init__(self, id, tokens, max_tokens, temperature, topk, topp, end_tokens): + super().__init__(id, tokens, max_tokens, temperature, topk, topp, end_tokens) + self.output_queue = janus.Queue() + print(f"[INFO] Create InferTask {self.id}") + + def output(self, out_token): + self.next(out_token) + self.output_queue.sync_q.put(out_token) + + +@contextlib.asynccontextmanager +async def lifespan(app: FastAPI): + # Startup + if USE_AWQ: + app.state.model = JiugeAWQForCausalLM( + model_path, device_type, ndev, max_tokens=max_tokens + ) + else: + app.state.model = JiugeForCauslLM( + model_path, device_type, ndev, max_tokens=max_tokens + ) + app.state.kv_cache_pool = KVCachePool(app.state.model, MAX_BATCH) + app.state.request_queue = janus.Queue() + worker_thread = threading.Thread(target=worker_loop, args=(app,), daemon=True) + worker_thread.start() + + try: + yield # The app runs here + finally: + # Shutdown + app.state.request_queue.sync_q.put(None) + 
worker_thread.join() + app.state.request_queue.shutdown() + + app.state.kv_cache_pool.finalize() + app.state.model.destroy_model_instance() + + +App = FastAPI(lifespan=lifespan) + + +# App loop: take requests from the queue, do inference, and put unfinished requests back into the queue. +def worker_loop(app): + while True: + try: + task = app.state.request_queue.sync_q.get(timeout=0.01) + except queue.Empty: + continue + + if task is None: + return + + batch = [task] + while len(batch) < MAX_BATCH: + try: + req = app.state.request_queue.sync_q.get_nowait() + if req is not None: + batch.append(req) + except queue.Empty: + break + output_tokens = app.state.model.batch_infer_one_round(batch) + for task, token in zip(batch, output_tokens): + task.output(token) + if task.finish_reason is None: + app.state.request_queue.sync_q.put(task) + else: + print(f"[INFO] Task {task.id} finished infer.") + app.state.kv_cache_pool.release_sync(task) + + +def build_task(id_, request_data, request: Request): + messages = request_data.get("messages", []) + input_content = request.app.state.model.tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=False, + ) + tokens = request.app.state.model.tokenizer.encode(input_content) + return AsyncInferTask( + id_, + tokens, + request_data.get("max_tokens", request.app.state.model.max_context_len()), + request_data.get("temperature", 1.0), + request_data.get("top_k", 1), + request_data.get("top_p", 1.0), + request.app.state.model.eos_token_id, + ) + + +async def chat_stream(id_, request_data, request: Request): + try: + infer_task = build_task(id_, request_data, request) + await request.app.state.kv_cache_pool.acquire(infer_task) + + # Initial empty content + chunk = json.dumps( + chunk_json(id_, content="", role="assistant"), ensure_ascii=False + ) + yield f"data: {chunk}\n\n" + + request.app.state.request_queue.sync_q.put(infer_task) + + while True: + if await request.is_disconnected(): + 
print("Client disconnected. Aborting stream.") + break + if ( + infer_task.finish_reason is not None + and infer_task.output_queue.async_q.empty() + ): + chunk = json.dumps( + chunk_json(id_, finish_reason=infer_task.finish_reason), + ensure_ascii=False, + ) + yield f"data: {chunk}\n\n" + break + + token = await infer_task.output_queue.async_q.get() + content = request.app.state.model.tokenizer.decode(token) + + chunk = json.dumps(chunk_json(id_, content=content), ensure_ascii=False) + yield f"data: {chunk}\n\n" + + except Exception as e: + print(f"[Error] ID : {id_} Exception: {e}") + finally: + if infer_task.finish_reason is None: + infer_task.finish_reason = "cancel" + + +async def chat(id_, request_data, request: Request): + try: + infer_task = build_task(id_, request_data, request) + await request.app.state.kv_cache_pool.acquire(infer_task) + request.app.state.request_queue.sync_q.put(infer_task) + output = [] + while True: + if ( + infer_task.finish_reason is not None + and infer_task.output_queue.async_q.empty() + ): + break + + token = await infer_task.output_queue.async_q.get() + content = request.app.state.model.tokenizer.decode(token) + output.append(content) + + output_text = "".join(output).strip() + response = chunk_json( + id_, + content=output_text, + role="assistant", + finish_reason=infer_task.finish_reason or "stop", + ) + return response + + except Exception as e: + print(f"[Error] ID: {id_} Exception: {e}") + return JSONResponse(content={"error": str(e)}, status_code=500) + finally: + if infer_task.finish_reason is None: + infer_task.finish_reason = "cancel" + + +@App.post("/chat/completions") +async def chat_completions(request: Request): + data = await request.json() + print('-----------------------------------------') + print(data) + print('-----------------------------------------') + + if not data.get("messages"): + if not data.get("prompt"): + return JSONResponse(content={"error": "No message provided"}, status_code=400) + else: + 
data['messages'] = [{"role": "user", "content": data.get("prompt")}] + + stream = data.get("stream", False) + id_ = f"cmpl-{uuid.uuid4().hex}" + if stream: + return StreamingResponse( + chat_stream(id_, data, request), media_type="text/event-stream" + ) + else: + response = await chat(id_, data, request) + return JSONResponse(content=response) + + +if __name__ == "__main__": + uvicorn.run(App, host="0.0.0.0", port=8000) + +""" +curl -N -H "Content-Type: application/json" \ + -X POST http://127.0.0.1:8000/chat/completions \ + -d '{ + "model": "jiuge", + "messages": [ + {"role": "user", "content": "山东最高的山是?"} + ], + "temperature": 1.0, + "top_k": 50, + "top_p": 0.8, + "max_tokens": 512, + "stream": true + }' +"""