diff --git a/DeepSeek-R1-Distill-Qwen-1.5B b/DeepSeek-R1-Distill-Qwen-1.5B new file mode 160000 index 00000000..ad9f0ae0 --- /dev/null +++ b/DeepSeek-R1-Distill-Qwen-1.5B @@ -0,0 +1 @@ +Subproject commit ad9f0ae0864d7fbcd1cd905e3c6c5b069cc8b562 diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..c4dd10d9 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -38,5 +38,7 @@ __C { __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + + __export void llaisysQwen2ModelResetCache(struct LlaisysQwen2Model * model); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..29d57c4c 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,6 +12,8 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops +from .models import load_models +from .models import LlaisysQwen2Meta, LlaisysQwen2Weights, llaisysQwen2Model_t def load_shared_library(): @@ -38,6 +40,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_models(LIB_LLAISYS) __all__ = [ @@ -52,4 +55,7 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights", + "llaisysQwen2Model_t", ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 00000000..fe625dc8 --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,74 @@ +from ctypes import POINTER, c_void_p, c_size_t, c_int64, c_int, c_float, Structure +from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t +from .tensor import llaisysTensor_t + + +class LlaisysQwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + + +class LlaisysQwen2Weights(Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", POINTER(llaisysTensor_t)), + ("attn_q_w", POINTER(llaisysTensor_t)), + ("attn_q_b", POINTER(llaisysTensor_t)), + ("attn_k_w", POINTER(llaisysTensor_t)), + ("attn_k_b", POINTER(llaisysTensor_t)), + ("attn_v_w", POINTER(llaisysTensor_t)), + ("attn_v_b", POINTER(llaisysTensor_t)), + ("attn_o_w", POINTER(llaisysTensor_t)), + ("mlp_norm_w", POINTER(llaisysTensor_t)), + ("mlp_gate_w", POINTER(llaisysTensor_t)), + ("mlp_up_w", POINTER(llaisysTensor_t)), + ("mlp_down_w", POINTER(llaisysTensor_t)), + ] + + +llaisysQwen2Model_t = c_void_p + + +def load_models(lib): + # llaisysQwen2ModelCreate + lib.llaisysQwen2ModelCreate.argtypes = [ + POINTER(LlaisysQwen2Meta), + llaisysDeviceType_t, + POINTER(c_int), + c_int, + ] + lib.llaisysQwen2ModelCreate.restype = llaisysQwen2Model_t + + # llaisysQwen2ModelDestroy + lib.llaisysQwen2ModelDestroy.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelDestroy.restype = None + + # llaisysQwen2ModelWeights + lib.llaisysQwen2ModelWeights.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelWeights.restype = POINTER(LlaisysQwen2Weights) + + # llaisysQwen2ModelInfer + lib.llaisysQwen2ModelInfer.argtypes = [ + llaisysQwen2Model_t, + POINTER(c_int64), + c_size_t, + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 + + # llaisysQwen2ModelResetCache + lib.llaisysQwen2ModelResetCache.argtypes = [llaisysQwen2Model_t] + lib.llaisysQwen2ModelResetCache.restype = None diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..e6890bb8 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,244 @@ from typing import Sequence from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType +from ..libllaisys import DeviceType, DataType +from ..libllaisys import LlaisysQwen2Meta, llaisysQwen2Model_t +from ..tensor import Tensor from pathlib import Path -import safetensors +import json +import numpy as np +from ctypes import c_int64, c_size_t, c_int, c_float, pointer, byref, POINTER, cast +import struct class Qwen2: - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - model_path = Path(model_path) - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + # 读取配置文件 + config_path = model_path / "config.json" + with open(config_path, "r") as f: + config = json.load(f) + + # 提取模型参数 + self.hidden_size = config["hidden_size"] + self.num_hidden_layers = config["num_hidden_layers"] + self.num_attention_heads = config["num_attention_heads"] + self.num_key_value_heads = config.get("num_key_value_heads", self.num_attention_heads) + self.intermediate_size = config["intermediate_size"] + self.vocab_size = config["vocab_size"] + self.rms_norm_eps = config.get("rms_norm_eps", 1e-6) + self.rope_theta = config.get("rope_theta", 10000.0) + self.max_position_embeddings = config.get("max_position_embeddings", 131072) + + # 计算每头维度 + self.head_dim = self.hidden_size // self.num_attention_heads + + # 确定数据类型 + torch_dtype = config.get("torch_dtype", "bfloat16") + if torch_dtype == "bfloat16": + self.dtype = DataType.BF16 + elif torch_dtype == "float16": + self.dtype = DataType.F16 + else: + self.dtype = DataType.F32 + + # 创建模型元数据 + meta = LlaisysQwen2Meta() + meta.dtype = self.dtype + meta.nlayer = self.num_hidden_layers + meta.hs = self.hidden_size + meta.nh = self.num_attention_heads + meta.nkvh = self.num_key_value_heads + meta.dh = self.head_dim + meta.di = self.intermediate_size + meta.maxseq = self.max_position_embeddings + meta.voc = self.vocab_size + meta.epsilon = self.rms_norm_eps + meta.theta = self.rope_theta + meta.end_token = config.get("eos_token_id", 151643) + + # 创建设备ID数组 + device_ids = (c_int * 1)(0) + + # 创建模型 + self.model = LIB_LLAISYS.llaisysQwen2ModelCreate( + byref(meta), + device, + device_ids, + 1 + ) + + if not self.model: + raise RuntimeError("Failed to create Qwen2 model") + + # 获取权重结构 + weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self.model) + self.weights = weights_ptr.contents + + # 加载权重 + self._load_weights(model_path) + + # 存储设备信息 + self.device = device + + # 保存结束token + self.end_token = meta.end_token + + def _load_weights(self, model_path: Path): + """加载模型权重""" + + # 收集所有safetensors文件 + safetensor_files = sorted(model_path.glob("*.safetensors")) + + # 逐个文件加载权重 + for file in safetensor_files: + self._load_weights_from_file(file) + + def _load_weights_from_file(self, file_path: Path): + """从单个safetensors文件加载权重""" + # 读取文件头 + with open(file_path, 'rb') as f: + # 读取长度前缀 (8 bytes, little-endian unsigned long long) + length_bytes = f.read(8) + header_len = struct.unpack(' +#include +#include + +using namespace llaisys; + +// Qwen2模型结构 +struct LlaisysQwen2Model { + LlaisysQwen2Meta meta; + llaisysDeviceType_t device_type; + int device_id; + + // 权重 + LlaisysQwen2Weights weights; + + // 中间张量(用于重用内存) + tensor_t hidden_states; + tensor_t residual; + tensor_t q_proj; + tensor_t k_proj; + tensor_t v_proj; + tensor_t o_proj; + tensor_t q_rotated; + tensor_t k_rotated; + tensor_t attn_output; + tensor_t gate_proj; + tensor_t up_proj; + tensor_t mlp_output; + tensor_t logits; + tensor_t max_val; + tensor_t max_idx; + + // KV Cache + std::vector k_cache; + std::vector v_cache; + + // 位置ID张量 + tensor_t pos_ids; + + // KV Cache状态 + size_t cache_pos; // 当前cache位置(已缓存的token数) + size_t total_len; // 总token数(用于注意力计算) +}; + +// 辅助函数:从llaisysTensor_t获取tensor_t +inline tensor_t get_tensor(llaisysTensor_t t) { + return t ? t->tensor : nullptr; +} + +// 创建模型 +__C struct LlaisysQwen2Model *llaisysQwen2ModelCreate( + const LlaisysQwen2Meta *meta, + llaisysDeviceType_t device, + int *device_ids, + int ndevice) { + + auto *model = new LlaisysQwen2Model(); + model->meta = *meta; + model->device_type = device; + model->device_id = ndevice > 0 ? device_ids[0] : 0; + + size_t hs = meta->hs; + size_t maxseq = meta->maxseq; + + // 创建中间张量 + model->hidden_states = Tensor::create({maxseq, hs}, meta->dtype, device, model->device_id); + model->residual = Tensor::create({maxseq, hs}, meta->dtype, device, model->device_id); + + // QKV投影输出 [maxseq, nh * dh] 和 [maxseq, nkvh * dh] + size_t q_size = meta->nh * meta->dh; + size_t kv_size = meta->nkvh * meta->dh; + model->q_proj = Tensor::create({maxseq, q_size}, meta->dtype, device, model->device_id); + model->k_proj = Tensor::create({maxseq, kv_size}, meta->dtype, device, model->device_id); + model->v_proj = Tensor::create({maxseq, kv_size}, meta->dtype, device, model->device_id); + + // 旋转后的QK [maxseq, nh, dh] 和 [maxseq, nkvh, dh] + model->q_rotated = Tensor::create({maxseq, meta->nh, meta->dh}, meta->dtype, device, model->device_id); + model->k_rotated = Tensor::create({maxseq, meta->nkvh, meta->dh}, meta->dtype, device, model->device_id); + + // 注意力输出 [maxseq, nh, dh] + model->attn_output = Tensor::create({maxseq, meta->nh, meta->dh}, meta->dtype, device, model->device_id); + + // O投影输出 [maxseq, hs] + model->o_proj = Tensor::create({maxseq, hs}, meta->dtype, device, model->device_id); + + // MLP中间张量 + model->gate_proj = Tensor::create({maxseq, meta->di}, meta->dtype, device, model->device_id); + model->up_proj = Tensor::create({maxseq, meta->di}, meta->dtype, device, model->device_id); + model->mlp_output = Tensor::create({maxseq, meta->di}, meta->dtype, device, model->device_id); + + // 输出logits [maxseq, voc] + model->logits = Tensor::create({maxseq, meta->voc}, meta->dtype, device, model->device_id); + + // argmax输出 + model->max_val = Tensor::create({1}, meta->dtype, device, model->device_id); + model->max_idx = Tensor::create({1}, LLAISYS_DTYPE_I64, device, model->device_id); + + // 创建权重张量 + // Embedding权重 [voc, hs] + model->weights.in_embed = new LlaisysTensor{Tensor::create({meta->voc, hs}, meta->dtype, device, model->device_id)}; + model->weights.out_embed = new LlaisysTensor{Tensor::create({meta->voc, hs}, meta->dtype, device, model->device_id)}; + + // 最终归一化权重 [hs] + model->weights.out_norm_w = new LlaisysTensor{Tensor::create({hs}, meta->dtype, device, model->device_id)}; + + // 分配每层权重指针数组 + model->weights.attn_norm_w = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_q_w = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_q_b = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_k_w = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_k_b = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_v_w = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_v_b = new llaisysTensor_t[meta->nlayer](); + model->weights.attn_o_w = new llaisysTensor_t[meta->nlayer](); + model->weights.mlp_norm_w = new llaisysTensor_t[meta->nlayer](); + model->weights.mlp_gate_w = new llaisysTensor_t[meta->nlayer](); + model->weights.mlp_up_w = new llaisysTensor_t[meta->nlayer](); + model->weights.mlp_down_w = new llaisysTensor_t[meta->nlayer](); + + // 创建每层权重张量 + for (size_t i = 0; i < meta->nlayer; i++) { + // Attention归一化权重 [hs] + model->weights.attn_norm_w[i] = new LlaisysTensor{Tensor::create({hs}, meta->dtype, device, model->device_id)}; + + // Q投影权重 [nh*dh, hs] 和偏置 [nh*dh] + model->weights.attn_q_w[i] = new LlaisysTensor{Tensor::create({meta->nh * meta->dh, hs}, meta->dtype, device, model->device_id)}; + model->weights.attn_q_b[i] = new LlaisysTensor{Tensor::create({meta->nh * meta->dh}, meta->dtype, device, model->device_id)}; + + // K投影权重 [nkvh*dh, hs] 和偏置 [nkvh*dh] + model->weights.attn_k_w[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh, hs}, meta->dtype, device, model->device_id)}; + model->weights.attn_k_b[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh}, meta->dtype, device, model->device_id)}; + + // V投影权重 [nkvh*dh, hs] 和偏置 [nkvh*dh] + model->weights.attn_v_w[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh, hs}, meta->dtype, device, model->device_id)}; + model->weights.attn_v_b[i] = new LlaisysTensor{Tensor::create({meta->nkvh * meta->dh}, meta->dtype, device, model->device_id)}; + + // O投影权重 [hs, nh*dh] + model->weights.attn_o_w[i] = new LlaisysTensor{Tensor::create({hs, meta->nh * meta->dh}, meta->dtype, device, model->device_id)}; + + // MLP归一化权重 [hs] + model->weights.mlp_norm_w[i] = new LlaisysTensor{Tensor::create({hs}, meta->dtype, device, model->device_id)}; + + // Gate投影权重 [di, hs] + model->weights.mlp_gate_w[i] = new LlaisysTensor{Tensor::create({meta->di, hs}, meta->dtype, device, model->device_id)}; + + // Up投影权重 [di, hs] + model->weights.mlp_up_w[i] = new LlaisysTensor{Tensor::create({meta->di, hs}, meta->dtype, device, model->device_id)}; + + // Down投影权重 [hs, di] + model->weights.mlp_down_w[i] = new LlaisysTensor{Tensor::create({hs, meta->di}, meta->dtype, device, model->device_id)}; + } + + // 初始化KV Cache + model->k_cache.resize(meta->nlayer); + model->v_cache.resize(meta->nlayer); + for (size_t i = 0; i < meta->nlayer; i++) { + model->k_cache[i] = Tensor::create({maxseq, meta->nkvh, meta->dh}, meta->dtype, device, model->device_id); + model->v_cache[i] = Tensor::create({maxseq, meta->nkvh, meta->dh}, meta->dtype, device, model->device_id); + } + + // 位置ID张量 + model->pos_ids = Tensor::create({maxseq}, LLAISYS_DTYPE_I64, device, model->device_id); + + // 初始化KV Cache状态 + model->cache_pos = 0; + model->total_len = 0; + + return model; +} + +// 销毁模型 +__C void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model *model) { + if (!model) return; + + // 释放权重张量 + delete model->weights.in_embed; + delete model->weights.out_embed; + delete model->weights.out_norm_w; + + // 释放每层权重张量 + for (size_t i = 0; i < model->meta.nlayer; i++) { + delete model->weights.attn_norm_w[i]; + delete model->weights.attn_q_w[i]; + delete model->weights.attn_q_b[i]; + delete model->weights.attn_k_w[i]; + delete model->weights.attn_k_b[i]; + delete model->weights.attn_v_w[i]; + delete model->weights.attn_v_b[i]; + delete model->weights.attn_o_w[i]; + delete model->weights.mlp_norm_w[i]; + delete model->weights.mlp_gate_w[i]; + delete model->weights.mlp_up_w[i]; + delete model->weights.mlp_down_w[i]; + } + + // 释放权重指针数组 + delete[] model->weights.attn_norm_w; + delete[] model->weights.attn_q_w; + delete[] model->weights.attn_q_b; + delete[] model->weights.attn_k_w; + delete[] model->weights.attn_k_b; + delete[] model->weights.attn_v_w; + delete[] model->weights.attn_v_b; + delete[] model->weights.attn_o_w; + delete[] model->weights.mlp_norm_w; + delete[] model->weights.mlp_gate_w; + delete[] model->weights.mlp_up_w; + delete[] model->weights.mlp_down_w; + + delete model; +} + +// 获取模型权重 +__C struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model *model) { + return &model->weights; +} + +// 辅助函数:复制张量数据到KV Cache +static void copy_to_kv_cache(tensor_t cache, tensor_t src, size_t start_pos, size_t len) { + // src: [len, nkvh, dh] + // cache: [maxseq, nkvh, dh] + // 将src复制到cache的[start_pos, start_pos+len)位置 + + auto cache_slice = cache->slice(0, start_pos, start_pos + len); + + // 获取数据指针 + std::byte *cache_data = cache_slice->data(); + const std::byte *src_data = src->data(); + + size_t bytes = len * src->shape()[1] * src->shape()[2] * src->elementSize(); + std::memcpy(cache_data, src_data, bytes); +} + +// 模型推理 - 支持KV Cache的增量推理 +// 当use_cache为true时,只处理新token(ntoken应为1),利用KV Cache +// 当use_cache为false时,处理所有token(首次调用或重置时) +__C int64_t llaisysQwen2ModelInfer( + struct LlaisysQwen2Model *model, + int64_t *token_ids, + size_t ntoken) { + + const auto &meta = model->meta; + size_t nh = meta.nh; + size_t nkvh = meta.nkvh; + size_t dh = meta.dh; + size_t voc = meta.voc; + size_t nlayer = meta.nlayer; + float epsilon = meta.epsilon; + float theta = meta.theta; + + // 判断是否使用KV Cache + bool use_cache = model->cache_pos > 0; + size_t start_pos = use_cache ? model->cache_pos : 0; + size_t total_len = start_pos + ntoken; + + // 使用hidden_states的[start_pos, total_len)行作为当前处理的hidden + auto hidden = model->hidden_states->slice(0, start_pos, total_len); + + // 1. Embedding(只对新token做embedding) + auto token_ids_tensor = Tensor::create({ntoken}, LLAISYS_DTYPE_I64, model->device_type, model->device_id); + token_ids_tensor->load(token_ids); + ops::embedding(hidden, token_ids_tensor, get_tensor(model->weights.in_embed)); + + // 设置位置ID(使用实际位置) + auto pos_ids_slice = model->pos_ids->slice(0, start_pos, total_len); + std::vector pos_ids_data(ntoken); + for (size_t i = 0; i < ntoken; i++) { + pos_ids_data[i] = static_cast(start_pos + i); + } + pos_ids_slice->load(pos_ids_data.data()); + + // Transformer层 + for (size_t layer = 0; layer < nlayer; layer++) { + // 保存当前hidden到residual张量(用于残差连接) + // 需要将hidden_states[start_pos:total_len]复制到residual[start_pos:total_len] + auto residual_slice = model->residual->slice(0, start_pos, total_len); + std::memcpy(residual_slice->data(), hidden->data(), + ntoken * meta.hs * hidden->elementSize()); + + // 1. RMS Norm (input_layernorm) + ops::rms_norm(hidden, hidden, get_tensor(model->weights.attn_norm_w[layer]), epsilon); + + // 2. QKV投影 + auto q_proj_slice = model->q_proj->slice(0, start_pos, total_len); + auto k_proj_slice = model->k_proj->slice(0, start_pos, total_len); + auto v_proj_slice = model->v_proj->slice(0, start_pos, total_len); + + ops::linear(q_proj_slice, hidden, get_tensor(model->weights.attn_q_w[layer]), + get_tensor(model->weights.attn_q_b[layer])); + ops::linear(k_proj_slice, hidden, get_tensor(model->weights.attn_k_w[layer]), + get_tensor(model->weights.attn_k_b[layer])); + ops::linear(v_proj_slice, hidden, get_tensor(model->weights.attn_v_w[layer]), + get_tensor(model->weights.attn_v_b[layer])); + + // 3. 重塑为 [ntoken, nh, dh] 和 [ntoken, nkvh, dh] + auto q_reshaped = q_proj_slice->view({ntoken, nh, dh}); + auto k_reshaped = k_proj_slice->view({ntoken, nkvh, dh}); + auto v_reshaped = v_proj_slice->view({ntoken, nkvh, dh}); + + // 4. RoPE + auto q_rotated_slice = model->q_rotated->slice(0, start_pos, total_len); + auto k_rotated_slice = model->k_rotated->slice(0, start_pos, total_len); + + ops::rope(q_rotated_slice, q_reshaped, pos_ids_slice, theta); + ops::rope(k_rotated_slice, k_reshaped, pos_ids_slice, theta); + + // 5. 更新KV Cache + copy_to_kv_cache(model->k_cache[layer], k_rotated_slice, start_pos, ntoken); + copy_to_kv_cache(model->v_cache[layer], v_reshaped, start_pos, ntoken); + + // 6. Self Attention - 使用完整的KV Cache + auto k_cache_slice = model->k_cache[layer]->slice(0, 0, total_len); + auto v_cache_slice = model->v_cache[layer]->slice(0, 0, total_len); + + auto attn_output_slice = model->attn_output->slice(0, start_pos, total_len); + float scale = 1.0f / std::sqrt(static_cast(dh)); + ops::self_attention(attn_output_slice, q_rotated_slice, k_cache_slice, v_cache_slice, scale); + + // 7. O投影 + auto o_proj_slice = model->o_proj->slice(0, start_pos, total_len); + auto attn_flat = attn_output_slice->view({ntoken, nh * dh}); + ops::linear(o_proj_slice, attn_flat, get_tensor(model->weights.attn_o_w[layer]), nullptr); + + // 8. 残差连接:o_proj + residual + // 需要将residual[start_pos:total_len]加到o_proj_slice + ops::add(o_proj_slice, o_proj_slice, residual_slice); + + // 9. 保存当前结果到residual用于MLP残差连接 + std::memcpy(residual_slice->data(), o_proj_slice->data(), + ntoken * meta.hs * o_proj_slice->elementSize()); + + // 10. RMS Norm (post_attention_layernorm) + ops::rms_norm(o_proj_slice, o_proj_slice, get_tensor(model->weights.mlp_norm_w[layer]), epsilon); + + // 11. MLP + auto gate_slice = model->gate_proj->slice(0, start_pos, total_len); + auto up_slice = model->up_proj->slice(0, start_pos, total_len); + + ops::linear(gate_slice, o_proj_slice, get_tensor(model->weights.mlp_gate_w[layer]), nullptr); + ops::linear(up_slice, o_proj_slice, get_tensor(model->weights.mlp_up_w[layer]), nullptr); + + auto mlp_out_slice = model->mlp_output->slice(0, start_pos, total_len); + ops::swiglu(mlp_out_slice, gate_slice, up_slice); + + // 12. Down投影(结果存回hidden) + ops::linear(hidden, mlp_out_slice, get_tensor(model->weights.mlp_down_w[layer]), nullptr); + + // 13. 残差连接:hidden + residual + ops::add(hidden, hidden, residual_slice); + } + + // 最终RMS Norm + ops::rms_norm(hidden, hidden, get_tensor(model->weights.out_norm_w), epsilon); + + // 输出投影 + auto logits_slice = model->logits->slice(0, start_pos, total_len); + ops::linear(logits_slice, hidden, get_tensor(model->weights.out_embed), nullptr); + + // 取最后一个token的logits进行argmax + auto last_logits = logits_slice->slice(0, ntoken - 1, ntoken); + auto last_logits_flat = last_logits->view({voc}); + + ops::argmax(model->max_idx, model->max_val, last_logits_flat); + + // 获取结果 + int64_t result; + std::memcpy(&result, model->max_idx->data(), sizeof(int64_t)); + + // 更新KV Cache状态 + model->cache_pos = total_len; + model->total_len = total_len; + + return result; +} + +// 重置KV Cache状态(用于新的对话) +__C void llaisysQwen2ModelResetCache(struct LlaisysQwen2Model *model) { + if (!model) return; + model->cache_pos = 0; + model->total_len = 0; +} diff --git a/src/llaisys/models/qwen2.hpp b/src/llaisys/models/qwen2.hpp new file mode 100644 index 00000000..dedab2da --- /dev/null +++ b/src/llaisys/models/qwen2.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys/models/qwen2.h" +#include "../llaisys_tensor.hpp" + +namespace llaisys { + +// 前向声明 +struct LlaisysQwen2Model; + +} // namespace llaisys diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 00000000..a332be19 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,113 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +namespace llaisys::ops::cpu { + +template +void argmax_impl(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t size) { + const T* vals_data = reinterpret_cast(vals); + T* max_val_data = reinterpret_cast(max_val); + int64_t* max_idx_data = reinterpret_cast(max_idx); + + // 初始化最大值和索引 + T max_val_val = vals_data[0]; + size_t max_idx_val = 0; + + // 遍历所有元素找最大值 + for (size_t i = 1; i < size; i++) { + if (vals_data[i] > max_val_val) { + max_val_val = vals_data[i]; + max_idx_val = i; + } + } + + // 存储结果 + max_val_data[0] = max_val_val; + max_idx_data[0] = static_cast(max_idx_val); +} + +// 处理F16类型的特化实现 +template <> +void argmax_impl(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t size) { + const llaisys::fp16_t* vals_data = reinterpret_cast(vals); + llaisys::fp16_t* max_val_data = reinterpret_cast(max_val); + int64_t* max_idx_data = reinterpret_cast(max_idx); + + // 初始化最大值和索引 + float max_val_val = llaisys::utils::cast(vals_data[0]); + size_t max_idx_val = 0; + + // 遍历所有元素找最大值 + for (size_t i = 1; i < size; i++) { + float current_val = llaisys::utils::cast(vals_data[i]); + if (current_val > max_val_val) { + max_val_val = current_val; + max_idx_val = i; + } + } + + // 存储结果 + max_val_data[0] = llaisys::utils::cast(max_val_val); + max_idx_data[0] = static_cast(max_idx_val); +} + +// 处理BF16类型的特化实现 +template <> +void argmax_impl(std::byte *max_idx, std::byte *max_val, const std::byte *vals, size_t size) { + const llaisys::bf16_t* vals_data = reinterpret_cast(vals); + llaisys::bf16_t* max_val_data = reinterpret_cast(max_val); + int64_t* max_idx_data = reinterpret_cast(max_idx); + + // 初始化最大值和索引 + float max_val_val = llaisys::utils::cast(vals_data[0]); + size_t max_idx_val = 0; + + // 遍历所有元素找最大值 + for (size_t i = 1; i < size; i++) { + float current_val = llaisys::utils::cast(vals_data[i]); + if (current_val > max_val_val) { + max_val_val = current_val; + max_idx_val = i; + } + } + + // 存储结果 + max_val_data[0] = llaisys::utils::cast(max_val_val); + max_idx_data[0] = static_cast(max_idx_val); +} + +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_F64: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_I8: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_I16: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_I32: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_I64: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_U8: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_U16: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_U32: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_U64: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_F16: + return argmax_impl(max_idx, max_val, vals, size); + case LLAISYS_DTYPE_BF16: + return argmax_impl(max_idx, max_val, vals, size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 00000000..0c362ee4 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *max_idx, std::byte *max_val, const std::byte *vals, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..5622a993 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,37 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/argmax_cpu.hpp" + namespace llaisys::ops { + void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + // 参数检查 + CHECK_ARGUMENT(max_idx->ndim() == 1, "max_idx must be a 1D tensor"); + CHECK_ARGUMENT(max_val->ndim() == 1 && max_val->shape()[0] == 1, "max_val must be a 1D tensor with one element"); + CHECK_ARGUMENT(max_val->dtype() == vals->dtype(), "max_val must have the same dtype as vals"); + CHECK_ARGUMENT(max_idx->dtype() == LLAISYS_DTYPE_I64, "max_idx must have dtype I64"); + CHECK_ARGUMENT(vals->isContiguous(), "argmax: vals tensor must be contiguous."); + + // 总是支持CPU计算 + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } -} // namespace llaisys::ops +} // namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 00000000..130ccc3c --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,92 @@ +#include "embedding_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +namespace llaisys::ops::cpu { + +template +void embedding_impl(std::byte *out, const std::byte *index, const std::byte *weight, size_t out_size, size_t weight_dim1) { + const int64_t* index_data = reinterpret_cast(index); + const T* weight_data = reinterpret_cast(weight); + T* out_data = reinterpret_cast(out); + + size_t batch_size = out_size / weight_dim1; + + for (size_t i = 0; i < batch_size; i++) { + int64_t idx = index_data[i]; + // 处理负索引 + if (idx < 0) { + idx = weight_dim1 + idx; + } + + // 复制对应的行 + for (size_t j = 0; j < weight_dim1; j++) { + out_data[i * weight_dim1 + j] = weight_data[idx * weight_dim1 + j]; + } + } +} + +// 处理F16类型的特化实现 +template <> +void embedding_impl(std::byte *out, const std::byte *index, const std::byte *weight, size_t out_size, size_t weight_dim1) { + const int64_t* index_data = reinterpret_cast(index); + const llaisys::fp16_t* weight_data = reinterpret_cast(weight); + llaisys::fp16_t* out_data = reinterpret_cast(out); + + size_t batch_size = out_size / weight_dim1; + + for (size_t i = 0; i < batch_size; i++) { + int64_t idx = index_data[i]; + // 处理负索引 + if (idx < 0) { + idx = weight_dim1 + idx; + } + + // 复制对应的行 + for (size_t j = 0; j < weight_dim1; j++) { + out_data[i * weight_dim1 + j] = weight_data[idx * weight_dim1 + j]; + } + } +} + +// 处理BF16类型的特化实现 +template <> +void embedding_impl(std::byte *out, const std::byte *index, const std::byte *weight, size_t out_size, size_t weight_dim1) { + const int64_t* index_data = reinterpret_cast(index); + const llaisys::bf16_t* weight_data = reinterpret_cast(weight); + llaisys::bf16_t* out_data = reinterpret_cast(out); + + size_t batch_size = out_size / weight_dim1; + + for (size_t i = 0; i < batch_size; i++) { + int64_t idx = index_data[i]; + // 处理负索引 + if (idx < 0) { + idx = weight_dim1 + idx; + } + + // 复制对应的行 + for (size_t j = 0; j < weight_dim1; j++) { + out_data[i * weight_dim1 + j] = weight_data[idx * weight_dim1 + j]; + } + } +} + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t out_size, size_t weight_dim1) { + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_impl(out, index, weight, out_size, weight_dim1); + case LLAISYS_DTYPE_F64: + return embedding_impl(out, index, weight, out_size, weight_dim1); + case LLAISYS_DTYPE_F16: + return embedding_impl(out, index, weight, out_size, weight_dim1); + case LLAISYS_DTYPE_BF16: + return embedding_impl(out, index, weight, out_size, weight_dim1); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 00000000..b5d2a0f6 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t out_size, size_t weight_dim1); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..e5e8e5ba 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,39 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/embedding_cpu.hpp" + namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); + CHECK_ARGUMENT(index->ndim() == 1, "index must be a 1D tensor"); + CHECK_ARGUMENT(weight->ndim() == 2, "weight must be a 2D tensor"); + CHECK_ARGUMENT(index->dtype() == LLAISYS_DTYPE_I64, "index must have dtype I64"); + CHECK_ARGUMENT(out->shape()[0] == index->shape()[0], "out must have the same shape as index"); + CHECK_ARGUMENT(out->shape()[1] == weight->shape()[1], "out must have the same shape as weight"); + CHECK_ARGUMENT(out->isContiguous(), "embedding: out tensor must be contiguous."); + CHECK_ARGUMENT(index->isContiguous(), "embedding: index tensor must be contiguous."); + CHECK_ARGUMENT(weight->isContiguous(), "embedding: weight tensor must be contiguous."); + + // 总是支持CPU计算 + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->numel(), weight->shape()[1]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), out->dtype(), out->numel(), weight->shape()[1]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 00000000..284f8e54 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,112 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +namespace llaisys::ops::cpu { + +template +void linear_impl(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, size_t batch_size, size_t in_features, size_t out_features) { + const T* in_data = reinterpret_cast(in); + const T* weight_data = reinterpret_cast(weight); + T* out_data = reinterpret_cast(out); + + // 矩阵乘法:Y = xW^T + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < out_features; j++) { + T sum = 0; + for (size_t k = 0; k < in_features; k++) { + sum += in_data[i * in_features + k] * weight_data[j * in_features + k]; + } + out_data[i * out_features + j] = sum; + } + } + + // 添加偏置 + if (bias != nullptr) { + const T* bias_data = reinterpret_cast(bias); + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < out_features; j++) { + out_data[i * out_features + j] += bias_data[j]; + } + } + } +} + +// 处理F16类型的特化实现 +template <> +void linear_impl(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, size_t batch_size, size_t in_features, size_t out_features) { + const llaisys::fp16_t* in_data = reinterpret_cast(in); + const llaisys::fp16_t* weight_data = reinterpret_cast(weight); + llaisys::fp16_t* out_data = reinterpret_cast(out); + + // 矩阵乘法:Y = xW^T + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < out_features; j++) { + float sum = 0; + for (size_t k = 0; k < in_features; k++) { + sum += llaisys::utils::cast(in_data[i * in_features + k]) * llaisys::utils::cast(weight_data[j * in_features + k]); + } + out_data[i * out_features + j] = llaisys::utils::cast(sum); + } + } + + // 添加偏置 + if (bias != nullptr) { + const llaisys::fp16_t* bias_data = reinterpret_cast(bias); + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < out_features; j++) { + float val = llaisys::utils::cast(out_data[i * out_features + j]) + llaisys::utils::cast(bias_data[j]); + out_data[i * out_features + j] = llaisys::utils::cast(val); + } + } + } +} + +// 处理BF16类型的特化实现 +template <> +void linear_impl(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, size_t batch_size, size_t in_features, size_t out_features) { + const llaisys::bf16_t* in_data = reinterpret_cast(in); + const llaisys::bf16_t* weight_data = reinterpret_cast(weight); + llaisys::bf16_t* out_data = reinterpret_cast(out); + + // 矩阵乘法:Y = xW^T + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < out_features; j++) { + float sum = 0; + for (size_t k = 0; k < in_features; k++) { + sum += llaisys::utils::cast(in_data[i * in_features + k]) * llaisys::utils::cast(weight_data[j * in_features + k]); + } + out_data[i * out_features + j] = llaisys::utils::cast(sum); + } + } + + // 添加偏置 + if (bias != nullptr) { + const llaisys::bf16_t* bias_data = reinterpret_cast(bias); + for (size_t i = 0; i < batch_size; i++) { + for (size_t j = 0; j < out_features; j++) { + float val = llaisys::utils::cast(out_data[i * out_features + j]) + llaisys::utils::cast(bias_data[j]); + out_data[i * out_features + j] = llaisys::utils::cast(val); + } + } + } +} + +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_impl(out, in, weight, bias, batch_size, in_features, out_features); + case LLAISYS_DTYPE_F64: + return linear_impl(out, in, weight, bias, batch_size, in_features, out_features); + case LLAISYS_DTYPE_F16: + return linear_impl(out, in, weight, bias, batch_size, in_features, out_features); + case LLAISYS_DTYPE_BF16: + return linear_impl(out, in, weight, bias, batch_size, in_features, out_features); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 00000000..3d01b1c5 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features); +} \ No newline at end of file diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..5500f23b 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,68 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/linear_cpu.hpp" + namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); + CHECK_ARGUMENT(in->ndim() == 2, "in must be a 2D tensor"); + CHECK_ARGUMENT(weight->ndim() == 2, "weight must be a 2D tensor"); + CHECK_ARGUMENT(out->shape()[0] == in->shape()[0], "out and in must have the same batch size"); + CHECK_ARGUMENT(out->shape()[1] == weight->shape()[0], "out features must match weight rows"); + CHECK_ARGUMENT(in->shape()[1] == weight->shape()[1], "in features must match weight columns"); + if (bias != nullptr) { + CHECK_ARGUMENT(bias->ndim() == 1, "bias must be a 1D tensor"); + CHECK_ARGUMENT(bias->shape()[0] == out->shape()[1], "bias size must match out features"); + } + CHECK_ARGUMENT(out->isContiguous(), "linear: out tensor must be contiguous."); + CHECK_ARGUMENT(in->isContiguous(), "linear: in tensor must be contiguous."); + CHECK_ARGUMENT(weight->isContiguous(), "linear: weight tensor must be contiguous."); + if (bias != nullptr) { + CHECK_ARGUMENT(bias->isContiguous(), "linear: bias tensor must be contiguous."); + } + + size_t batch_size = in->shape()[0]; + size_t in_features = in->shape()[1]; + size_t out_features = out->shape()[1]; + + // 总是支持CPU计算 + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear( + out->data(), + in->data(), + weight->data(), + bias ? bias->data() : nullptr, + out->dtype(), + batch_size, + in_features, + out_features + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear( + out->data(), + in->data(), + weight->data(), + bias ? bias->data() : nullptr, + out->dtype(), + batch_size, + in_features, + out_features + ); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 00000000..f7cd4094 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,105 @@ +#include "rms_norm_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +namespace llaisys::ops::cpu { + +template +void rms_norm_impl(std::byte *out, const std::byte *in, const std::byte *weight, float eps, size_t batch_size, size_t hidden_size) { + const T* in_data = reinterpret_cast(in); + const T* weight_data = reinterpret_cast(weight); + T* out_data = reinterpret_cast(out); + + for (size_t i = 0; i < batch_size; i++) { + // 计算平方和 + float sum_sq = 0.0f; + for (size_t j = 0; j < hidden_size; j++) { + float val = static_cast(in_data[i * hidden_size + j]); + sum_sq += val * val; + } + + // 计算RMS + float rms = std::sqrt(sum_sq / static_cast(hidden_size) + eps); + + // 计算输出 + for (size_t j = 0; j < hidden_size; j++) { + float val = static_cast(in_data[i * hidden_size + j]); + float w = static_cast(weight_data[j]); + out_data[i * hidden_size + j] = static_cast((w * val) / rms); + } + } +} + +// 处理F16类型的特化实现 +template <> +void rms_norm_impl(std::byte *out, const std::byte *in, const std::byte *weight, float eps, size_t batch_size, size_t hidden_size) { + const llaisys::fp16_t* in_data = reinterpret_cast(in); + const llaisys::fp16_t* weight_data = reinterpret_cast(weight); + llaisys::fp16_t* out_data = reinterpret_cast(out); + + for (size_t i = 0; i < batch_size; i++) { + // 计算平方和 + float sum_sq = 0.0f; + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(in_data[i * hidden_size + j]); + sum_sq += val * val; + } + + // 计算RMS + float rms = std::sqrt(sum_sq / static_cast(hidden_size) + eps); + + // 计算输出 + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(in_data[i * hidden_size + j]); + float w = llaisys::utils::cast(weight_data[j]); + out_data[i * hidden_size + j] = llaisys::utils::cast((w * val) / rms); + } + } +} + +// 处理BF16类型的特化实现 +template <> +void rms_norm_impl(std::byte *out, const std::byte *in, const std::byte *weight, float eps, size_t batch_size, size_t hidden_size) { + const llaisys::bf16_t* in_data = reinterpret_cast(in); + const llaisys::bf16_t* weight_data = reinterpret_cast(weight); + llaisys::bf16_t* out_data = reinterpret_cast(out); + + for (size_t i = 0; i < batch_size; i++) { + // 计算平方和 + float sum_sq = 0.0f; + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(in_data[i * hidden_size + j]); + sum_sq += val * val; + } + + // 计算RMS + float rms = std::sqrt(sum_sq / static_cast(hidden_size) + eps); + + // 计算输出 + for (size_t j = 0; j < hidden_size; j++) { + float val = llaisys::utils::cast(in_data[i * hidden_size + j]); + float w = llaisys::utils::cast(weight_data[j]); + out_data[i * hidden_size + j] = llaisys::utils::cast((w * val) / rms); + } + } +} + +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch_size, size_t hidden_size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_impl(out, in, weight, eps, batch_size, hidden_size); + case LLAISYS_DTYPE_F64: + return rms_norm_impl(out, in, weight, eps, batch_size, hidden_size); + case LLAISYS_DTYPE_F16: + return rms_norm_impl(out, in, weight, eps, batch_size, hidden_size); + case LLAISYS_DTYPE_BF16: + return rms_norm_impl(out, in, weight, eps, batch_size, hidden_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 00000000..7a4db6df --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, float eps, llaisysDataType_t type, size_t batch_size, size_t hidden_size); +} \ No newline at end of file diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..f9016a3c 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,58 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm_cpu.hpp" + namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); + CHECK_ARGUMENT(in->ndim() == 2, "in must be a 2D tensor"); + CHECK_ARGUMENT(weight->ndim() == 1, "weight must be a 1D tensor"); + CHECK_ARGUMENT(out->shape()[0] == in->shape()[0], "out and in must have the same batch size"); + CHECK_ARGUMENT(out->shape()[1] == in->shape()[1], "out and in must have the same hidden size"); + CHECK_ARGUMENT(weight->shape()[0] == in->shape()[1], "weight size must match hidden size"); + CHECK_ARGUMENT(out->isContiguous(), "rms_norm: out tensor must be contiguous."); + CHECK_ARGUMENT(in->isContiguous(), "rms_norm: in tensor must be contiguous."); + CHECK_ARGUMENT(weight->isContiguous(), "rms_norm: weight tensor must be contiguous."); + + size_t batch_size = in->shape()[0]; + size_t hidden_size = in->shape()[1]; + + // 总是支持CPU计算 + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm( + out->data(), + in->data(), + weight->data(), + eps, + out->dtype(), + batch_size, + hidden_size + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm( + out->data(), + in->data(), + weight->data(), + eps, + out->dtype(), + batch_size, + hidden_size + ); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 00000000..b8a0ca18 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,132 @@ +#include "rope_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +namespace llaisys::ops::cpu { + +template +void rope_impl(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, size_t seqlen, size_t nhead, size_t d) { + const T* in_data = reinterpret_cast(in); + const int64_t* pos_ids_data = reinterpret_cast(pos_ids); + T* out_data = reinterpret_cast(out); + + size_t d_half = d / 2; + + for (size_t i = 0; i < seqlen; i++) { + int64_t p = pos_ids_data[i]; + + for (size_t j = 0; j < nhead; j++) { + for (size_t k = 0; k < d_half; k++) { + // 计算角度: phi = p / theta^(2k/d) + // 使用与PyTorch相同的计算顺序以确保数值一致性 + float exp_value = 2.0f * static_cast(k) / static_cast(d); + float theta_exp = std::pow(theta, exp_value); + float phi = static_cast(p) / theta_exp; + float cos_phi = std::cos(phi); + float sin_phi = std::sin(phi); + + // 获取输入值 + size_t idx = i * nhead * d + j * d + k; + size_t idx_b = idx + d_half; + float a = static_cast(in_data[idx]); + float b = static_cast(in_data[idx_b]); + + // 计算输出 + out_data[idx] = static_cast(a * cos_phi - b * sin_phi); + out_data[idx_b] = static_cast(b * cos_phi + a * sin_phi); + } + } + } +} + +// 处理F16类型的特化实现 +template <> +void rope_impl(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, size_t seqlen, size_t nhead, size_t d) { + const llaisys::fp16_t* in_data = reinterpret_cast(in); + const int64_t* pos_ids_data = reinterpret_cast(pos_ids); + llaisys::fp16_t* out_data = reinterpret_cast(out); + + size_t d_half = d / 2; + + for (size_t i = 0; i < seqlen; i++) { + int64_t p = pos_ids_data[i]; + + for (size_t j = 0; j < nhead; j++) { + for (size_t k = 0; k < d_half; k++) { + // 计算角度: phi = p / theta^(2k/d) + // 使用与PyTorch相同的计算顺序以确保数值一致性 + float exp_value = 2.0f * static_cast(k) / static_cast(d); + float theta_exp = std::pow(theta, exp_value); + float phi = static_cast(p) / theta_exp; + float cos_phi = std::cos(phi); + float sin_phi = std::sin(phi); + + // 获取输入值 + size_t idx = i * nhead * d + j * d + k; + size_t idx_b = idx + d_half; + float a = llaisys::utils::cast(in_data[idx]); + float b = llaisys::utils::cast(in_data[idx_b]); + + // 计算输出 + out_data[idx] = llaisys::utils::cast(a * cos_phi - b * sin_phi); + out_data[idx_b] = llaisys::utils::cast(b * cos_phi + a * sin_phi); + } + } + } +} + +// 处理BF16类型的特化实现 +template <> +void rope_impl(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, size_t seqlen, size_t nhead, size_t d) { + const llaisys::bf16_t* in_data = reinterpret_cast(in); + const int64_t* pos_ids_data = reinterpret_cast(pos_ids); + llaisys::bf16_t* out_data = reinterpret_cast(out); + + size_t d_half = d / 2; + + for (size_t i = 0; i < seqlen; i++) { + int64_t p = pos_ids_data[i]; + + for (size_t j = 0; j < nhead; j++) { + for (size_t k = 0; k < d_half; k++) { + // 计算角度: phi = p / theta^(2k/d) + // 使用与PyTorch相同的计算顺序以确保数值一致性 + float exp_value = 2.0f * static_cast(k) / static_cast(d); + float theta_exp = std::pow(theta, exp_value); + float phi = static_cast(p) / theta_exp; + float cos_phi = std::cos(phi); + float sin_phi = std::sin(phi); + + // 获取输入值 + size_t idx = i * nhead * d + j * d + k; + size_t idx_b = idx + d_half; + float a = llaisys::utils::cast(in_data[idx]); + float b = llaisys::utils::cast(in_data[idx_b]); + + // 计算输出 + out_data[idx] = llaisys::utils::cast(a * cos_phi - b * sin_phi); + out_data[idx_b] = llaisys::utils::cast(b * cos_phi + a * sin_phi); + } + } + } +} + +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seqlen, size_t nhead, size_t d) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_impl(out, in, pos_ids, theta, seqlen, nhead, d); + case LLAISYS_DTYPE_F64: + return rope_impl(out, in, pos_ids, theta, seqlen, nhead, d); + case LLAISYS_DTYPE_F16: + return rope_impl(out, in, pos_ids, theta, seqlen, nhead, d); + case LLAISYS_DTYPE_BF16: + return rope_impl(out, in, pos_ids, theta, seqlen, nhead, d); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 00000000..e8262244 --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, float theta, llaisysDataType_t type, size_t seq_len, size_t num_heads, size_t head_dim); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..2f834ef0 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,62 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rope_cpu.hpp" + namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(out->ndim() == 3, "out must be a 3D tensor"); + CHECK_ARGUMENT(in->ndim() == 3, "in must be a 3D tensor"); + CHECK_ARGUMENT(pos_ids->ndim() == 1, "pos_ids must be a 1D tensor"); + CHECK_ARGUMENT(out->shape()[0] == in->shape()[0], "out and in must have the same sequence length"); + CHECK_ARGUMENT(out->shape()[1] == in->shape()[1], "out and in must have the same number of heads"); + CHECK_ARGUMENT(out->shape()[2] == in->shape()[2], "out and in must have the same head dimension"); + CHECK_ARGUMENT(pos_ids->shape()[0] == in->shape()[0], "pos_ids length must match sequence length"); + CHECK_ARGUMENT(out->isContiguous(), "rope: out tensor must be contiguous."); + CHECK_ARGUMENT(in->isContiguous(), "rope: in tensor must be contiguous."); + CHECK_ARGUMENT(pos_ids->isContiguous(), "rope: pos_ids tensor must be contiguous."); + + size_t seq_len = in->shape()[0]; + size_t num_heads = in->shape()[1]; + size_t head_dim = in->shape()[2]; + + // 总是支持CPU计算 + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope( + out->data(), + in->data(), + pos_ids->data(), + theta, + out->dtype(), + seq_len, + num_heads, + head_dim + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope( + out->data(), + in->data(), + pos_ids->data(), + theta, + out->dtype(), + seq_len, + num_heads, + head_dim + ); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 00000000..15c7a4f6 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,296 @@ +#include "self_attention_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +namespace llaisys::ops::cpu { + +template +void softmax(T* data, size_t size) { + T max_val = data[0]; + for (size_t i = 1; i < size; i++) { + if (data[i] > max_val) { + max_val = data[i]; + } + } + + T sum = 0; + for (size_t i = 0; i < size; i++) { + data[i] = static_cast(std::exp(static_cast(data[i] - max_val))); + sum += data[i]; + } + + for (size_t i = 0; i < size; i++) { + data[i] /= sum; + } +} + +template <> +void softmax(llaisys::fp16_t* data, size_t size) { + float max_val = llaisys::utils::cast(data[0]); + for (size_t i = 1; i < size; i++) { + float val = llaisys::utils::cast(data[i]); + if (val > max_val) { + max_val = val; + } + } + + float sum = 0; + for (size_t i = 0; i < size; i++) { + float val = llaisys::utils::cast(data[i]); + data[i] = llaisys::utils::cast(std::exp(val - max_val)); + sum += llaisys::utils::cast(data[i]); + } + + for (size_t i = 0; i < size; i++) { + float val = llaisys::utils::cast(data[i]); + data[i] = llaisys::utils::cast(val / sum); + } +} + +template <> +void softmax(llaisys::bf16_t* data, size_t size) { + float max_val = llaisys::utils::cast(data[0]); + for (size_t i = 1; i < size; i++) { + float val = llaisys::utils::cast(data[i]); + if (val > max_val) { + max_val = val; + } + } + + float sum = 0; + for (size_t i = 0; i < size; i++) { + float val = llaisys::utils::cast(data[i]); + data[i] = llaisys::utils::cast(std::exp(val - max_val)); + sum += llaisys::utils::cast(data[i]); + } + + for (size_t i = 0; i < size; i++) { + float val = llaisys::utils::cast(data[i]); + data[i] = llaisys::utils::cast(val / sum); + } +} + +template +void self_attention_impl(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, size_t seqlen, size_t nhead, size_t nkhead, size_t d, size_t total_len) { + const T* q_data = reinterpret_cast(q); + const T* k_data = reinterpret_cast(k); + const T* v_data = reinterpret_cast(v); + T* attn_val_data = reinterpret_cast(attn_val); + + size_t repeats = nhead / nkhead; + + T* k_expanded = new T[total_len * nhead * d]; + T* v_expanded = new T[total_len * nhead * d]; + + for (size_t i = 0; i < total_len; i++) { + for (size_t j = 0; j < nhead; j++) { + size_t kv_head = j / repeats; + for (size_t k_idx = 0; k_idx < d; k_idx++) { + size_t src_idx = i * nkhead * d + kv_head * d + k_idx; + size_t dst_idx = i * nhead * d + j * d + k_idx; + k_expanded[dst_idx] = k_data[src_idx]; + v_expanded[dst_idx] = v_data[src_idx]; + } + } + } + + T* attn_scores = new T[nhead * seqlen * total_len]; + + for (size_t j = 0; j < nhead; j++) { + for (size_t i = 0; i < seqlen; i++) { + for (size_t t = 0; t < total_len; t++) { + float score = 0.0f; + for (size_t k_idx = 0; k_idx < d; k_idx++) { + size_t q_idx = i * nhead * d + j * d + k_idx; + size_t k_idx_local = t * nhead * d + j * d + k_idx; + score += llaisys::utils::cast(q_data[q_idx]) * llaisys::utils::cast(k_expanded[k_idx_local]); + } + score *= scale; + + size_t mask_threshold = (total_len > seqlen) ? (i + total_len - seqlen) : i; + if (t > mask_threshold) { + score = -1e9f; + } + + attn_scores[j * seqlen * total_len + i * total_len + t] = llaisys::utils::cast(score); + } + softmax(&attn_scores[j * seqlen * total_len + i * total_len], total_len); + } + } + + for (size_t j = 0; j < nhead; j++) { + for (size_t i = 0; i < seqlen; i++) { + for (size_t d_idx = 0; d_idx < d; d_idx++) { + float val = 0.0f; + for (size_t t = 0; t < total_len; t++) { + float attn_weight = llaisys::utils::cast(attn_scores[j * seqlen * total_len + i * total_len + t]); + size_t v_idx = t * nhead * d + j * d + d_idx; + val += attn_weight * llaisys::utils::cast(v_expanded[v_idx]); + } + size_t out_idx = i * nhead * d + j * d + d_idx; + attn_val_data[out_idx] = llaisys::utils::cast(val); + } + } + } + + delete[] attn_scores; + delete[] k_expanded; + delete[] v_expanded; +} + +template <> +void self_attention_impl(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, size_t seqlen, size_t nhead, size_t nkhead, size_t d, size_t total_len) { + const llaisys::fp16_t* q_data = reinterpret_cast(q); + const llaisys::fp16_t* k_data = reinterpret_cast(k); + const llaisys::fp16_t* v_data = reinterpret_cast(v); + llaisys::fp16_t* attn_val_data = reinterpret_cast(attn_val); + + size_t repeats = nhead / nkhead; + + llaisys::fp16_t* k_expanded = new llaisys::fp16_t[total_len * nhead * d]; + llaisys::fp16_t* v_expanded = new llaisys::fp16_t[total_len * nhead * d]; + + for (size_t i = 0; i < total_len; i++) { + for (size_t j = 0; j < nhead; j++) { + size_t kv_head = j / repeats; + for (size_t k_idx = 0; k_idx < d; k_idx++) { + size_t src_idx = i * nkhead * d + kv_head * d + k_idx; + size_t dst_idx = i * nhead * d + j * d + k_idx; + k_expanded[dst_idx] = k_data[src_idx]; + v_expanded[dst_idx] = v_data[src_idx]; + } + } + } + + llaisys::fp16_t* attn_scores = new llaisys::fp16_t[nhead * seqlen * total_len]; + + for (size_t j = 0; j < nhead; j++) { + for (size_t i = 0; i < seqlen; i++) { + for (size_t t = 0; t < total_len; t++) { + float score = 0.0f; + for (size_t k_idx = 0; k_idx < d; k_idx++) { + size_t q_idx = i * nhead * d + j * d + k_idx; + size_t k_idx_local = t * nhead * d + j * d + k_idx; + score += llaisys::utils::cast(q_data[q_idx]) * llaisys::utils::cast(k_expanded[k_idx_local]); + } + score *= scale; + + size_t mask_threshold = (total_len > seqlen) ? (i + total_len - seqlen) : i; + if (t > mask_threshold) { + score = -1e9f; + } + + attn_scores[j * seqlen * total_len + i * total_len + t] = llaisys::utils::cast(score); + } + softmax(&attn_scores[j * seqlen * total_len + i * total_len], total_len); + } + } + + for (size_t j = 0; j < nhead; j++) { + for (size_t i = 0; i < seqlen; i++) { + for (size_t d_idx = 0; d_idx < d; d_idx++) { + float val = 0.0f; + for (size_t t = 0; t < total_len; t++) { + float attn_weight = llaisys::utils::cast(attn_scores[j * seqlen * total_len + i * total_len + t]); + size_t v_idx = t * nhead * d + j * d + d_idx; + val += attn_weight * llaisys::utils::cast(v_expanded[v_idx]); + } + size_t out_idx = i * nhead * d + j * d + d_idx; + attn_val_data[out_idx] = llaisys::utils::cast(val); + } + } + } + + delete[] attn_scores; + delete[] k_expanded; + delete[] v_expanded; +} + +template <> +void self_attention_impl(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, size_t seqlen, size_t nhead, size_t nkhead, size_t d, size_t total_len) { + const llaisys::bf16_t* q_data = reinterpret_cast(q); + const llaisys::bf16_t* k_data = reinterpret_cast(k); + const llaisys::bf16_t* v_data = reinterpret_cast(v); + llaisys::bf16_t* attn_val_data = reinterpret_cast(attn_val); + + size_t repeats = nhead / nkhead; + + llaisys::bf16_t* k_expanded = new llaisys::bf16_t[total_len * nhead * d]; + llaisys::bf16_t* v_expanded = new llaisys::bf16_t[total_len * nhead * d]; + + for (size_t i = 0; i < total_len; i++) { + for (size_t j = 0; j < nhead; j++) { + size_t kv_head = j / repeats; + for (size_t k_idx = 0; k_idx < d; k_idx++) { + size_t src_idx = i * nkhead * d + kv_head * d + k_idx; + size_t dst_idx = i * nhead * d + j * d + k_idx; + k_expanded[dst_idx] = k_data[src_idx]; + v_expanded[dst_idx] = v_data[src_idx]; + } + } + } + + llaisys::bf16_t* attn_scores = new llaisys::bf16_t[nhead * seqlen * total_len]; + + for (size_t j = 0; j < nhead; j++) { + for (size_t i = 0; i < seqlen; i++) { + for (size_t t = 0; t < total_len; t++) { + float score = 0.0f; + for (size_t k_idx = 0; k_idx < d; k_idx++) { + size_t q_idx = i * nhead * d + j * d + k_idx; + size_t k_idx_local = t * nhead * d + j * d + k_idx; + score += llaisys::utils::cast(q_data[q_idx]) * llaisys::utils::cast(k_expanded[k_idx_local]); + } + score *= scale; + + size_t mask_threshold = (total_len > seqlen) ? (i + total_len - seqlen) : i; + if (t > mask_threshold) { + score = -1e9f; + } + + attn_scores[j * seqlen * total_len + i * total_len + t] = llaisys::utils::cast(score); + } + softmax(&attn_scores[j * seqlen * total_len + i * total_len], total_len); + } + } + + for (size_t j = 0; j < nhead; j++) { + for (size_t i = 0; i < seqlen; i++) { + for (size_t d_idx = 0; d_idx < d; d_idx++) { + float val = 0.0f; + for (size_t t = 0; t < total_len; t++) { + float attn_weight = llaisys::utils::cast(attn_scores[j * seqlen * total_len + i * total_len + t]); + size_t v_idx = t * nhead * d + j * d + d_idx; + val += attn_weight * llaisys::utils::cast(v_expanded[v_idx]); + } + size_t out_idx = i * nhead * d + j * d + d_idx; + attn_val_data[out_idx] = llaisys::utils::cast(val); + } + } + } + + delete[] attn_scores; + delete[] k_expanded; + delete[] v_expanded; +} + +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t seqlen, size_t nhead, size_t num_kv_heads, size_t d, size_t total_len) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_impl(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len); + case LLAISYS_DTYPE_F64: + return self_attention_impl(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len); + case LLAISYS_DTYPE_F16: + return self_attention_impl(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len); + case LLAISYS_DTYPE_BF16: + return self_attention_impl(attn_val, q, k, v, scale, seqlen, nhead, num_kv_heads, d, total_len); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 00000000..6f7ddb0c --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, float scale, llaisysDataType_t type, size_t seq_len, size_t num_heads, size_t num_kv_heads, size_t head_dim, size_t total_len); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..d8161d2e 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,75 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/self_attention_cpu.hpp" + namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(attn_val->ndim() == 3, "attn_val must be a 3D tensor"); + CHECK_ARGUMENT(q->ndim() == 3, "q must be a 3D tensor"); + CHECK_ARGUMENT(k->ndim() == 3, "k must be a 3D tensor"); + CHECK_ARGUMENT(v->ndim() == 3, "v must be a 3D tensor"); + CHECK_ARGUMENT(attn_val->shape()[0] == q->shape()[0], "attn_val and q must have the same sequence length"); + CHECK_ARGUMENT(attn_val->shape()[1] == q->shape()[1], "attn_val and q must have the same number of heads"); + CHECK_ARGUMENT(attn_val->shape()[2] == v->shape()[2], "attn_val and v must have the same head dimension"); + CHECK_ARGUMENT(q->shape()[2] == k->shape()[2], "q and k must have the same head dimension"); + CHECK_ARGUMENT(k->shape()[1] == v->shape()[1], "k and v must have the same number of heads"); + CHECK_ARGUMENT(k->shape()[2] == v->shape()[2], "k and v must have the same head dimension"); + CHECK_ARGUMENT(q->shape()[1] % k->shape()[1] == 0, "q's number of heads must be a multiple of k's number of heads"); + CHECK_ARGUMENT(attn_val->isContiguous(), "self_attention: attn_val tensor must be contiguous."); + CHECK_ARGUMENT(q->isContiguous(), "self_attention: q tensor must be contiguous."); + CHECK_ARGUMENT(k->isContiguous(), "self_attention: k tensor must be contiguous."); + CHECK_ARGUMENT(v->isContiguous(), "self_attention: v tensor must be contiguous."); + + size_t seq_len = q->shape()[0]; + size_t num_heads = q->shape()[1]; + size_t num_kv_heads = k->shape()[1]; + size_t head_dim = q->shape()[2]; + size_t total_len = k->shape()[0]; + + // 总是支持CPU计算 + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention( + attn_val->data(), + q->data(), + k->data(), + v->data(), + scale, + attn_val->dtype(), + seq_len, + num_heads, + num_kv_heads, + head_dim, + total_len + ); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention( + attn_val->data(), + q->data(), + k->data(), + v->data(), + scale, + attn_val->dtype(), + seq_len, + num_heads, + num_kv_heads, + head_dim, + total_len + ); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp new file mode 100644 index 00000000..763821de --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp @@ -0,0 +1,97 @@ +#include "swiglu_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +namespace llaisys::ops::cpu { + +// Sigmoid函数模板 +template +T sigmoid(T x) { + return static_cast(1.0f / (1.0f + std::exp(-static_cast(x)))); +} + +// F16类型的sigmoid特化 +template <> +llaisys::fp16_t sigmoid(llaisys::fp16_t x) { + float val = llaisys::utils::cast(x); + float sigmoid_val = 1.0f / (1.0f + std::exp(-val)); + return llaisys::utils::cast(sigmoid_val); +} + +// BF16类型的sigmoid特化 +template <> +llaisys::bf16_t sigmoid(llaisys::bf16_t x) { + float val = llaisys::utils::cast(x); + float sigmoid_val = 1.0f / (1.0f + std::exp(-val)); + return llaisys::utils::cast(sigmoid_val); +} + +template +void swiglu_impl(std::byte *out, const std::byte *gate, const std::byte *up, size_t size) { + const T* gate_data = reinterpret_cast(gate); + const T* up_data = reinterpret_cast(up); + T* out_data = reinterpret_cast(out); + + for (size_t i = 0; i < size; i++) { + T gate_val = gate_data[i]; + T up_val = up_data[i]; + T sigmoid_gate = sigmoid(gate_val); + out_data[i] = up_val * gate_val * sigmoid_gate; + } +} + +// F16类型的特化实现 +template <> +void swiglu_impl(std::byte *out, const std::byte *gate, const std::byte *up, size_t size) { + const llaisys::fp16_t* gate_data = reinterpret_cast(gate); + const llaisys::fp16_t* up_data = reinterpret_cast(up); + llaisys::fp16_t* out_data = reinterpret_cast(out); + + for (size_t i = 0; i < size; i++) { + llaisys::fp16_t gate_val = gate_data[i]; + llaisys::fp16_t up_val = up_data[i]; + llaisys::fp16_t sigmoid_gate = sigmoid(gate_val); + float up_float = llaisys::utils::cast(up_val); + float gate_float = llaisys::utils::cast(gate_val); + float sigmoid_float = llaisys::utils::cast(sigmoid_gate); + out_data[i] = llaisys::utils::cast(up_float * gate_float * sigmoid_float); + } +} + +// BF16类型的特化实现 +template <> +void swiglu_impl(std::byte *out, const std::byte *gate, const std::byte *up, size_t size) { + const llaisys::bf16_t* gate_data = reinterpret_cast(gate); + const llaisys::bf16_t* up_data = reinterpret_cast(up); + llaisys::bf16_t* out_data = reinterpret_cast(out); + + for (size_t i = 0; i < size; i++) { + llaisys::bf16_t gate_val = gate_data[i]; + llaisys::bf16_t up_val = up_data[i]; + llaisys::bf16_t sigmoid_gate = sigmoid(gate_val); + float up_float = llaisys::utils::cast(up_val); + float gate_float = llaisys::utils::cast(gate_val); + float sigmoid_float = llaisys::utils::cast(sigmoid_gate); + out_data[i] = llaisys::utils::cast(up_float * gate_float * sigmoid_float); + } +} + +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t size) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_impl(out, gate, up, size); + case LLAISYS_DTYPE_F64: + return swiglu_impl(out, gate, up, size); + case LLAISYS_DTYPE_F16: + return swiglu_impl(out, gate, up, size); + case LLAISYS_DTYPE_BF16: + return swiglu_impl(out, gate, up, size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} + +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/swiglu/cpu/swiglu_cpu.hpp b/src/ops/swiglu/cpu/swiglu_cpu.hpp new file mode 100644 index 00000000..ed570a75 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void swiglu(std::byte *out, const std::byte *gate, const std::byte *up, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc9..37752f2c 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,40 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/swiglu_cpu.hpp" + namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_ARGUMENT(out->ndim() == 2, "out must be a 2D tensor"); + CHECK_ARGUMENT(gate->ndim() == 2, "gate must be a 2D tensor"); + CHECK_ARGUMENT(up->ndim() == 2, "up must be a 2D tensor"); + CHECK_ARGUMENT(out->shape()[0] == gate->shape()[0], "out and gate must have the same shape"); + CHECK_ARGUMENT(out->shape()[1] == gate->shape()[1], "out and gate must have the same shape"); + CHECK_ARGUMENT(out->shape()[0] == up->shape()[0], "out and up must have the same shape"); + CHECK_ARGUMENT(out->shape()[1] == up->shape()[1], "out and up must have the same shape"); + CHECK_ARGUMENT(out->isContiguous(), "swiglu: out tensor must be contiguous."); + CHECK_ARGUMENT(gate->isContiguous(), "swiglu: gate tensor must be contiguous."); + CHECK_ARGUMENT(up->isContiguous(), "swiglu: up tensor must be contiguous."); + + // 总是支持CPU计算 + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::swiglu(out->data(), gate->data(), up->data(), out->dtype(), out->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb6..43cfedb2 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -2,6 +2,7 @@ #include "../utils.hpp" +#include #include #include #include @@ -164,27 +165,122 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); + size_t ndim_ = this->ndim(); + ptrdiff_t stride = 1; + for (size_t i = 1; i <= ndim_; i++) { + if (this->_meta.strides[ndim_ - i] != stride) { + return false; + } + stride *= static_cast(this->_meta.shape[ndim_ - i]); + } return true; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + size_t ndim_ = this->ndim(); + CHECK_ARGUMENT(order.size() == ndim_, "order size must be equal to ndim"); + // 检查order是否包含所有维度 + std::vector used(ndim_, false); + for (auto idx : order) { + if (idx >= ndim_ || used[idx]) { + CHECK_ARGUMENT(false, "Invalid permutation order"); + } + used[idx] = true; + } + + std::vector shape(ndim_); + std::vector strides(ndim_); + for (size_t i = 0; i < ndim_; i++) { + shape[i] = this->_meta.shape[order[i]]; + strides[i] = this->_meta.strides[order[i]]; + } + TensorMeta meta{this->_meta.dtype, shape, strides}; + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // 计算新形状的元素总数 + size_t new_numel = std::accumulate(shape.begin(), shape.end(), size_t(1), std::multiplies()); + // 检查元素总数是否与原张量相同 + if (new_numel != this->numel()) { + CHECK_ARGUMENT(false, "New shape has different number of elements"); + } + + // 对于连续张量,直接计算新的步长 + if(isContiguous()) { + size_t ndim_ = shape.size(); + std::vector strides(ndim_); + size_t stride = 1; + for (size_t i = 1; i <= ndim_; i++) { + strides[ndim_ - i] = stride; + stride *= shape[ndim_ - i]; + } + TensorMeta meta{this->_meta.dtype, shape, strides}; + return std::shared_ptr(new Tensor(meta, _storage, _offset)); + } else { + // 对于非连续张量,检查是否可以进行视图操作 + // 这里简化实现,实际中可能需要更复杂的检查 + CHECK_ARGUMENT(false, "Cannot view non-contiguous tensor"); + } } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + // 检查dim是否有效 + if (dim >= this->ndim()) { + CHECK_ARGUMENT(false, "Dimension out of range"); + } + + // 检查start和end是否有效 + if (start >= end || end > this->_meta.shape[dim]) { + CHECK_ARGUMENT(false, "Invalid start or end indices"); + } + + // 创建新的形状 + std::vector new_shape = this->_meta.shape; + new_shape[dim] = end - start; + + // 计算新的偏移量 + size_t new_offset = this->_offset; + new_offset += start * this->_meta.strides[dim] * this->elementSize(); + + // 创建新的步长(保持不变) + std::vector new_strides = this->_meta.strides; + + // 创建新的meta + TensorMeta meta{this->_meta.dtype, new_shape, new_strides}; + + // 创建并返回新的张量 + return std::shared_ptr(new Tensor(meta, _storage, new_offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + // 计算需要复制的字节数 + size_t bytes_to_copy = this->numel() * this->elementSize(); + + // 检查存储大小是否足够 + if (this->_storage->size() < this->_offset + bytes_to_copy) { + CHECK_ARGUMENT(false, "Storage size is not sufficient"); + } + + // 获取目标内存地址(考虑偏移量) + std::byte *dst = this->data(); + + // 根据设备类型选择合适的内存复制方式 + if (this->deviceType() == LLAISYS_DEVICE_CPU) { + // CPU到CPU的复制 + std::memcpy(dst, src_, bytes_to_copy); + } else { + // 主机到设备的复制 + core::context().setDevice(this->deviceType(), this->deviceId()); + core::context().runtime().api()->memcpy_sync( + dst, + src_, + bytes_to_copy, + LLAISYS_MEMCPY_H2D + ); + // 同步设备确保复制完成 + core::context().runtime().api()->device_synchronize(); + } } tensor_t Tensor::contiguous() const { diff --git a/xmake.lua b/xmake.lua index 1f65f7a9..f18fa292 100644 --- a/xmake.lua +++ b/xmake.lua @@ -95,6 +95,22 @@ target("llaisys-ops") on_install(function (target) end) target_end() +target("llaisys-models") + set_kind("static") + add_deps("llaisys-tensor") + add_deps("llaisys-ops") + + set_languages("cxx17") + set_warnings("all", "error") + if not is_plat("windows") then + add_cxflags("-fPIC", "-Wno-unknown-pragmas") + end + + add_files("src/llaisys/models/*.cpp") + + on_install(function (target) end) +target_end() + target("llaisys") set_kind("shared") add_deps("llaisys-utils") @@ -106,6 +122,7 @@ target("llaisys") set_languages("cxx17") set_warnings("all", "error") add_files("src/llaisys/*.cc") + add_files("src/llaisys/models/*.cpp") set_installdir(".")