diff --git a/.envrc b/.envrc new file mode 100644 index 00000000..86241311 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +source .venv/bin/activate diff --git a/.gitignore b/.gitignore index e38cf574..8fb09640 100644 --- a/.gitignore +++ b/.gitignore @@ -87,4 +87,7 @@ htmlcov/ # Windows Thumbs.db ehthumbs.db -desktop.ini \ No newline at end of file +desktop.ini + +# model +DeepSeek-R1-Distill-Qwen-1.5B/ diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..6cd98dd7 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -4,14 +4,14 @@ #include "../tensor.h" __C { - struct LlaisysQwen2Meta { + typedef struct LlaisysQwen2Meta_ { llaisysDataType_t dtype; size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc; float epsilon, theta; int64_t end_token; - }; + }LlaisysQwen2Meta; - struct LlaisysQwen2Weights { + typedef struct LlaisysQwen2Weights_ { llaisysTensor_t in_embed; llaisysTensor_t out_embed; llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight @@ -27,16 +27,24 @@ __C { llaisysTensor_t *mlp_gate_w; llaisysTensor_t *mlp_up_w; llaisysTensor_t *mlp_down_w; - }; + }LlaisysQwen2Weights; - struct LlaisysQwen2Model; + typedef struct LlaisysQwen2Model_ { + LlaisysQwen2Meta* meta; + LlaisysQwen2Weights* weights = nullptr; + void *impl = nullptr; // Opaque pointer to the actual model implementation (e.g., a C++ class instance). 
+ }LlaisysQwen2Model; - __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice); - __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model); - __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); + __export LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice); - __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export void llaisysQwen2ModelDestroy(LlaisysQwen2Model * model); + + __export void llaisysQwen2modelLoadWeight(LlaisysQwen2Model * model, const void *weight_data, const char *weight_name); + + __export LlaisysQwen2Weights *llaisysQwen2ModelWeights(LlaisysQwen2Model * model); + + __export int64_t llaisysQwen2ModelInfer(LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/make.sh b/make.sh new file mode 100755 index 00000000..89daef43 --- /dev/null +++ b/make.sh @@ -0,0 +1,4 @@ +xmake +xmake install +pip install ./python + diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..d3691ce4 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -10,9 +10,10 @@ from .llaisys_types import llaisysMemcpyKind_t, MemcpyKind from .llaisys_types import llaisysStream_t from .tensor import llaisysTensor_t +from .models.qwen2 import Qwen2Meta, LlaisysQwen2Meta_t, LlaisysQwen2Model_t, LlaisysQwen2Weights_t from .tensor import load_tensor from .ops import load_ops - +from .models.qwen2 import load_qwen2 def load_shared_library(): lib_dir = Path(__file__).parent @@ -38,6 +39,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) +load_qwen2(LIB_LLAISYS) __all__ = [ @@ -46,6 +48,10 @@ def 
load_shared_library(): "llaisysStream_t", "llaisysTensor_t", "llaisysDataType_t", + "Qwen2Meta", + "LlaisysQwen2Meta_t", + "LlaisysQwen2Model_t", + "LlaisysQwen2Weights_t", "DataType", "llaisysDeviceType_t", "DeviceType", diff --git a/python/llaisys/libllaisys/llaisys_types.py b/python/llaisys/libllaisys/llaisys_types.py index c5a0b467..3da82aa6 100644 --- a/python/llaisys/libllaisys/llaisys_types.py +++ b/python/llaisys/libllaisys/llaisys_types.py @@ -49,6 +49,16 @@ class MemcpyKind(IntEnum): llaisysMemcpyKind_t = ctypes.c_int +''' +struct LlaisysQwen2Meta_ { + llaisysDataType_t dtype; + size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc; + float epsilon, theta; + int64_t end_token; +} +''' + + # Stream type (opaque pointer) llaisysStream_t = ctypes.c_void_p diff --git a/python/llaisys/libllaisys/models/__init__.py b/python/llaisys/libllaisys/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/llaisys/libllaisys/models/qwen2.py b/python/llaisys/libllaisys/models/qwen2.py new file mode 100644 index 00000000..98beb969 --- /dev/null +++ b/python/llaisys/libllaisys/models/qwen2.py @@ -0,0 +1,59 @@ +from ctypes import POINTER, c_void_p, c_size_t, c_int, c_char_p, Structure, c_float, c_int64 +from ..llaisys_types import * +from ..tensor import llaisysTensor_t + +class Qwen2Meta(Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", c_size_t), + ("hs", c_size_t), + ("nh", c_size_t), + ("nkvh", c_size_t), + ("dh", c_size_t), + ("di", c_size_t), + ("maxseq", c_size_t), + ("voc", c_size_t), + ("epsilon", c_float), + ("theta", c_float), + ("end_token", c_int64), + ] + +class LlaisysQwen2Weights(Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + ("attn_norm_w", POINTER(llaisysTensor_t)), + ("attn_q_w", POINTER(llaisysTensor_t)), + ("attn_q_b", POINTER(llaisysTensor_t)), + ("attn_k_w", POINTER(llaisysTensor_t)), + ("attn_k_b", 
POINTER(llaisysTensor_t)), + ("attn_v_w", POINTER(llaisysTensor_t)), + ("attn_v_b", POINTER(llaisysTensor_t)), + ("attn_o_w", POINTER(llaisysTensor_t)), + ("mlp_norm_w", POINTER(llaisysTensor_t)), + ("mlp_gate_w", POINTER(llaisysTensor_t)), + ("mlp_up_w", POINTER(llaisysTensor_t)), + ("mlp_down_w", POINTER(llaisysTensor_t)), + ] + +LlaisysQwen2Meta_t = POINTER(Qwen2Meta) +LlaisysQwen2Model_t = c_void_p +LlaisysQwen2Weights_t = POINTER(LlaisysQwen2Weights) + + +def load_qwen2(lib): + lib.llaisysQwen2ModelCreate.argtypes = [LlaisysQwen2Meta_t, llaisysDeviceType_t, POINTER(c_int), c_int] + lib.llaisysQwen2ModelCreate.restype = LlaisysQwen2Model_t + + lib.llaisysQwen2ModelDestroy.argtypes = [LlaisysQwen2Model_t] + lib.llaisysQwen2ModelDestroy.restype = None + + lib.llaisysQwen2modelLoadWeight.argtypes = [LlaisysQwen2Model_t, c_void_p, c_char_p] + lib.llaisysQwen2modelLoadWeight.restype = None + + lib.llaisysQwen2ModelInfer.argtypes = [LlaisysQwen2Model_t, POINTER(c_int64), c_size_t] + lib.llaisysQwen2ModelInfer.restype = c_int64 + + lib.llaisysQwen2ModelWeights.argtypes = [LlaisysQwen2Model_t] + lib.llaisysQwen2ModelWeights.restype = LlaisysQwen2Weights_t diff --git a/python/llaisys/models/__init__.py b/python/llaisys/models/__init__.py index af9918b0..c8129885 100644 --- a/python/llaisys/models/__init__.py +++ b/python/llaisys/models/__init__.py @@ -1 +1 @@ -from .qwen2 import Qwen2 +from .qwen2 import Qwen2 \ No newline at end of file diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..28eaa6e0 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,23 +1,28 @@ +import ctypes +import numpy as np +import gc +from enum import IntEnum from typing import Sequence from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType +from ..libllaisys import * +from ..tensor import Tensor +import torch + from pathlib import Path import safetensors +import json class Qwen2: - def __init__(self, 
model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor - - model_path = Path(model_path) - - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + self.model_path = Path(model_path) + self.device = device + self._load_config() + self._load_weights() + + def __del__(self): + LIB_LLAISYS.llaisysQwen2ModelDestroy(self.model) def generate( self, @@ -28,6 +33,45 @@ def generate( temperature: float = 0.8, ): - # TODO: Implement generate function + ptr = np.array(inputs, dtype=np.int64).ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + l = len(inputs) + ret = list(inputs) + id = 0 + while id != self.config["eos_token_id"]: + id = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self.model, ptr, ctypes.c_size_t(l))) + ret.append(id) + ptr = ctypes.byref(ctypes.c_int64(id)) + l = 1 + return ret + + def _load_config(self): + config_file = self.model_path / "config.json" + with open(config_file, "r") as f: + self.config = json.load(f) + meta = Qwen2Meta() + meta.dtype = ctypes.c_int(DataType.BF16) + meta.nlayer = ctypes.c_size_t(self.config["num_hidden_layers"]) + meta.hs = ctypes.c_size_t(self.config["hidden_size"]) + meta.nh = ctypes.c_size_t(self.config["num_attention_heads"]) + meta.nkvh = ctypes.c_size_t(self.config["num_key_value_heads"]) + meta.dh = ctypes.c_size_t(self.config["hidden_size"] // self.config["num_attention_heads"]) + meta.di = ctypes.c_size_t(self.config["intermediate_size"]) + meta.maxseq = ctypes.c_size_t(self.config["max_position_embeddings"]) + meta.voc = ctypes.c_size_t(self.config["vocab_size"]) + meta.epsilon = ctypes.c_float(self.config["rms_norm_eps"]) + meta.theta = ctypes.c_float(self.config["rope_theta"]) + meta.end_token = ctypes.c_int64(self.config["eos_token_id"]) + - return [] + id = ctypes.c_int(0) + self.model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), 
self.device, ctypes.byref(id), 1) + + def _load_weights(self): + for file in sorted(self.model_path.glob("*.safetensors")): + data_ = safetensors.safe_open(file, framework="torch", device="cpu") + for name_ in data_.keys(): + tensor = data_.get_tensor(name_) + name_c = ctypes.c_char_p(name_.encode('utf-8')) + LIB_LLAISYS.llaisysQwen2modelLoadWeight(self.model, ctypes.c_void_p(tensor.data_ptr()), name_c) + del tensor + gc.collect() \ No newline at end of file diff --git a/python/llaisys/tensor.py b/python/llaisys/tensor.py index 1466d851..7963dddb 100644 --- a/python/llaisys/tensor.py +++ b/python/llaisys/tensor.py @@ -9,6 +9,7 @@ DataType, ) from ctypes import c_size_t, c_int, c_ssize_t, c_void_p +import torch class Tensor: @@ -95,3 +96,26 @@ def slice(self, dim: int, start: int, end: int): self._tensor, c_size_t(dim), c_size_t(start), c_size_t(end) ) ) + + @staticmethod + def from_torch(torch_tensor: torch.Tensor): + assert torch_tensor.is_contiguous(), "Only contiguous tensors are supported" + assert torch_tensor.device.type in ["cpu", "cuda"], "Only CPU and CUDA devices are supported" + + device_type = DeviceType.CPU if torch_tensor.device.type == "cpu" else DeviceType.NVIDIA + dtype = DataType.F32 + if torch_tensor.dtype == torch.float16: + dtype = DataType.F16 + elif torch_tensor.dtype == torch.bfloat16: + dtype = DataType.BF16 + else: + raise ValueError(f"Unsupported data type: {torch_tensor.dtype}") + _tensor = Tensor( + shape=torch_tensor.shape, + dtype=dtype, + device=device_type, + device_id=torch_tensor.device.index if torch_tensor.device.type == "cuda" else 0, + ) + + _tensor.load(torch_tensor.data_ptr()) + return _tensor diff --git a/src/core/runtime/runtime.hpp b/src/core/runtime/runtime.hpp index 43235824..86f40a4e 100644 --- a/src/core/runtime/runtime.hpp +++ b/src/core/runtime/runtime.hpp @@ -37,7 +37,6 @@ class Runtime { const LlaisysRuntimeAPI *api() const; storage_t allocateDeviceStorage(size_t size); - ; storage_t allocateHostStorage(size_t 
size); void freeStorage(Storage *storage); diff --git a/src/llaisys/models/qwen2.cc b/src/llaisys/models/qwen2.cc new file mode 100644 index 00000000..61619f4e --- /dev/null +++ b/src/llaisys/models/qwen2.cc @@ -0,0 +1,436 @@ +#include "llaisys/models/qwen2.h" + +#include "../../tensor/tensor.hpp" +#include "../../ops/argmax/op.hpp" +#include "../../ops/embedding/op.hpp" +#include "../../ops/linear/op.hpp" +#include "../../ops/rms_norm/op.hpp" +#include "../../ops/add/op.hpp" +#include "../../ops/rope/op.hpp" +#include "../../ops/self_attention/op.hpp" +#include "../../ops/swiglu/op.hpp" +#include "../../utils.hpp" +#include "../../core/llaisys_core.hpp" +#include +#include +#include +#include + + +using namespace llaisys; + +struct Qwen2Weights { + tensor_t in_embed; + tensor_t out_embed; + tensor_t out_norm_w; // a.k.a. model.norm.weight + std::vector attn_norm_w; // a.k.a. input_layernorm.weight + std::vector attn_q_w; + std::vector attn_q_b; + std::vector attn_k_w; + std::vector attn_k_b; + std::vector attn_v_w; + std::vector attn_v_b; + std::vector attn_o_w; + std::vector mlp_norm_w; // a.k.a. post_attention_layernorm.weight + std::vector mlp_gate_w; + std::vector mlp_up_w; + std::vector mlp_down_w; +}; + +class debug { +public: + debug& get() { + static debug instance; + return instance; + } + + static void print_shape(tensor_t tensor, const std::string& tensor_name) { + auto shape = tensor->shape(); + std::cout << tensor_name << " shape: ["; + for (size_t i = 0; i < shape.size(); i++) { + std::cout << shape[i]; + if (i != shape.size() - 1) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + } + + template + static void print(T... 
args) { + ((std::cout << args << '\t'), ...); + std::cout << std::endl; + } + + debug(const debug&) = delete; + debug& operator=(const debug&) = delete; +private: + debug() {} +}; + +class Kv_cache { +public: + Kv_cache(size_t nlayers, size_t dh, size_t nkvh, llaisysDataType_t dtype, llaisysDeviceType_t device_type) + : nlayer_(nlayers), dh_(dh), nkvh_(nkvh), dtype_(dtype), device_type_(device_type) { + k_cache_.resize(nlayer_); + v_cache_.resize(nlayer_); + total_len_.assign(nlayer_, 0); + buf_size_.assign(nlayer_, 5); + for(size_t i = 0; i < nlayer_; ++i) { + k_cache_[i] = Tensor::create({buf_size_[i], nkvh_, dh_}, dtype_, device_type_); + v_cache_[i] = Tensor::create({buf_size_[i], nkvh_, dh_}, dtype_, device_type_); + } + } + + void add(size_t layer_id, const tensor_t& k, const tensor_t& v, size_t seq_len) {//only support cpu for now + if (layer_id >= nlayer_) { + throw std::runtime_error("Layer id exceeds the number of layers in the model."); + } + auto& k_cache = k_cache_[layer_id]; + auto& v_cache = v_cache_[layer_id]; + if (total_len_[layer_id] + seq_len > buf_size_[layer_id]) { + // If the total length exceeds the buffer size, we need to reallocate larger buffers and copy the existing data + size_t new_buf_size = std::max(buf_size_[layer_id] * 2, total_len_[layer_id] + seq_len); + auto new_k = Tensor::create({new_buf_size, nkvh_, dh_}, dtype_, device_type_); + auto new_v = Tensor::create({new_buf_size, nkvh_, dh_}, dtype_, device_type_); + llaisys::core::context().runtime().api()->memcpy_sync( + new_k->data(), + k_cache->data(), + total_len_[layer_id] * nkvh_ * dh_ * k_cache->elementSize(), + LLAISYS_MEMCPY_H2H + ); + llaisys::core::context().runtime().api()->memcpy_sync( + new_v->data(), + v_cache->data(), + total_len_[layer_id] * nkvh_ * dh_ * v_cache->elementSize(), + LLAISYS_MEMCPY_H2H + ); + k_cache_[layer_id] = new_k; + v_cache_[layer_id] = new_v; + buf_size_[layer_id] = new_buf_size; + } + // Copy the new k and v to the cache at the correct 
position. + llaisys::core::context().runtime().api()->memcpy_sync( + k_cache_[layer_id]->data() + total_len_[layer_id] * nkvh_ * dh_ * k->elementSize(), + k->data(), + k->numel() * k->elementSize(), + LLAISYS_MEMCPY_H2H + ); + llaisys::core::context().runtime().api()->memcpy_sync( + v_cache_[layer_id]->data() + total_len_[layer_id] * nkvh_ * dh_ * v->elementSize(), + v->data(), + v->numel() * v->elementSize(), + LLAISYS_MEMCPY_H2H + ); + + total_len_[layer_id] += seq_len; + } + + tensor_t k(size_t layer_id) { + if (layer_id >= nlayer_) { + throw std::runtime_error("Layer id exceeds the number of layers in the model."); + } + return k_cache_[layer_id]->slice(0, 0, total_len_[layer_id]); + } + + tensor_t v(size_t layer_id) { + if (layer_id >= nlayer_) { + throw std::runtime_error("Layer id exceeds the number of layers in the model."); + } + return v_cache_[layer_id]->slice(0, 0, total_len_[layer_id]); + } + +private: + std::vector k_cache_; + std::vector v_cache_; + size_t nlayer_; + std::vector total_len_; + std::vector buf_size_; + size_t dh_; + size_t nkvh_; + llaisysDataType_t dtype_; + llaisysDeviceType_t device_type_; +}; + +class Qwen2ModelImpl { +public: + Qwen2ModelImpl(const LlaisysQwen2Meta &meta, llaisysDeviceType_t device, const std::vector &device_ids) + : meta_(meta), device_(device), device_ids_(device_ids), kv_cache_(meta.nlayer, meta.dh, meta.nkvh, meta.dtype, device) + { + auto hs = meta_.hs; + auto nh = meta_.nh; + auto nkvh = meta_.nkvh; + auto dh = meta_.dh; + auto di = meta_.di; + + weights_.in_embed = Tensor::create({meta_.voc, hs}, meta_.dtype, device_, device_ids_[0]); + weights_.out_embed = Tensor::create({meta_.voc, hs}, meta_.dtype, device_, device_ids_[0]); + weights_.out_norm_w = Tensor::create({hs}, meta_.dtype, device_, device_ids_[0]); + for (size_t i = 0; i < meta_.nlayer; ++i) { + weights_.attn_norm_w.push_back(Tensor::create({hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_q_w.push_back(Tensor::create({nh * dh, hs}, 
meta_.dtype, device_, device_ids_[0])); + weights_.attn_q_b.push_back(Tensor::create({hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_k_w.push_back(Tensor::create({nkvh * dh, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_k_b.push_back(Tensor::create({nkvh * dh}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_v_w.push_back(Tensor::create({nkvh * dh, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_v_b.push_back(Tensor::create({nkvh * dh}, meta_.dtype, device_, device_ids_[0])); + weights_.attn_o_w.push_back(Tensor::create({hs, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_norm_w.push_back(Tensor::create({hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_gate_w.push_back(Tensor::create({di, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_up_w.push_back(Tensor::create({di, hs}, meta_.dtype, device_, device_ids_[0])); + weights_.mlp_down_w.push_back(Tensor::create({hs, di}, meta_.dtype, device_, device_ids_[0])); + } + } + ~Qwen2ModelImpl() = default; + + void loadWeight(const void* src, std::string name) { + if (name == "lm_head.weight") { + weights_.out_embed->load(src); + } else if (name.find("embed_tokens.weight") != std::string::npos) { + weights_.in_embed->load(src); + }else if (name == "model.norm.weight") { + weights_.out_norm_w->load(src); + } else { + constexpr size_t prefix_len = 13; // "model.layers." 
+ name = name.substr(prefix_len); + auto pos = name.find('.'); + auto layer_id = std::stoi(name.substr(0, pos)); + auto param_name = name.substr(pos + 1); + if (param_name == "input_layernorm.weight") { + weights_.attn_norm_w[layer_id]->load(src); + }else if (param_name == "self_attn.q_proj.weight") { + weights_.attn_q_w[layer_id]->load(src); + }else if (param_name == "self_attn.q_proj.bias") { + weights_.attn_q_b[layer_id]->load(src); + }else if (param_name == "self_attn.k_proj.weight") { + weights_.attn_k_w[layer_id]->load(src); + }else if (param_name == "self_attn.k_proj.bias") { + weights_.attn_k_b[layer_id]->load(src); + }else if (param_name == "self_attn.v_proj.weight") { + weights_.attn_v_w[layer_id]->load(src); + }else if (param_name == "self_attn.v_proj.bias") { + weights_.attn_v_b[layer_id]->load(src); + }else if (param_name == "self_attn.o_proj.weight") { + weights_.attn_o_w[layer_id]->load(src); + }else if (param_name == "post_attention_layernorm.weight") { + weights_.mlp_norm_w[layer_id]->load(src); + }else if (param_name == "mlp.gate_proj.weight") { + weights_.mlp_gate_w[layer_id]->load(src); + }else if (param_name == "mlp.up_proj.weight") { + weights_.mlp_up_w[layer_id]->load(src); + }else if (param_name == "mlp.down_proj.weight") { + weights_.mlp_down_w[layer_id]->load(src); + }else { + throw std::runtime_error("Unknown weight name: " + name); + } + } + } + + size_t forward_with_cache(const std::vector &input_ids) { + using namespace ops; + size_t seq_len = input_ids.size(); + //Allocate tensor + auto cache = get_(seq_len); + auto& tensor_input_ids = cache.tensor_input_ids; + auto& x = cache.x; + auto& x_norm = cache.x_norm; + auto& q = cache.q; + auto& k_ = cache.k_; + auto& v_ = cache.v_; + auto& q_rope = cache.q_rope; + auto& k_rope = cache.k_rope; + auto& pos = cache.pos; + auto& attn_val = cache.attn_val; + auto& attn_out = cache.attn_out; + auto& swiglu_out = cache.swiglu_out; + auto& gate_out = cache.gate_out; + auto& up_out = cache.up_out; + 
auto& norm_out = cache.norm_out; + auto& logits = cache.logits; + auto& next_token_id = cache.next_token_id; + auto& next_token_possibility = cache.next_token_possibility; + tensor_t k; + tensor_t v; + // input embedding + tensor_input_ids->load(input_ids.data()); + embedding(x, tensor_input_ids, weights_.in_embed); + + for (size_t i = 0; i < meta_.nlayer; i++) { + rms_norm(x_norm, x, weights_.attn_norm_w[i], meta_.epsilon); + //compute q, k, v + linear(q, x_norm, weights_.attn_q_w[i], weights_.attn_q_b[i]); + linear(k_, x_norm, weights_.attn_k_w[i], weights_.attn_k_b[i]); + linear(v_, x_norm, weights_.attn_v_w[i], weights_.attn_v_b[i]); + //rope + rope(q_rope, q, pos, meta_.theta); + rope(k_rope, k_, pos, meta_.theta); + //cache and load k, v + kv_cache_.add(i, k_rope, v_, seq_len); + k = kv_cache_.k(i); + v = kv_cache_.v(i); + //attention + self_attention( + attn_val, + q_rope, k, v, + 1.0f / std::sqrt(utils::cast(meta_.dh)) + ); + linear(attn_out, attn_val, weights_.attn_o_w[i]); + add(x, x, attn_out); + rms_norm(x_norm, x, weights_.mlp_norm_w[i], meta_.epsilon); + //FFN + linear(gate_out, x_norm, weights_.mlp_gate_w[i]); + linear(up_out, x_norm, weights_.mlp_up_w[i]); + swiglu(swiglu_out, gate_out, up_out); + linear(x_norm, swiglu_out, weights_.mlp_down_w[i]); + add(x, x, x_norm); + } + rms_norm(norm_out, x->slice(0, seq_len - 1, seq_len), weights_.out_norm_w, meta_.epsilon); + linear(logits, norm_out, weights_.out_embed); + argmax(next_token_id, next_token_possibility, logits); + auto ret = *reinterpret_cast(next_token_id->data()); + + //debug + debug::print("Next token id: ", ret, " possibility: ", utils::cast(*reinterpret_cast(next_token_possibility->data()))); + + return ret; + } + + + const Qwen2Weights& weights() const { + return weights_; + } +private: + struct Infer_tensors_buf { + tensor_t tensor_input_ids; + tensor_t x; + tensor_t x_norm; + tensor_t q; + tensor_t k_; + tensor_t v_; + tensor_t q_rope; + tensor_t k_rope; + tensor_t pos; + tensor_t 
attn_val; + tensor_t attn_out; + tensor_t swiglu_out; + tensor_t gate_out; + tensor_t up_out; + tensor_t norm_out; + tensor_t logits; + tensor_t next_token_id; + tensor_t next_token_possibility; + size_t seq_len = 0; + size_t total_len = 0; + }; + Infer_tensors_buf get_(size_t seq_len) { + infer_buf_.total_len += seq_len; + if (!infer_buf_.norm_out) { + infer_buf_.norm_out = Tensor::create({1, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.logits = Tensor::create({meta_.voc}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.next_token_id = Tensor::create({1}, LLAISYS_DTYPE_I64, device_, device_ids_[0]); + infer_buf_.next_token_possibility = Tensor::create({1}, meta_.dtype, device_, device_ids_[0]); + } + + if (!infer_buf_.pos || infer_buf_.total_len > infer_buf_.pos->shape()[0]) { + infer_buf_.pos = Tensor::create({infer_buf_.total_len * 2}, LLAISYS_DTYPE_I64, device_, device_ids_[0]); + auto p = reinterpret_cast(infer_buf_.pos->data()); + for (size_t i = 0; i < infer_buf_.pos->numel(); i++) { + *p++ = i; + } + } + Infer_tensors_buf cache; + if (infer_buf_.seq_len < seq_len) { + infer_buf_.tensor_input_ids = Tensor::create({seq_len}, LLAISYS_DTYPE_I64, device_, device_ids_[0]); + infer_buf_.x = Tensor::create({seq_len, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.x_norm = Tensor::create({seq_len, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.q = Tensor::create({seq_len, meta_.nh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.k_ = Tensor::create({seq_len, meta_.nkvh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.v_ = Tensor::create({seq_len, meta_.nkvh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.q_rope = Tensor::create({seq_len, meta_.nh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.k_rope = Tensor::create({seq_len, meta_.nkvh, meta_.dh}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.attn_val = Tensor::create({seq_len, meta_.hs}, 
meta_.dtype, device_, device_ids_[0]); + infer_buf_.attn_out = Tensor::create({seq_len, meta_.hs}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.swiglu_out = Tensor::create({seq_len, meta_.di}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.gate_out = Tensor::create({seq_len, meta_.di}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.up_out = Tensor::create({seq_len, meta_.di}, meta_.dtype, device_, device_ids_[0]); + infer_buf_.seq_len = seq_len; + cache = infer_buf_; + }else { + cache.tensor_input_ids = infer_buf_.tensor_input_ids->slice(0, 0, seq_len); + cache.x = infer_buf_.x->slice(0, 0, seq_len); + cache.x_norm = infer_buf_.x_norm->slice(0, 0, seq_len); + cache.q = infer_buf_.q->slice(0, 0, seq_len); + cache.k_ = infer_buf_.k_->slice(0, 0, seq_len); + cache.v_ = infer_buf_.v_->slice(0, 0, seq_len); + cache.q_rope = infer_buf_.q_rope->slice(0, 0, seq_len); + cache.k_rope = infer_buf_.k_rope->slice(0, 0, seq_len); + cache.attn_val = infer_buf_.attn_val->slice(0, 0, seq_len); + cache.attn_out = infer_buf_.attn_out->slice(0, 0, seq_len); + cache.swiglu_out = infer_buf_.swiglu_out->slice(0, 0, seq_len); + cache.gate_out = infer_buf_.gate_out->slice(0, 0, seq_len); + cache.up_out = infer_buf_.up_out->slice(0, 0, seq_len); + } + cache.pos = infer_buf_.pos->slice(0, infer_buf_.total_len - seq_len, infer_buf_.total_len); + cache.norm_out = infer_buf_.norm_out; + cache.logits = infer_buf_.logits; + cache.next_token_id = infer_buf_.next_token_id; + cache.next_token_possibility = infer_buf_.next_token_possibility; + return cache; + } +private: + LlaisysQwen2Meta meta_; + Qwen2Weights weights_; + llaisysDeviceType_t device_; + std::vector device_ids_; + Kv_cache kv_cache_; + Infer_tensors_buf infer_buf_; +}; + +__C { + LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) { + auto impl = new Qwen2ModelImpl(*meta, device, std::vector(device_ids, device_ids + ndevice)); + auto model 
= new LlaisysQwen2Model(); + model->impl = impl; + return model; + } + + void llaisysQwen2ModelDestroy(LlaisysQwen2Model * model) { + delete model->meta; + delete model->weights; + delete static_cast(model->impl); + delete model; + } + + void llaisysQwen2modelLoadWeight(LlaisysQwen2Model * model, const void *weight_data, const char *weight_name) { + auto impl = static_cast(model->impl); + impl->loadWeight(weight_data, weight_name); + } + + LlaisysQwen2Weights *llaisysQwen2ModelWeights(LlaisysQwen2Model * model) { + delete model->weights; // Free previously allocated weights if any + const auto &weights = reinterpret_cast(model->impl)->weights(); + model->weights = new LlaisysQwen2Weights{ + reinterpret_cast(weights.in_embed.get()), + reinterpret_cast(weights.out_embed.get()), + reinterpret_cast(weights.out_norm_w.get()), + reinterpret_cast(weights.attn_norm_w.data()), + reinterpret_cast(weights.attn_q_w.data()), + reinterpret_cast(weights.attn_q_b.data()), + reinterpret_cast(weights.attn_k_w.data()), + reinterpret_cast(weights.attn_k_b.data()), + reinterpret_cast(weights.attn_v_w.data()), + reinterpret_cast(weights.attn_v_b.data()), + reinterpret_cast(weights.attn_o_w.data()), + reinterpret_cast(weights.mlp_norm_w.data()), + reinterpret_cast(weights.mlp_gate_w.data()), + reinterpret_cast(weights.mlp_up_w.data()), + reinterpret_cast(weights.mlp_down_w.data()) + }; + return model->weights; + } + + int64_t llaisysQwen2ModelInfer(LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken) { + auto impl = static_cast(model->impl); + return impl->forward_with_cache(std::vector(token_ids,token_ids + ntoken)); + } +} \ No newline at end 
of file +} \ No newline at end of file diff --git a/src/ops/argmax/cpu/argmax.cpp b/src/ops/argmax/cpu/argmax.cpp new file mode 100644 index 00000000..fbe884ca --- /dev/null +++ b/src/ops/argmax/cpu/argmax.cpp @@ -0,0 +1,38 @@ +#include "argmax.hpp" + +#include "../../../utils.hpp" + +#include + +template +static void argmax_(size_t *max_idx, T *max_val, const T *input, size_t numel) { + if (numel == 0) { + return; + } + *max_idx = static_cast(0); + *max_val = input[0]; + for(size_t i = 1; i < numel; ++i) { + if (input[i] > *max_val) { + *max_val = input[i]; + *max_idx = i; + } + } +} + +namespace llaisys::ops::cpu { +void argmax(size_t *max_idx, std::byte *max_val, const std::byte *input, llaisysDataType_t val_type, size_t numel) { + switch (val_type) { + case LLAISYS_DTYPE_F32: + argmax_(max_idx, reinterpret_cast(max_val), reinterpret_cast(input), numel); + break; + case LLAISYS_DTYPE_BF16: + argmax_(max_idx, reinterpret_cast(max_val), reinterpret_cast(input), numel); + break; + case LLAISYS_DTYPE_F16: + argmax_(max_idx, reinterpret_cast(max_val), reinterpret_cast(input), numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(val_type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax.hpp b/src/ops/argmax/cpu/argmax.hpp new file mode 100644 index 00000000..db92c915 --- /dev/null +++ b/src/ops/argmax/cpu/argmax.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(size_t *max_idx, std::byte *max_val, const std::byte *input, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..7597d993 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,29 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/argmax.hpp" + namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - 
TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + CHECK_SAME_DTYPE(max_val->dtype(), vals->dtype()); + //Only support 1D tensor for now. + CHECK_ARGUMENT(vals->ndim() == 1, "Argmax: only 1D tensor is supported for now."); + + // Only support contiguous inputs for now. + ASSERT(max_idx->isContiguous() && max_val->isContiguous() && vals->isContiguous(), "Argmax: all tensors must be contiguous."); + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(reinterpret_cast(max_idx->data()), max_val->data(), vals->data(), vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding.cpp b/src/ops/embedding/cpu/embedding.cpp new file mode 100644 index 00000000..6c2d386b --- /dev/null +++ b/src/ops/embedding/cpu/embedding.cpp @@ -0,0 +1,65 @@ +#include "embedding.hpp" + +#include "../../../utils.hpp" +#include "../../../core/llaisys_core.hpp" + +template +static void embedding_( + T* out, + const int64_t* index, + const T* weight, + std::size_t embedding_dim, + std::size_t index_size +) { + for (size_t i = 0; i < index_size; ++i) { + llaisys::core::context().runtime().api()->memcpy_sync( + out + i * embedding_dim, + weight + index[i] * embedding_dim, + embedding_dim * sizeof(T), + LLAISYS_MEMCPY_H2H + ); + } +} + +namespace llaisys::ops::cpu { +void embedding( + void* out, + const void* index, + const void* weight, + size_t index_size, + size_t embedding_dim, + llaisysDataType_t data_type +) { + switch (data_type) { + case LLAISYS_DTYPE_F32: + embedding_( + reinterpret_cast(out), + reinterpret_cast(index), + reinterpret_cast(weight), + embedding_dim, + index_size + ); + break; + case LLAISYS_DTYPE_F16: + embedding_( + reinterpret_cast(out), + static_cast(index), + reinterpret_cast(weight), + 
embedding_dim, + index_size + ); + break; + case LLAISYS_DTYPE_BF16: + embedding_( + reinterpret_cast(out), + static_cast(index), + reinterpret_cast(weight), + embedding_dim, + index_size + ); + break; + default: + throw std::runtime_error("Unsupported data type in embedding operation."); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/embedding/cpu/embedding.hpp b/src/ops/embedding/cpu/embedding.hpp new file mode 100644 index 00000000..ed5ca94a --- /dev/null +++ b/src/ops/embedding/cpu/embedding.hpp @@ -0,0 +1,15 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding( + void* out, + const void* index, + const void* weight, + size_t index_size, + size_t embedding_dim, + llaisysDataType_t data_type +); +} \ No newline at end of file diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..baafb72f 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,38 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/embedding.hpp" + namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, index, weight); + CHECK_SAME_DTYPE(out->dtype(), weight->dtype()); + CHECK_ARGUMENT( + index->dtype() == LLAISYS_DTYPE_I64, + "Index tensor must be of type INT64." 
+ ); + + if(out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::embedding( + out->data(), + index->data(), + weight->data(), + index->numel(), + weight->shape().back(), + out->dtype() + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + switch (out->deviceType()) { +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear.cpp b/src/ops/linear/cpu/linear.cpp new file mode 100644 index 00000000..410d9c4c --- /dev/null +++ b/src/ops/linear/cpu/linear.cpp @@ -0,0 +1,79 @@ +#include "linear.hpp" + +#include "../../../utils.hpp" + +template +static void linear_( + T* out, + const T* in, + const T* weight, + const T* bias, + size_t batch_size, + size_t in_features, + size_t out_features +) { + using namespace llaisys::utils; + for (size_t b = 0; b < batch_size; b++) { + const T* in_batch = in + b * in_features; + T* out_batch = out + b * out_features; + + for (size_t o = 0; o < out_features; o++) { + float sum = bias ? 
cast(bias[o]) : 0.0f; + const T* weight_ = weight + o * in_features; + + for (size_t i = 0; i < in_features; i++) { + sum += cast(in_batch[i]) * cast(weight_[i]); + } + out_batch[o] = cast(sum); + } + } +} + +namespace llaisys::ops::cpu { +void linear( + void* out, + const void* in, + const void* weight, + const void* bias, + size_t batch_size, + size_t in_features, + size_t out_features, + llaisysDataType_t data_type +) { + switch (data_type) { + case LLAISYS_DTYPE_F32: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + in_features, + out_features + ); + case LLAISYS_DTYPE_F16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + in_features, + out_features + ); + case LLAISYS_DTYPE_BF16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + in_features, + out_features + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(data_type); + } + +} +} \ No newline at end of file diff --git a/src/ops/linear/cpu/linear.hpp b/src/ops/linear/cpu/linear.hpp new file mode 100644 index 00000000..795cc827 --- /dev/null +++ b/src/ops/linear/cpu/linear.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear( + void* out, + const void* in, + const void* weight, + const void* bias, + size_t batch_size, + size_t in_features, + size_t out_features, + llaisysDataType_t data_type +); +} \ No newline at end of file diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..3b4cf84a 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,79 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/linear.hpp" + namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + 
CHECK_SAME_DEVICE(out, in, weight, bias); + CHECK_ARGUMENT( + in->shape().back() == weight->shape().back(), + "Input dimension does not match weight dimension." + ); + + size_t batch_size = 1; + for (size_t i = 0; i < in->ndim() - 1; ++i) { + batch_size *= in->shape()[i]; + } + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear( + out->data(), + in->data(), + weight->data(), + bias->data(), + batch_size, + in->shape().back(), + weight->shape()[weight->ndim() - 2], + in->dtype() + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + switch (out->deviceType()) { +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} + +void linear(tensor_t out, tensor_t in, tensor_t weight) { + CHECK_SAME_DEVICE(out, in, weight); + CHECK_ARGUMENT( + in->shape().back() == weight->shape().back(), + "Input dimension does not match weight dimension." + ); + + size_t batch_size = 1; + for (size_t i = 0; i < in->ndim() - 1; ++i) { + batch_size *= in->shape()[i]; + } + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear( + out->data(), + in->data(), + weight->data(), + nullptr, + batch_size, + in->shape().back(), + weight->shape()[weight->ndim() - 2], + in->dtype() + ); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + switch (out->deviceType()) { +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + TO_BE_IMPLEMENTED(); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/linear/op.hpp b/src/ops/linear/op.hpp index 7bf06f01..b8b41d1b 100644 --- a/src/ops/linear/op.hpp +++ b/src/ops/linear/op.hpp @@ -4,4 +4,5 @@ namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias); +void linear(tensor_t out, tensor_t in, tensor_t weight); } diff --git a/src/ops/rms_norm/cpu/rms_norm.cpp 
b/src/ops/rms_norm/cpu/rms_norm.cpp new file mode 100644 index 00000000..9fd4f987 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm.cpp @@ -0,0 +1,93 @@ +#include "rms_norm.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +void rms_norm_( + T *output, + const T *input, + const T *weight, + const T *bias, + size_t batch_size, + size_t feature_size, + float epsilon +) { + for (size_t b = 0; b < batch_size; b++) { + const T *input_batch = input + b * feature_size; + T *output_batch = output + b * feature_size; + + // Compute mean square + float mean_square = 0.0f; + for (size_t i = 0; i < feature_size; i++) { + float val = llaisys::utils::cast(input_batch[i]); + mean_square += val * val; + } + mean_square /= static_cast(feature_size); + + // Compute RMS + float rms = std::sqrt(mean_square + epsilon); + + // Normalize and apply weight and bias + for (size_t i = 0; i < feature_size; i++) { + float normalized = llaisys::utils::cast(input_batch[i]) / rms; + if (weight) { + normalized *= llaisys::utils::cast(weight[i]); + } + if (bias) { + normalized += llaisys::utils::cast(bias[i]); + } + output_batch[i] = llaisys::utils::cast(normalized); + } + } +} + +namespace llaisys::ops::cpu { +void rms_norm( + std::byte *output, + const std::byte *input, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t batch_size, + size_t feature_size, + float epsilon +) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_( + reinterpret_cast(output), + reinterpret_cast(input), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + feature_size, + epsilon + ); + case LLAISYS_DTYPE_F16: + return rms_norm_( + reinterpret_cast(output), + reinterpret_cast(input), + reinterpret_cast(weight), + reinterpret_cast(bias), + batch_size, + feature_size, + epsilon + ); + case LLAISYS_DTYPE_BF16: + return rms_norm_( + reinterpret_cast(output), + reinterpret_cast(input), + reinterpret_cast(weight), + reinterpret_cast(bias), + 
batch_size, + feature_size, + epsilon + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} diff --git a/src/ops/rms_norm/cpu/rms_norm.hpp b/src/ops/rms_norm/cpu/rms_norm.hpp new file mode 100644 index 00000000..11e796f7 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm( + std::byte *output, + const std::byte *input, + const std::byte *weight, + const std::byte *bias, + llaisysDataType_t type, + size_t batch_size, + size_t feature_size, + float epsilon +); +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..f1b789b2 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,27 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm.hpp" + namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + size_t feature_size = in->shape().back(); + size_t batch_size = in->numel() / feature_size; + + // Call CPU implementation + if (in->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm( + out->data(), + in->data(), + weight->data(), + nullptr, + in->dtype(), + batch_size, + feature_size, + eps); + } + //TODO: Add more device implementations here + EXCEPTION_UNSUPPORTED_DEVICE; } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope.cpp b/src/ops/rope/cpu/rope.cpp new file mode 100644 index 00000000..dc331ac4 --- /dev/null +++ b/src/ops/rope/cpu/rope.cpp @@ -0,0 +1,66 @@ +#include "rope.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +static void rope_( + T* out, const T* in, const int64_t* pos_id, float theta, + size_t seqlen, size_t nhead, size_t d +) { + using namespace llaisys::utils; + size_t half = d / 2; + + std::vector inv_freq(half); + float log_theta = std::log(theta); + for (size_t j = 0; j < half; j++) { + 
inv_freq[j] = std::exp(-log_theta * (2.0f * j / d)); + } + + for (size_t i = 0; i < seqlen; i++) { + float p = static_cast(pos_id[i]); + for (size_t h = 0; h < nhead; h++) { + size_t base = (i * nhead + h) * d; + for (size_t j = 0; j < half; j++) { + float angle = p * inv_freq[j]; + float cos_val = std::cos(angle); + float sin_val = std::sin(angle); + + float a = cast(in[base + j]); + float b = cast(in[base + j + half]); + + out[base + j] = cast(a * cos_val - b * sin_val); + out[base + j + half] = cast(b * cos_val + a * sin_val); + } + } + } +} + +namespace llaisys::ops::cpu { + void rope( + void* out, + const void* in, + const void* pos_id, + float theta, + size_t seqlen, + size_t nhead, + size_t d, + llaisysDataType_t dtype + ) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(pos_id), theta, seqlen, nhead, d); + case LLAISYS_DTYPE_BF16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(pos_id), theta, seqlen, nhead, d); + case LLAISYS_DTYPE_F16: + return rope_(reinterpret_cast(out), reinterpret_cast(in), + reinterpret_cast(pos_id), theta, seqlen, nhead, d); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } + } +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/rope/cpu/rope.hpp b/src/ops/rope/cpu/rope.hpp new file mode 100644 index 00000000..8a332258 --- /dev/null +++ b/src/ops/rope/cpu/rope.hpp @@ -0,0 +1,17 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { + void rope( + void* out, + const void* in, + const void* pos_id, + float theta, + size_t seqlen, + size_t nhead, + size_t d, + llaisysDataType_t dtype + ); +} \ No newline at end of file diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..ad5cdbe3 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,26 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/rope.hpp" + namespace 
llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { + CHECK_SAME_DEVICE(out, in, pos_ids); + CHECK_SAME_SHAPE(out->shape(), in->shape()); + CHECK_ARGUMENT(pos_ids->ndim() == 1, "pos_ids must be 1D"); + CHECK_ARGUMENT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "pos_ids must be of type int64"); + CHECK_ARGUMENT(out->dtype() == in->dtype(), "out and in must have the same dtype"); + + size_t seqlen = in->shape()[0]; + size_t nhead = in->shape()[1]; + size_t d = in->shape()[2]; + if(out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope( + out->data(), in->data(), pos_ids->data(), + theta, seqlen, nhead, d, in->dtype() + ); + } TO_BE_IMPLEMENTED(); } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention.cpp b/src/ops/self_attention/cpu/self_attention.cpp new file mode 100644 index 00000000..3d2dbdf1 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention.cpp @@ -0,0 +1,82 @@ +#include "self_attention.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include + +template +void self_attention_( + T* out, const T* q, const T* k, const T* v, + size_t seqlen, size_t nhead, size_t d, + size_t total_len, size_t nkvhead, size_t dv, + float scale +) { + using namespace llaisys::utils; + for (size_t h = 0; h < nhead; h++) { + size_t q_base = h * d; + size_t k_base = h / (nhead / nkvhead) * d; + size_t v_base = h / (nhead / nkvhead) * dv; + + for (size_t i = 0; i < seqlen; i++) { + size_t len = total_len - seqlen + 1; + std::vector attn_weights(len + i); + for (size_t j = 0; j < i + len; j++) { + float sum = 0.f; + for (size_t n = 0; n < d; n++) { + sum += cast( + q[q_base + i * nhead * d + n] * k[k_base + j * nkvhead * d + n] + ); + } + sum *= scale; + attn_weights[j] = sum; + } + // softmax + float max_weight = *std::max_element(attn_weights.begin(), attn_weights.end()); + float sum_exp = 0.f; + for (float& w : attn_weights) { + w = std::exp(w - max_weight); + sum_exp += w; + } + for (float& w : 
attn_weights) { + w /= sum_exp; + } + // output + for (size_t j = 0; j < dv; j++) { + float sum = 0.f; + for (size_t n = 0; n < attn_weights.size(); n++) { + sum += attn_weights[n] * cast(v[v_base + n * nkvhead * dv + j]); + } + out[(h + i * nhead) * dv + j] = cast(sum); + } + } + } +} + +namespace llaisys::ops::cpu { +void self_attention( + void* out, const void* q, const void* k, const void* v, + size_t seqlen, size_t nhead, size_t d, + size_t total_len, size_t nkvhead, size_t dv, + float scale, + llaisysDataType_t dtype +) { + switch (dtype) { + case LLAISYS_DTYPE_F32: + return self_attention_(reinterpret_cast(out), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, nhead, d, total_len, nkvhead, dv, scale); + case LLAISYS_DTYPE_BF16: + return self_attention_(reinterpret_cast(out), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, nhead, d, total_len, nkvhead, dv, scale); + case LLAISYS_DTYPE_F16: + return self_attention_(reinterpret_cast(out), reinterpret_cast(q), + reinterpret_cast(k), reinterpret_cast(v), + seqlen, nhead, d, total_len, nkvhead, dv, scale); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } +} // namespace llaisys::ops::cpu +} \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention.hpp b/src/ops/self_attention/cpu/self_attention.hpp new file mode 100644 index 00000000..6aa374d0 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention.hpp @@ -0,0 +1,14 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention( + void* out, const void* q, const void* k, const void* v, + size_t seqlen, size_t nhead, size_t d, + size_t total_len, size_t nkvhead, size_t dv, + float scale, + llaisysDataType_t dtype +); +} \ No newline at end of file diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..dd6eceee 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 
+1,29 @@ #include "op.hpp" +#include "../../utils.hpp" + +#include "cpu/self_attention.hpp" + namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { + CHECK_SAME_DEVICE(attn_val, q, k, v); + + size_t seqlen = q->shape()[0]; + size_t nhead = q->shape()[1]; + size_t d = q->shape()[2]; + size_t total_len = k->shape()[0]; + size_t nkvhead = k->shape()[1]; + size_t dv = v->shape()[2]; + + if(attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention( + attn_val->data(), q->data(), k->data(), v->data(), + seqlen, nhead, d, + total_len, nkvhead, dv, + scale, + q->dtype() + ); + } TO_BE_IMPLEMENTED(); } } // namespace llaisys::ops diff --git a/src/ops/swiglu/cpu/swiglu.cpp b/src/ops/swiglu/cpu/swiglu.cpp new file mode 100644 index 00000000..c0552ca0 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu.cpp @@ -0,0 +1,46 @@ +#include "swiglu.hpp" + +#include "../../../utils.hpp" + +#include +#include + +template +void swiglu_( + T *output, + const T *input, + const T *gate, + size_t total_size +) { + for(size_t i = 0; i < total_size; i++) { + float gate_val = llaisys::utils::cast(gate[i]); + float silu = gate_val / (1.0f + std::exp(-gate_val)); + float input_val = llaisys::utils::cast(input[i]); + output[i] = llaisys::utils::cast(input_val * silu); + } +} + +namespace llaisys::ops::cpu { +void swiglu( + std::byte *output, + const std::byte *input, + const std::byte *gate, + llaisysDataType_t type, + size_t total_size +) { + switch (type) { + case LLAISYS_DTYPE_F32: + return swiglu_(reinterpret_cast(output), reinterpret_cast(input), + reinterpret_cast(gate), total_size); + case LLAISYS_DTYPE_F16: + return swiglu_(reinterpret_cast(output), reinterpret_cast(input), + reinterpret_cast(gate), total_size); + case LLAISYS_DTYPE_BF16: + return swiglu_(reinterpret_cast(output), reinterpret_cast(input), + reinterpret_cast(gate), total_size); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace 
llaisys::ops::cpu + diff --git a/src/ops/swiglu/cpu/swiglu.hpp b/src/ops/swiglu/cpu/swiglu.hpp new file mode 100644 index 00000000..1b0b91f5 --- /dev/null +++ b/src/ops/swiglu/cpu/swiglu.hpp @@ -0,0 +1,14 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void swiglu( + std::byte *output, + const std::byte *input, + const std::byte *gate, + llaisysDataType_t type, + size_t tatal_size +); +} \ No newline at end of file diff --git a/src/ops/swiglu/op.cpp b/src/ops/swiglu/op.cpp index 47edbcc9..ac4fa5bd 100644 --- a/src/ops/swiglu/op.cpp +++ b/src/ops/swiglu/op.cpp @@ -1,7 +1,21 @@ #include "op.hpp" +#include "cpu/swiglu.hpp" namespace llaisys::ops { void swiglu(tensor_t out, tensor_t gate, tensor_t up) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, gate, up); + CHECK_SAME_SHAPE(out->shape(), gate->shape(), up->shape()); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::swiglu( + out->data(), + up->data(), + gate->data(), + out->dtype(), + out->numel() + ); + } else { + TO_BE_IMPLEMENTED(); + } } } // namespace llaisys::ops diff --git a/src/tensor/tensor.cpp b/src/tensor/tensor.cpp index 2f594bb6..5e9d8730 100644 --- a/src/tensor/tensor.cpp +++ b/src/tensor/tensor.cpp @@ -26,14 +26,17 @@ tensor_t Tensor::create(const std::vector &shape, size_t total_elems = stride; size_t dtype_size = utils::dsize(dtype); + tensor_t ret; if (device_type == LLAISYS_DEVICE_CPU && core::context().runtime().deviceType() != LLAISYS_DEVICE_CPU) { auto storage = core::context().runtime().allocateHostStorage(total_elems * dtype_size); - return std::shared_ptr(new Tensor(meta, storage)); + ret = std::shared_ptr(new Tensor(meta, storage, 0)); } else { core::context().setDevice(device_type, device); auto storage = core::context().runtime().allocateDeviceStorage(total_elems * dtype_size); - return std::shared_ptr(new Tensor(meta, storage)); + ret = std::shared_ptr(new Tensor(meta, storage, 0)); } + ASSERT(ret->data() != nullptr, "Failed to 
allocate memory for tensor."); + return ret; } std::byte *Tensor::data() { @@ -164,32 +167,100 @@ void Tensor::debug() const { } bool Tensor::isContiguous() const { - TO_BE_IMPLEMENTED(); - return true; + auto& shape = this->shape(); + auto& strides = this->strides(); + size_t ndim = shape.size(); + + bool ret = true; + if(ndim != 0) { + ptrdiff_t expected_stride = 1; + for(int i = static_cast(ndim - 1); i >= 0; i--) { + if(strides[i] != expected_stride) { + ret = false; + break; + } + expected_stride *= static_cast(shape[i]); + } + } + return ret; } tensor_t Tensor::permute(const std::vector &order) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + auto meta = _meta; + for(size_t i = 0; i < order.size(); i++) { + CHECK_ARGUMENT(order[i] < order.size(), "Permute order out of range"); + meta.shape[i] = _meta.shape[order[i]]; + meta.strides[i] = _meta.strides[order[i]]; + } + return std::shared_ptr(new Tensor(meta, _storage, _offset)); } tensor_t Tensor::view(const std::vector &shape) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + size_t new_numel = 1; + for (auto s : shape) { + new_numel *= s; + } + if (new_numel != this->numel()) { + EXCEPTION_SHAPE_MISMATCH; + } + if (!this->isContiguous()) { + std::cerr << "[ERROR] View error" << EXCEPTION_LOCATION_MSG << std::endl; + throw std::runtime_error("View on non-contiguous tensor, call contiguous() first."); + } + + std::vector new_strides(shape.size()); + size_t stride = 1; + for (size_t i = 1; i <= shape.size(); i++) { + new_strides[shape.size() - i] = stride; + stride *= shape[shape.size() - i]; + } + auto new_meta = TensorMeta{ + this->dtype(), + shape, + new_strides, + }; + return tensor_t(new Tensor(new_meta, _storage, _offset)); } tensor_t Tensor::slice(size_t dim, size_t start, size_t end) const { - TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + CHECK_ARGUMENT(dim < this->ndim(), "Slice dimension out of 
range"); + CHECK_ARGUMENT(start <= end && end <= this->shape()[dim], "Slice indices out of range"); + + TensorMeta meta = _meta; + meta.shape[dim] = end - start; + size_t offset = _offset + start * _meta.strides[dim] * this->elementSize(); + return tensor_t(new Tensor(meta, _storage, offset)); } void Tensor::load(const void *src_) { - TO_BE_IMPLEMENTED(); + size_t total_bytes = this->numel() * this->elementSize(); + if (this->deviceType() == LLAISYS_DEVICE_CPU) { + core::context().runtime().api()->memcpy_sync( + this->data(), + src_, + total_bytes, + LLAISYS_MEMCPY_H2H + ); + } + else { + core::context().runtime().api()->memcpy_sync( + this->data(), + src_, + total_bytes, + LLAISYS_MEMCPY_H2D + ); + } } tensor_t Tensor::contiguous() const { + if(this->isContiguous()) { + return std::shared_ptr(new Tensor(_meta, _storage, _offset)); + } + auto new_tensor = Tensor::create(shape(), dtype(), deviceType(), deviceId()); + //size_t total_bytes = this->numel() * this->elementSize(); + TO_BE_IMPLEMENTED(); - return std::shared_ptr(new Tensor(_meta, _storage)); + return new_tensor; } tensor_t Tensor::reshape(const std::vector &shape) const { diff --git a/src/utils/types.cpp b/src/utils/types.cpp index 4163c214..958d632f 100644 --- a/src/utils/types.cpp +++ b/src/utils/types.cpp @@ -2,6 +2,58 @@ #include +namespace llaisys { + bool CustomFloat16::operator<(const CustomFloat16 &other) const { + return utils::cast(*this) < utils::cast(other); + } + bool CustomFloat16::operator>(const CustomFloat16 &other) const { + return utils::cast(*this) > utils::cast(other); + } + bool CustomFloat16::operator<=(const CustomFloat16 &other) const { + return utils::cast(*this) <= utils::cast(other); + } + bool CustomFloat16::operator>=(const CustomFloat16 &other) const { + return utils::cast(*this) >= utils::cast(other); + } + CustomFloat16 CustomFloat16::operator+(const CustomFloat16 &other) const{ + return utils::cast(utils::cast(*this) + utils::cast(other)); + } + CustomFloat16 
CustomFloat16::operator-(const CustomFloat16 &other) const { + return utils::cast(utils::cast(*this) - utils::cast(other)); + } + CustomFloat16 CustomFloat16::operator*(const CustomFloat16 &other) const { + return utils::cast(utils::cast(*this) * utils::cast(other)); + } + CustomFloat16 CustomFloat16::operator/(const CustomFloat16 &other) const { + return utils::cast(utils::cast(*this) / utils::cast(other)); + } + + bool CustomBFloat16::operator<(const CustomBFloat16 &other) const { + return utils::cast(*this) < utils::cast(other); + } + bool CustomBFloat16::operator>(const CustomBFloat16 &other) const { + return utils::cast(*this) > utils::cast(other); + } + bool CustomBFloat16::operator<=(const CustomBFloat16 &other) const { + return utils::cast(*this) <= utils::cast(other); + } + bool CustomBFloat16::operator>=(const CustomBFloat16 &other) const { + return utils::cast(*this) >= utils::cast(other); + } + CustomBFloat16 CustomBFloat16::operator+(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) + utils::cast(other)); + } + CustomBFloat16 CustomBFloat16::operator-(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) - utils::cast(other)); + } + CustomBFloat16 CustomBFloat16::operator*(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) * utils::cast(other)); + } + CustomBFloat16 CustomBFloat16::operator/(const CustomBFloat16 &other) const { + return utils::cast(utils::cast(*this) / utils::cast(other)); + } +}//llaisys + namespace llaisys::utils { float _f16_to_f32(fp16_t val) { uint16_t h = val._v; @@ -63,14 +115,6 @@ fp16_t _f32_to_f16(float val) { } } -float _bf16_to_f32(bf16_t val) { - uint32_t bits32 = static_cast(val._v) << 16; - - float out; - std::memcpy(&out, &bits32, sizeof(out)); - return out; -} - bf16_t _f32_to_bf16(float val) { uint32_t bits32; std::memcpy(&bits32, &val, sizeof(bits32)); diff --git a/src/utils/types.hpp b/src/utils/types.hpp index e09619db..a95986a7 100644 --- 
a/src/utils/types.hpp +++ b/src/utils/types.hpp @@ -2,15 +2,44 @@ #include #include +#include namespace llaisys { struct CustomFloat16 { uint16_t _v; + bool operator==(const CustomFloat16 &other) const { + return _v == other._v; + } + bool operator!=(const CustomFloat16 &other) const { + return _v != other._v; + } + bool operator<(const CustomFloat16 &other) const; + bool operator>(const CustomFloat16 &other) const; + bool operator<=(const CustomFloat16 &other) const; + bool operator>=(const CustomFloat16 &other) const; + CustomFloat16 operator+(const CustomFloat16 &other) const; + CustomFloat16 operator-(const CustomFloat16 &other) const; + CustomFloat16 operator*(const CustomFloat16 &other) const; + CustomFloat16 operator/(const CustomFloat16 &other) const; }; typedef struct CustomFloat16 fp16_t; struct CustomBFloat16 { uint16_t _v; + bool operator==(const CustomBFloat16 &other) const { + return _v == other._v; + } + bool operator!=(const CustomBFloat16 &other) const { + return _v != other._v; + } + bool operator<(const CustomBFloat16 &other) const; + bool operator>(const CustomBFloat16 &other) const; + bool operator<=(const CustomBFloat16 &other) const; + bool operator>=(const CustomBFloat16 &other) const; + CustomBFloat16 operator+(const CustomBFloat16 &other) const; + CustomBFloat16 operator-(const CustomBFloat16 &other) const; + CustomBFloat16 operator*(const CustomBFloat16 &other) const; + CustomBFloat16 operator/(const CustomBFloat16 &other) const; }; typedef struct CustomBFloat16 bf16_t; @@ -110,7 +139,13 @@ inline const char *dtype_to_str(llaisysDataType_t dtype) { float _f16_to_f32(fp16_t val); fp16_t _f32_to_f16(float val); -float _bf16_to_f32(bf16_t val); +inline float _bf16_to_f32(bf16_t val) { + uint32_t bits32 = static_cast(val._v) << 16; + + float out; + std::memcpy(&out, &bits32, sizeof(out)); + return out; +} bf16_t _f32_to_bf16(float val); template diff --git a/test/my_test.py b/test/my_test.py new file mode 100644 index 00000000..1797f31b --- 
/dev/null +++ b/test/my_test.py @@ -0,0 +1,4 @@ +from llaisys.models.qwen2 import * + +path = "/home/jie/learning/ai_camp_2026/DeepSeek-R1-Distill-Qwen-1.5B" +model = Qwen2(path) \ No newline at end of file diff --git a/xmake.lua b/xmake.lua index 1f65f7a9..43f00c57 100644 --- a/xmake.lua +++ b/xmake.lua @@ -106,6 +106,7 @@ target("llaisys") set_languages("cxx17") set_warnings("all", "error") add_files("src/llaisys/*.cc") + add_files("src/llaisys/models/*.cc") set_installdir(".")