Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/llaisys/build_config.h.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#ifndef LLAISYS_BUILD_CONFIG_H
#define LLAISYS_BUILD_CONFIG_H

${define ENABLE_NVIDIA_API}

#endif
5 changes: 5 additions & 0 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,10 @@ __C {
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);

__export int64_t llaisysQwen2ModelInferSample(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken,
float temperature, int top_k, float top_p);

__export void llaisysQwen2ModelResetKVCache(struct LlaisysQwen2Model * model);
}
#endif // LLAISYS_MODELS_QWEN2_H
1 change: 1 addition & 0 deletions include/llaisys/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ __C {
__export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta);
__export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale);
__export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up);
__export void llaisysSample(llaisysTensor_t out_idx, llaisysTensor_t logits, float temperature, int top_k, float top_p);
}

#endif
3 changes: 3 additions & 0 deletions python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from .tensor import llaisysTensor_t
from .tensor import load_tensor
from .ops import load_ops
from .qwen2 import load_qwen2
from .qwen2 import LlaisysQwen2Meta, LlaisysQwen2Weights, llaisysQwen2Model_t


def load_shared_library():
Expand All @@ -38,6 +40,7 @@ def load_shared_library():
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
load_qwen2(LIB_LLAISYS)


__all__ = [
Expand Down
5 changes: 4 additions & 1 deletion python/llaisys/libllaisys/ops.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .tensor import llaisysTensor_t
from ctypes import c_float
from ctypes import c_float, c_int

def load_ops(lib):
lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
Expand Down Expand Up @@ -34,3 +34,6 @@ def load_ops(lib):

lib.llaisysSwiGLU.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
lib.llaisysSwiGLU.restype = None

lib.llaisysSample.argtypes = [llaisysTensor_t, llaisysTensor_t, c_float, c_int, c_float]
lib.llaisysSample.restype = None
72 changes: 72 additions & 0 deletions python/llaisys/libllaisys/qwen2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import ctypes
from ctypes import c_void_p, c_size_t, c_int, c_int64, c_float, Structure, POINTER
from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t
from .tensor import llaisysTensor_t


class LlaisysQwen2Meta(Structure):
    """ctypes mirror of the C `LlaisysQwen2Meta` hyperparameter struct.

    Field order and types are ABI-critical — they must match the C-side
    struct layout exactly; do not reorder or retype fields here.
    """
    _fields_ = [
        ("dtype", llaisysDataType_t),  # tensor element type (e.g. BF16/F16/F32)
        ("nlayer", c_size_t),  # number of transformer layers ("num_hidden_layers")
        ("hs", c_size_t),      # hidden size ("hidden_size")
        ("nh", c_size_t),      # attention heads ("num_attention_heads")
        ("nkvh", c_size_t),    # key/value heads ("num_key_value_heads")
        ("dh", c_size_t),      # per-head dimension (hs // nh)
        ("di", c_size_t),      # MLP intermediate size ("intermediate_size")
        ("maxseq", c_size_t),  # maximum sequence length the model allocates for
        ("voc", c_size_t),     # vocabulary size ("vocab_size")
        ("epsilon", c_float),  # RMSNorm epsilon ("rms_norm_eps")
        ("theta", c_float),    # RoPE base frequency ("rope_theta")
        ("end_token", c_int64),  # EOS token id ("eos_token_id")
    ]


class LlaisysQwen2Weights(Structure):
    """ctypes mirror of the C `LlaisysQwen2Weights` struct.

    Scalar fields are single tensor handles; POINTER fields are C arrays of
    per-layer tensor handles, indexed 0..nlayer-1. Field order is ABI-critical
    and must match the C struct exactly.
    """
    _fields_ = [
        ("in_embed", llaisysTensor_t),    # token embedding ("model.embed_tokens.weight")
        ("out_embed", llaisysTensor_t),   # output projection ("lm_head.weight")
        ("out_norm_w", llaisysTensor_t),  # final norm ("model.norm.weight")
        # Per-layer weights below — one handle per transformer layer.
        ("attn_norm_w", POINTER(llaisysTensor_t)),  # input_layernorm.weight
        ("attn_q_w", POINTER(llaisysTensor_t)),     # self_attn.q_proj.weight
        ("attn_q_b", POINTER(llaisysTensor_t)),     # self_attn.q_proj.bias
        ("attn_k_w", POINTER(llaisysTensor_t)),     # self_attn.k_proj.weight
        ("attn_k_b", POINTER(llaisysTensor_t)),     # self_attn.k_proj.bias
        ("attn_v_w", POINTER(llaisysTensor_t)),     # self_attn.v_proj.weight
        ("attn_v_b", POINTER(llaisysTensor_t)),     # self_attn.v_proj.bias
        ("attn_o_w", POINTER(llaisysTensor_t)),     # self_attn.o_proj.weight
        ("mlp_norm_w", POINTER(llaisysTensor_t)),   # post_attention_layernorm.weight
        ("mlp_gate_w", POINTER(llaisysTensor_t)),   # mlp.gate_proj.weight
        ("mlp_up_w", POINTER(llaisysTensor_t)),     # mlp.up_proj.weight
        ("mlp_down_w", POINTER(llaisysTensor_t)),   # mlp.down_proj.weight
    ]


llaisysQwen2Model_t = c_void_p


def load_qwen2(lib):
    """Attach ctypes argument/return signatures for the Qwen2 C API to *lib*.

    Must be called once on the loaded shared library before any of the
    llaisysQwen2Model* functions are invoked.
    """
    meta_p = POINTER(LlaisysQwen2Meta)
    weights_p = POINTER(LlaisysQwen2Weights)
    device_ids_p = POINTER(c_int)
    tokens_p = POINTER(c_int64)

    create = lib.llaisysQwen2ModelCreate
    create.argtypes = [meta_p, llaisysDeviceType_t, device_ids_p, c_int]
    create.restype = llaisysQwen2Model_t

    destroy = lib.llaisysQwen2ModelDestroy
    destroy.argtypes = [llaisysQwen2Model_t]
    destroy.restype = None

    get_weights = lib.llaisysQwen2ModelWeights
    get_weights.argtypes = [llaisysQwen2Model_t]
    get_weights.restype = weights_p

    infer = lib.llaisysQwen2ModelInfer
    infer.argtypes = [llaisysQwen2Model_t, tokens_p, c_size_t]
    infer.restype = c_int64

    infer_sample = lib.llaisysQwen2ModelInferSample
    infer_sample.argtypes = [
        llaisysQwen2Model_t, tokens_p, c_size_t,
        c_float, c_int, c_float,
    ]
    infer_sample.restype = c_int64

    reset_cache = lib.llaisysQwen2ModelResetKVCache
    reset_cache.argtypes = [llaisysQwen2Model_t]
    reset_cache.restype = None
147 changes: 138 additions & 9 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,121 @@
from typing import Sequence
from typing import Sequence, Iterator
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType
from ..libllaisys import DeviceType, DataType
from ..libllaisys import LlaisysQwen2Meta, LlaisysQwen2Weights

from pathlib import Path
import ctypes
import json
import safetensors
import torch


class Qwen2:

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
# TODO: Implement model constructor
DTYPE_MAP = {
"bfloat16": DataType.BF16,
"float16": DataType.F16,
"float32": DataType.F32,
}

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
model_path = Path(model_path)

with open(model_path / "config.json") as f:
config = json.load(f)

torch_dtype = config.get("torch_dtype", "bfloat16")
dtype = self.DTYPE_MAP.get(torch_dtype, DataType.BF16)

nh = config["num_attention_heads"]
nkvh = config["num_key_value_heads"]
hs = config["hidden_size"]
dh = hs // nh

meta = LlaisysQwen2Meta()
meta.dtype = dtype
meta.nlayer = config["num_hidden_layers"]
meta.hs = hs
meta.nh = nh
meta.nkvh = nkvh
meta.dh = dh
meta.di = config["intermediate_size"]
meta.maxseq = min(config.get("max_position_embeddings", 131072), 4096)
meta.voc = config["vocab_size"]
meta.epsilon = config.get("rms_norm_eps", 1e-6)
meta.theta = config.get("rope_theta", 10000.0)
meta.end_token = config.get("eos_token_id", 151643)
if isinstance(meta.end_token, list):
meta.end_token = meta.end_token[0]

self._nlayer = meta.nlayer
self._end_token = meta.end_token
self._device = device

device_ids = (ctypes.c_int * 1)(0)
self._model = LIB_LLAISYS.llaisysQwen2ModelCreate(
ctypes.byref(meta),
ctypes.c_int(device),
device_ids,
ctypes.c_int(1),
)

weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model)
weights = weights_ptr.contents

name_map = self._build_name_map(weights)

for file in sorted(model_path.glob("*.safetensors")):
data_ = safetensors.safe_open(file, framework="numpy", device="cpu")
data_ = safetensors.safe_open(file, framework="pt", device="cpu")
for name_ in data_.keys():
## TODO: load the model weights
pass
if name_ in name_map:
tensor_handle = name_map[name_]
t = data_.get_tensor(name_).contiguous()
LIB_LLAISYS.tensorLoad(tensor_handle, ctypes.c_void_p(t.data_ptr()))

def _build_name_map(self, weights: LlaisysQwen2Weights):
m = {}
m["model.embed_tokens.weight"] = weights.in_embed
m["lm_head.weight"] = weights.out_embed
m["model.norm.weight"] = weights.out_norm_w

for i in range(self._nlayer):
prefix = f"model.layers.{i}"
m[f"{prefix}.input_layernorm.weight"] = weights.attn_norm_w[i]
m[f"{prefix}.self_attn.q_proj.weight"] = weights.attn_q_w[i]
m[f"{prefix}.self_attn.q_proj.bias"] = weights.attn_q_b[i]
m[f"{prefix}.self_attn.k_proj.weight"] = weights.attn_k_w[i]
m[f"{prefix}.self_attn.k_proj.bias"] = weights.attn_k_b[i]
m[f"{prefix}.self_attn.v_proj.weight"] = weights.attn_v_w[i]
m[f"{prefix}.self_attn.v_proj.bias"] = weights.attn_v_b[i]
m[f"{prefix}.self_attn.o_proj.weight"] = weights.attn_o_w[i]
m[f"{prefix}.post_attention_layernorm.weight"] = weights.mlp_norm_w[i]
m[f"{prefix}.mlp.gate_proj.weight"] = weights.mlp_gate_w[i]
m[f"{prefix}.mlp.up_proj.weight"] = weights.mlp_up_w[i]
m[f"{prefix}.mlp.down_proj.weight"] = weights.mlp_down_w[i]

return m

def __del__(self):
if hasattr(self, "_model") and self._model is not None:
LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model)
self._model = None

    def reset_kvcache(self):
        """Reset the native model's KV cache (delegates to the C API)."""
        LIB_LLAISYS.llaisysQwen2ModelResetKVCache(self._model)

def _infer_one(self, token_ids, use_sample, temperature, top_k, top_p):
arr = (ctypes.c_int64 * len(token_ids))(*token_ids)
n = ctypes.c_size_t(len(token_ids))
if use_sample:
return LIB_LLAISYS.llaisysQwen2ModelInferSample(
self._model, arr, n,
ctypes.c_float(temperature),
ctypes.c_int(top_k),
ctypes.c_float(top_p),
)
else:
return LIB_LLAISYS.llaisysQwen2ModelInfer(self._model, arr, n)

def generate(
self,
Expand All @@ -27,7 +125,38 @@ def generate(
top_p: float = 0.8,
temperature: float = 0.8,
):
if max_new_tokens is None:
max_new_tokens = 128

use_sample = not (top_k == 1 and temperature == 1.0)
tokens = list(inputs)

next_token = self._infer_one(tokens, use_sample, temperature, top_k, top_p)
tokens.append(next_token)

for _ in range(max_new_tokens - 1):
if next_token == self._end_token:
break
next_token = self._infer_one([next_token], use_sample, temperature, top_k, top_p)
tokens.append(next_token)

return tokens

def generate_stream(
self,
inputs: Sequence[int],
max_new_tokens: int = 512,
top_k: int = 50,
top_p: float = 0.9,
temperature: float = 0.8,
) -> Iterator[int]:
use_sample = not (top_k == 1 and temperature == 1.0)

# TODO: Implement generate function
next_token = self._infer_one(list(inputs), use_sample, temperature, top_k, top_p)
yield next_token

return []
for _ in range(max_new_tokens - 1):
if next_token == self._end_token:
return
next_token = self._infer_one([next_token], use_sample, temperature, top_k, top_p)
yield next_token
14 changes: 11 additions & 3 deletions python/llaisys/ops.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .libllaisys import LIB_LLAISYS
from .tensor import Tensor
from ctypes import c_float, c_int
from ctypes import c_float, c_int, c_int64


class Ops:
Expand All @@ -19,9 +19,10 @@ def embedding(out: Tensor, index: Tensor, weight: Tensor):
)

@staticmethod
def linear(out: Tensor, inp: Tensor, weight: Tensor, bias: Tensor):
def linear(out: Tensor, inp: Tensor, weight: Tensor, bias: Tensor = None):
bias_handle = bias.lib_tensor() if bias is not None else None
LIB_LLAISYS.llaisysLinear(
out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), bias.lib_tensor()
out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), bias_handle
)

@staticmethod
Expand Down Expand Up @@ -53,3 +54,10 @@ def self_attention(attn_val: Tensor, q: Tensor, k: Tensor, v: Tensor, scale: flo
@staticmethod
def swiglu(out: Tensor, gate: Tensor, up: Tensor):
LIB_LLAISYS.llaisysSwiGLU(out.lib_tensor(), gate.lib_tensor(), up.lib_tensor())

@staticmethod
def sample(out_idx: Tensor, logits: Tensor, temperature: float = 1.0, top_k: int = 50, top_p: float = 0.9):
LIB_LLAISYS.llaisysSample(
out_idx.lib_tensor(), logits.lib_tensor(),
c_float(temperature), c_int(top_k), c_float(top_p)
)
Loading