Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .tensor import llaisysTensor_t
from .tensor import load_tensor
from .ops import load_ops

from .models import load_models, LlaisysQwen2Meta, LlaisysQwen2Weights

def load_shared_library():
lib_dir = Path(__file__).parent
Expand All @@ -38,6 +38,7 @@ def load_shared_library():
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
load_models(LIB_LLAISYS)


__all__ = [
Expand All @@ -52,4 +53,5 @@ def load_shared_library():
"llaisysMemcpyKind_t",
"MemcpyKind",
"llaisysStream_t",
"LlaisysQwen2Meta", "LlaisysQwen2Weights",
]
72 changes: 72 additions & 0 deletions python/llaisys/libllaisys/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from ctypes import Structure, POINTER, c_size_t, c_float, c_int, c_int64
from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t
from .tensor import llaisysTensor_t


class LlaisysQwen2Meta(Structure):
    """ctypes mirror of the C-side Qwen2 hyper-parameter struct.

    Field order and types are ABI: they must match the C declaration exactly.
    The comments below reflect how the Python loader (models/qwen2.py) fills
    each field from config.json.
    """
    _fields_ = [
        ("dtype", llaisysDataType_t),  # element type code (the loader sets 19, used as BF16 — confirm against the C enum)
        ("nlayer", c_size_t),          # number of transformer layers (num_hidden_layers)
        ("hs", c_size_t),              # hidden size (hidden_size)
        ("nh", c_size_t),              # number of attention heads (num_attention_heads)
        ("nkvh", c_size_t),            # number of key/value heads (num_key_value_heads, GQA)
        ("dh", c_size_t),              # per-head dimension (hs // nh)
        ("di", c_size_t),              # MLP intermediate size (intermediate_size)
        ("maxseq", c_size_t),          # maximum sequence length (loader fixes this at 2048)
        ("voc", c_size_t),             # vocabulary size (vocab_size)
        ("epsilon", c_float),          # RMSNorm epsilon (rms_norm_eps)
        ("theta", c_float),            # RoPE base frequency (rope_theta)
        ("end_token", c_int64),        # EOS token id used to stop generation
    ]


class LlaisysQwen2Weights(Structure):
    """ctypes mirror of the C-side weight table for a Qwen2 model.

    Field order and types are ABI: they must match the C declaration exactly.
    Scalar fields hold a single tensor handle; POINTER fields are C arrays
    with one handle per transformer layer (the loader indexes them by layer).
    """
    _fields_ = [
        ("in_embed", llaisysTensor_t),   # token embedding (model.embed_tokens.weight)
        ("out_embed", llaisysTensor_t),  # output projection (lm_head.weight)
        ("out_norm_w", llaisysTensor_t), # final RMSNorm weight (model.norm.weight)
        # Per-layer arrays (presumably of length nlayer — allocated on the C side):
        ("attn_norm_w", POINTER(llaisysTensor_t)),  # input_layernorm weight
        ("attn_q_w", POINTER(llaisysTensor_t)),     # self_attn.q_proj weight
        ("attn_q_b", POINTER(llaisysTensor_t)),     # self_attn.q_proj bias
        ("attn_k_w", POINTER(llaisysTensor_t)),     # self_attn.k_proj weight
        ("attn_k_b", POINTER(llaisysTensor_t)),     # self_attn.k_proj bias
        ("attn_v_w", POINTER(llaisysTensor_t)),     # self_attn.v_proj weight
        ("attn_v_b", POINTER(llaisysTensor_t)),     # self_attn.v_proj bias
        ("attn_o_w", POINTER(llaisysTensor_t)),     # self_attn.o_proj weight
        ("mlp_norm_w", POINTER(llaisysTensor_t)),   # post_attention_layernorm weight
        ("mlp_gate_w", POINTER(llaisysTensor_t)),   # mlp.gate_proj weight
        ("mlp_up_w", POINTER(llaisysTensor_t)),     # mlp.up_proj weight
        ("mlp_down_w", POINTER(llaisysTensor_t)),   # mlp.down_proj weight
    ]

# Opaque handle to the C-side model object. Python never inspects its layout;
# it only passes pointers to it back and forth across the C API.
class LlaisysQwen2Model(Structure):
    pass  # empty struct used purely as a type placeholder

# Pointer type returned by llaisysQwen2ModelCreate and accepted by the other entry points.
LlaisysQwen2Model_p = POINTER(LlaisysQwen2Model)

def load_models(lib):
    """Declare ctypes argtypes/restype for the Qwen2 C entry points on *lib*.

    Must be called once after the shared library is loaded, before any of
    the Qwen2 functions are invoked.
    """
    # name -> (argtypes, restype)
    signatures = {
        "llaisysQwen2ModelCreate": (
            [
                POINTER(LlaisysQwen2Meta),  # meta
                llaisysDeviceType_t,        # device type
                POINTER(c_int),             # device_ids
                c_int,                      # ndevice
            ],
            LlaisysQwen2Model_p,
        ),
        "llaisysQwen2ModelDestroy": ([LlaisysQwen2Model_p], None),
        "llaisysQwen2ModelWeights": ([LlaisysQwen2Model_p], POINTER(LlaisysQwen2Weights)),
        "llaisysQwen2ModelInfer": (
            [
                LlaisysQwen2Model_p,
                POINTER(c_int64),  # token_ids
                c_size_t,          # ntoken
            ],
            c_int64,
        ),
    }
    for fn_name, (arg_list, ret_type) in signatures.items():
        c_fn = getattr(lib, fn_name)
        c_fn.argtypes = arg_list
        c_fn.restype = ret_type
165 changes: 152 additions & 13 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,172 @@
from typing import Sequence
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType

from ..libllaisys.models import LlaisysQwen2Meta, LlaisysQwen2Weights
from ..tensor import Tensor
import ctypes
from ctypes import POINTER, c_int, c_int64, c_float, c_size_t
from pathlib import Path
import safetensors

import json
import numpy as np
import mmap
import struct

class Qwen2:
    """Qwen2 causal language model driven through the llaisys C API.

    The constructor reads ``config.json`` from *model_path*, creates the
    C-side model via ``llaisysQwen2ModelCreate``, then memory-maps every
    ``*.safetensors`` file and copies each tensor into the C-owned weight
    slots. ``generate`` performs greedy decoding via
    ``llaisysQwen2ModelInfer``.
    """

    def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
        """Load config and weights from *model_path* and build the C model.

        Args:
            model_path: directory containing ``config.json`` and the
                ``*.safetensors`` weight files.
            device: llaisys device the model lives on (default CPU).
        """
        model_path = Path(model_path)

        # 1. Model configuration (HuggingFace-style config.json).
        with open(model_path / "config.json", "r") as f:
            config = json.load(f)

        # 2. Fill the C meta struct.
        self.meta = LlaisysQwen2Meta()
        self.meta.dtype = 19  # BF16 — must match the llaisysDataType_t enum; TODO confirm
        self.meta.nlayer = config["num_hidden_layers"]
        self.meta.hs = config["hidden_size"]
        self.meta.nh = config["num_attention_heads"]
        self.meta.nkvh = config["num_key_value_heads"]
        self.meta.dh = self.meta.hs // self.meta.nh
        self.meta.di = config["intermediate_size"]
        self.meta.maxseq = 2048  # fixed KV-cache / sequence capacity
        self.meta.voc = config["vocab_size"]
        self.meta.epsilon = config["rms_norm_eps"]
        self.meta.theta = config.get("rope_theta", 10000.0)
        # Prefer the configured EOS id; fall back to Qwen2's well-known
        # <|endoftext|> id. Some configs store a list — take the first entry.
        eos = config.get("eos_token_id", 151643)
        self.meta.end_token = eos[0] if isinstance(eos, list) else eos

        # 3. Create the C-side model (single device, id 0).
        device_ids = (c_int * 1)(0)
        self._model_handle = LIB_LLAISYS.llaisysQwen2ModelCreate(
            ctypes.byref(self.meta),
            device.value,
            device_ids,
            1,
        )

        # 4. Dereference the C weights struct so Python can fill each slot.
        self.weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model_handle).contents

        # 5. Copy tensors from disk into the C-owned buffers.
        print("Loading weights...")
        self._load_safetensors_manually(model_path)

    def _load_safetensors_manually(self, model_path: Path):
        """Parse each ``*.safetensors`` file by hand and load weights into C memory.

        safetensors layout: an 8-byte little-endian header length, a JSON
        header mapping tensor name -> {dtype, shape, data_offsets}, then a
        raw data section. The data is mmap'ed and viewed as uint16
        (assumes 2-byte elements, i.e. BF16 — matches ``meta.dtype`` above).
        """
        for file in sorted(model_path.glob("*.safetensors")):
            with open(file, "rb") as f:
                # Header length prefix.
                header_len_bytes = f.read(8)
                if not header_len_bytes:
                    continue  # empty file: nothing to load
                header_len = struct.unpack("<Q", header_len_bytes)[0]

                header = json.loads(f.read(header_len))
                data_start = 8 + header_len  # first byte of the data section

                with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
                    for name, info in header.items():
                        if name == "__metadata__":
                            continue
                        target_tensor = self._map_weight_name(name)
                        if target_tensor is None:
                            continue  # tensor not used by this model

                        begin, end = info["data_offsets"]
                        # Zero-copy uint16 view over the mmap'ed region.
                        raw_data = np.frombuffer(
                            mm,
                            dtype=np.uint16,
                            count=(end - begin) // 2,
                            offset=data_start + begin,
                        )
                        # The C side copies from this address during load().
                        target_tensor.load(raw_data.ctypes.data)
                        # Drop the view before the mmap context closes.
                        del raw_data

    @staticmethod
    def _weight_key(name):
        """Translate a safetensors tensor name into a weights-struct location.

        Returns ``(field_name, None)`` for global tensors,
        ``(field_name, layer_idx)`` for per-layer tensors, or ``None`` when
        the tensor is not used by the C model.
        """
        if "model.embed_tokens.weight" in name:
            return ("in_embed", None)
        if "lm_head.weight" in name:
            return ("out_embed", None)
        if "model.norm.weight" in name:
            return ("out_norm_w", None)
        if "layers" not in name:
            return None

        # e.g. model.layers.<idx>.<module>.<proj>.<weight|bias>
        parts = name.split(".")
        layer_idx = int(parts[2])
        module = parts[3]

        if module == "input_layernorm":
            return ("attn_norm_w", layer_idx)
        if module == "post_attention_layernorm":
            return ("mlp_norm_w", layer_idx)
        if module == "self_attn":
            field = {
                ("q_proj", "weight"): "attn_q_w",
                ("k_proj", "weight"): "attn_k_w",
                ("v_proj", "weight"): "attn_v_w",
                ("o_proj", "weight"): "attn_o_w",
                ("q_proj", "bias"): "attn_q_b",
                ("k_proj", "bias"): "attn_k_b",
                ("v_proj", "bias"): "attn_v_b",
            }.get((parts[4], parts[5]))
            return (field, layer_idx) if field else None
        if module == "mlp":
            field = {
                "gate_proj": "mlp_gate_w",
                "up_proj": "mlp_up_w",
                "down_proj": "mlp_down_w",
            }.get(parts[4])
            return (field, layer_idx) if field else None
        return None

    def _map_weight_name(self, name):
        """Return the destination Tensor for *name*, or None if unused."""
        key = self._weight_key(name)
        if key is None:
            return None
        field, layer_idx = key
        slot = getattr(self.weights_ptr, field)
        if layer_idx is not None:
            slot = slot[layer_idx]  # index into the per-layer C array
        return Tensor(tensor=slot)

    def generate(
        self,
        inputs: Sequence[int],
        max_new_tokens: int = 128,
        top_k: int = 1,
        top_p: float = 0.8,
        temperature: float = 0.8,
    ):
        """Greedy-decode up to *max_new_tokens* tokens after *inputs*.

        Returns the full token sequence (prompt + generated tokens).
        top_k / top_p / temperature are accepted for interface compatibility
        but not used — the C inference call is greedy.
        """
        # Backward compatibility: older callers passed None for the default.
        if max_new_tokens is None:
            max_new_tokens = 128

        tokens = list(inputs)
        if max_new_tokens <= 0:
            return tokens  # nothing requested

        # 1. Prefill: run the whole prompt once to get the first new token.
        input_arr = (c_int64 * len(tokens))(*tokens)
        next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(
            self._model_handle,
            input_arr,
            len(tokens),
        )
        tokens.append(next_token)

        # 2. Decode loop: feed one token at a time (KV cache lives in C).
        for _ in range(max_new_tokens - 1):
            if next_token == self.meta.end_token:
                break
            input_arr = (c_int64 * 1)(next_token)
            next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(
                self._model_handle,
                input_arr,
                1,
            )
            tokens.append(next_token)

        return tokens

    def __del__(self):
        # Best-effort cleanup; must never raise during interpreter shutdown,
        # when LIB_LLAISYS or its attributes may already be torn down.
        try:
            if hasattr(self, '_model_handle') and self._model_handle:
                if LIB_LLAISYS and hasattr(LIB_LLAISYS, 'llaisysQwen2ModelDestroy'):
                    LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model_handle)
                self._model_handle = None
        except Exception:
            pass
3 changes: 2 additions & 1 deletion python/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ install_requires =
torch>=2.4.0
transformers
accelerate
numpy>=1.21.0

[options.package_data]
llaisys =
libllaisys/*.so
libllaisys/*.dll
libllaisys/*.dylib
libllaisys/*.dylib
16 changes: 16 additions & 0 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,25 @@

#include "../../device/runtime_api.hpp"
#include "../allocator/naive_allocator.hpp"
#ifdef ENABLE_NVIDIA_API
#include "../../device/nvidia/nvidia_resource.cuh"
#endif

namespace llaisys::core {
// Construct a runtime bound to (device_type, device_id): look up the device's
// runtime API table, create a stream and a naive allocator, and — for NVIDIA
// devices — attach the CUDA device resource.
Runtime::Runtime(llaisysDeviceType_t device_type, int device_id)
    : _device_type(device_type), _device_id(device_id), _is_active(false) {
    _api = llaisys::device::getRuntimeAPI(_device_type);
    _stream = _api->create_stream();
    _allocator = new allocators::NaiveAllocator(_api);
    if (device_type == LLAISYS_DEVICE_NVIDIA) {
#ifdef ENABLE_NVIDIA_API
        // Only compiled into the binary when the build detected a CUDA toolchain.
        _device_resource = new llaisys::device::nvidia::Resource(device_id);
#else
        // Without CUDA support, fail loudly at runtime rather than silently
        // running with no device resource.
        // NOTE(review): std::runtime_error needs <stdexcept> — confirm it is
        // reachable through the existing includes.
        throw std::runtime_error("Llaisys Runtime Error: The framework was compiled without NVIDIA backend support. No CUDA toolkit detected during build.");
#endif
    }
}

Runtime::~Runtime() {
Expand All @@ -19,6 +31,10 @@ Runtime::~Runtime() {
_allocator = nullptr;
_api->destroy_stream(_stream);
_api = nullptr;
if (_device_resource) {
delete _device_resource;
_device_resource = nullptr;
}
}

void Runtime::_activate() {
Expand Down
4 changes: 3 additions & 1 deletion src/core/runtime/runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "../../device/runtime_api.hpp"
#include "../allocator/allocator.hpp"
#include "../../device/device_resource.hpp"

namespace llaisys::core {
class Runtime {
Expand All @@ -11,6 +12,7 @@ class Runtime {
int _device_id;
const LlaisysRuntimeAPI *_api;
MemoryAllocator *_allocator;
llaisys::device::DeviceResource *_device_resource = nullptr;
bool _is_active;
void _activate();
void _deactivate();
Expand All @@ -35,7 +37,7 @@ class Runtime {
bool isActive() const;

const LlaisysRuntimeAPI *api() const;

llaisys::device::DeviceResource *deviceResource() const { return _device_resource; }
storage_t allocateDeviceStorage(size_t size);
;
storage_t allocateHostStorage(size_t size);
Expand Down
2 changes: 1 addition & 1 deletion src/device/device_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class DeviceResource {
: _device_type(device_type),
_device_id(device_id) {
}
~DeviceResource() = default;
virtual ~DeviceResource() = default;

llaisysDeviceType_t getDeviceType() const { return _device_type; }
int getDeviceId() const { return _device_id; };
Expand Down
24 changes: 23 additions & 1 deletion src/device/nvidia/nvidia_resource.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
#include "nvidia_resource.cuh"
#include <stdexcept>
#include <cuda_runtime.h>
#include <cstdio>

namespace llaisys::device::nvidia {

Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {}
// Bind this resource to its CUDA device and create the cuBLAS handle it owns.
// NOTE(review): cudaSetDevice affects only the calling thread's current
// device — confirm later CUDA calls run on the same thread, or re-set it.
Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {
    cudaError_t err=cudaSetDevice(device_id);
    if (err != cudaSuccess) {
        throw std::runtime_error("Failed to set CUDA device in Resource constructor");
    }

    // Create the cuBLAS handle; released in ~Resource. If this throws, no
    // handle was created, so there is nothing to leak.
    cublasStatus_t status=cublasCreate(&_cublas_handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        throw std::runtime_error("Failed to create cuBLAS handle in Resource constructor");
    }


}
// Release the cuBLAS handle. The cublasDestroy return value is deliberately
// ignored: destructors must not throw.
Resource::~Resource() {
    if (_cublas_handle) {
        cublasDestroy(_cublas_handle);
        _cublas_handle = nullptr;
    }
}
} // namespace llaisys::device::nvidia
Loading