Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7caa504
Pass the test; may need more review and checks
usersforsomebody Jan 22, 2026
0ac6892
Finish the task 2.1 Argmax
usersforsomebody Jan 24, 2026
f5e61c2
use the std::move
usersforsomebody Jan 25, 2026
3a1ca5c
Finish Task 2.2 Embedding
usersforsomebody Jan 25, 2026
4ed8b98
Pass Task 2-3 linear
usersforsomebody Jan 25, 2026
75306f7
Finish the Task-2.4 RMS
usersforsomebody Jan 26, 2026
e9a4937
Finish the task 2-5 RoPE and fix the kernel
usersforsomebody Jan 27, 2026
f3220e0
Finish the Task 2-6: attention is all you need
usersforsomebody Jan 28, 2026
29bd22f
Finish all the Task 1 Task 2
usersforsomebody Jan 29, 2026
5f8a982
Finish the bind
usersforsomebody Jan 31, 2026
840d134
Finish forward init
usersforsomebody Feb 1, 2026
424234f
kv_cache
usersforsomebody Feb 2, 2026
123f48f
Fix the bug
usersforsomebody Feb 4, 2026
d99d6cd
Fix another bug
usersforsomebody Feb 4, 2026
687a55a
fix: remove Windows-invalid path core.hpp:Zone.Identifier and ignore …
usersforsomebody Feb 4, 2026
702a9d9
fix(win): C4267 size_t to long in tensor view() and add utils_stub fo…
usersforsomebody Feb 4, 2026
a10c6da
fix(win): C4267 size_t to int in rearrange loop; cast argmax index to…
usersforsomebody Feb 4, 2026
5ffe1c0
fix(windows): 导出 Qwen2 推理 API 以修复 Windows DLL 符号未找到
usersforsomebody Feb 4, 2026
78ea945
perf: enable OpenMP and parallelize linear operator (4x-5x speedup)
usersforsomebody Mar 8, 2026
7c00cf3
perf(ops): 分割 linear 循环块 (Tiling) 试图优化缓存命中,但因内部 utils::cast 阻断 SIMD 导…
usersforsomebody Mar 9, 2026
fb003e3
perf(linear): AVX2 SIMD 优化 f32/bf16/fp16 矩阵乘法,提取共享 SIMD 工具函数
usersforsomebody Mar 13, 2026
2380d3e
feat(linear): 添加OpenMP多线程并行化,6线程加速约3x
usersforsomebody Mar 14, 2026
f10b579
fix(linear): 用C级if替代OpenMP if子句,消除M=1时GOMP运行时开销
usersforsomebody Mar 14, 2026
acc5d75
feat(linear): 集成 OpenBLAS sgemm 加速 linear 算子
usersforsomebody Mar 15, 2026
08b7fc7
fix(qwen2): 修复 generate 函数多生成 1 个 token 的 bug
usersforsomebody Mar 16, 2026
00c00c8
feat(linear): 运行时 BLAS 检测, 支持 MKL/OpenBLAS dlopen 加载
usersforsomebody Mar 16, 2026
88b8d9b
feat(linear): N维度并行化加速M=1解码, 添加AVX-512内核与--native编译选项
usersforsomebody Mar 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,7 @@ htmlcov/
# Windows
Thumbs.db
ehthumbs.db
desktop.ini
desktop.ini
# Windows Zone.Identifier (invalid path on Windows if committed)
*:Zone.Identifier
*.Identifier
18 changes: 16 additions & 2 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,26 @@ __C {

struct LlaisysQwen2Model;

/* Create a Qwen2 model instance.
 * meta       -- model hyperparameters (must match LlaisysQwen2Meta on the Python side)
 * device     -- target device type
 * device_ids -- device index array (may be NULL for default selection)
 * ndevice    -- number of entries in device_ids
 * dtype      -- data type used for the model weights
 *
 * NOTE(review): the stale 4-argument declaration of this function was removed;
 * two conflicting prototypes for the same symbol do not compile in C. */
__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice, llaisysDataType_t dtype);

/* Release the model and all resources owned by it. */
__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);

/* Access the model's weight table. */
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

/* Copy one named weight tensor (host memory at `data`, layout described by
 * shape/ndim/dtype) into the model. */
__export void llaisysQwen2LoadWeight(
    struct LlaisysQwen2Model * model,
    const char * name,
    void * data,
    size_t * shape,
    size_t ndim,
    llaisysDataType_t dtype);

/* Run a forward pass over `seq_len` tokens starting at KV-cache position
 * `start_pos`; returns an opaque pointer to the logits buffer. */
__export void *llaisysQwen2ModelForward(
    struct LlaisysQwen2Model * model,
    int64_t * token_ids,
    size_t seq_len,
    size_t start_pos);

/* Sample the next token id from the logits buffer returned by Forward. */
__export int llaisysQwen2Sample(void * logits_ptr);
}
#endif // LLAISYS_MODELS_QWEN2_H
Binary file added linear.prof
Binary file not shown.
44 changes: 44 additions & 0 deletions python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .llaisys_types import llaisysDataType_t, DataType
from .llaisys_types import llaisysMemcpyKind_t, MemcpyKind
from .llaisys_types import llaisysStream_t
from .llaisys_types import LlaisysQwen2Meta
from .tensor import llaisysTensor_t
from .tensor import load_tensor
from .ops import load_ops
Expand All @@ -33,11 +34,51 @@ def load_shared_library():

return ctypes.CDLL(str(lib_path))

def load_qwen2_api(lib):
    """Attach ctypes signatures for the Qwen2 model C API, if present.

    Best-effort: a library built without the Qwen2 symbols is left
    untouched, and any failure only emits a warning instead of aborting
    the package import.
    """
    try:
        create_fn = getattr(lib, 'llaisysQwen2ModelCreate', None)
        if create_fn is not None:
            create_fn.argtypes = [
                ctypes.POINTER(LlaisysQwen2Meta),  # meta
                ctypes.c_int,                      # device type
                ctypes.POINTER(ctypes.c_int),      # device ids (may be NULL)
                ctypes.c_int,                      # ndevice
                ctypes.c_int,                      # weight dtype
            ]
            create_fn.restype = ctypes.c_void_p
        load_fn = getattr(lib, 'llaisysQwen2LoadWeight', None)
        if load_fn is not None:
            load_fn.argtypes = [
                ctypes.c_void_p,                   # model handle
                ctypes.c_char_p,                   # weight name
                ctypes.c_void_p,                   # raw data pointer
                ctypes.POINTER(ctypes.c_size_t),   # shape array
                ctypes.c_size_t,                   # ndim
                ctypes.c_int,                      # dtype
            ]
            load_fn.restype = None
    except Exception as e:
        print(f"Warning: Failed to load Qwen2 API signatures. {e}")
def llaisys_qwen2_create(meta, device_id, dtype=13):
    """Create a Qwen2 model handle via the C API.

    Bug fix: ``llaisysQwen2ModelCreate`` declares five argtypes (the last
    being the weight data type), but this wrapper previously passed only
    four arguments, so ctypes raised at every call. The dtype is now
    forwarded; the default of 13 is F32 per the llaisysDataType_t mapping
    (see TYPE_MAP in qwen2.py), keeping existing callers working.

    meta      -- LlaisysQwen2Meta instance (passed by reference)
    device_id -- llaisysDeviceType_t value
    dtype     -- llaisysDataType_t value for the model weights
    """
    return LIB_LLAISYS.llaisysQwen2ModelCreate(
        ctypes.byref(meta),
        device_id,
        None,  # device_ids: NULL -> default device selection
        0,     # ndevice
        dtype,
    )
def llaisys_qwen2_load_weight(model_handle, name, data_ptr, shape, ndim, dtype):
    """Copy one named weight tensor into the C-side model.

    model_handle -- opaque model pointer from llaisys_qwen2_create
    name         -- weight name as bytes (C string)
    data_ptr     -- void pointer to the host tensor data
    shape        -- ctypes c_size_t array of dimension sizes
    ndim         -- number of dimensions in `shape`
    dtype        -- llaisysDataType_t value of the data
    """
    args = (model_handle, name, data_ptr, shape, ndim, dtype)
    LIB_LLAISYS.llaisysQwen2LoadWeight(*args)

LIB_LLAISYS = load_shared_library()
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
load_qwen2_api(LIB_LLAISYS)


__all__ = [
Expand All @@ -52,4 +93,7 @@ def load_shared_library():
"llaisysMemcpyKind_t",
"MemcpyKind",
"llaisysStream_t",
"LlaisysQwen2Meta",
"llaisys_qwen2_create",
"llaisys_qwen2_load_weight"
]
17 changes: 17 additions & 0 deletions python/llaisys/libllaisys/llaisys_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,20 @@ class MemcpyKind(IntEnum):
"MemcpyKind",
"llaisysStream_t",
]


class LlaisysQwen2Meta(ctypes.Structure):
    """ctypes mirror of the C struct ``LlaisysQwen2Meta``.

    Field order and types must stay in sync with
    include/llaisys/models/qwen2.h.
    """

    _fields_ = [
        ("dtype", ctypes.c_int),        # llaisysDataType_t of the weights
        ("nlayer", ctypes.c_size_t),    # num_hidden_layers
        ("hs", ctypes.c_size_t),        # hidden_size
        ("nh", ctypes.c_size_t),        # num_attention_heads
        ("nkvh", ctypes.c_size_t),      # num_key_value_heads
        ("dh", ctypes.c_size_t),        # presumably head dimension — confirm vs C header
        ("di", ctypes.c_size_t),        # presumably intermediate (FFN) size — confirm vs C header
        ("maxseq", ctypes.c_size_t),    # max_position_embeddings
        ("voc", ctypes.c_size_t),       # vocab_size
        ("epsilon", ctypes.c_float),    # rms_norm_eps
        ("theta", ctypes.c_float),      # rope_theta
        ("end_token", ctypes.c_int64),  # EOS token id
    ]
166 changes: 157 additions & 9 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,141 @@

from pathlib import Path
import safetensors
import ctypes
import numpy as np
import struct
import json
import mmap
import os


from ..libllaisys import(
DeviceType,
LlaisysQwen2Meta,
llaisys_qwen2_create,
llaisys_qwen2_load_weight
)
# Map safetensors dtype strings to llaisysDataType_t enum values.
TYPE_MAP={
# Must match the C++ enum llaisysDataType_t exactly.
# See include/llaisys.h and python/llaisys/libllaisys/llaisys_types.py
"F32":13,
"F16":12, # Fix: was previously mis-mapped to 11 (F8)
"BF16":19
}
class Qwen2:

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
    """Load a Qwen2 model: parse config.json, create the C-side model,
    register inference signatures, then stream every safetensors weight
    into it.

    model_path -- directory containing config.json and *.safetensors
    device     -- target device (CPU by default)

    Raises FileNotFoundError if config.json is missing.
    """
    self.lib = LIB_LLAISYS

    model_path = Path(model_path)
    config_path = model_path / "config.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Config not found at {config_path}")
    with open(config_path, "r") as f:
        config = json.load(f)

    # Resolve the weight data type first so it can go both into the meta
    # struct and into the create call.
    config_dtype_str = config.get("torch_dtype", "float16")
    dtype_keys = {"float32": "F32", "bfloat16": "BF16", "float16": "F16"}
    if config_dtype_str not in dtype_keys:
        print(f"Warning: Unknown dtype {config_dtype_str}, using F16")
    target_dtype = TYPE_MAP[dtype_keys.get(config_dtype_str, "F16")]

    meta = LlaisysQwen2Meta()
    meta.dtype = target_dtype  # fix: was never populated before
    meta.nlayer = config.get("num_hidden_layers", 28)
    meta.hs = config.get("hidden_size", 1536)
    meta.nh = config.get("num_attention_heads", 12)
    meta.nkvh = config.get("num_key_value_heads", 2)
    # Fix: the struct field is `voc`; the previous `meta.vocab_size = ...`
    # silently created a plain Python attribute and left the C field at 0.
    meta.voc = config.get("vocab_size", 151936)
    # Fix: dh/di/end_token were never populated (C side saw zeros).
    meta.dh = config.get("head_dim", meta.hs // meta.nh)
    meta.di = config.get("intermediate_size", 8960)
    meta.maxseq = config.get("max_position_embeddings", 32768)
    meta.epsilon = config.get("rms_norm_eps", 1e-6)
    meta.theta = config.get("rope_theta", 10000.0)
    meta.end_token = config.get("eos_token_id", 151643)

    self.model = self.lib.llaisysQwen2ModelCreate(
        ctypes.byref(meta),
        device.value,
        None,  # device_ids: NULL -> default device selection
        0,     # ndevice
        target_dtype,
    )

    # ctypes signatures for the inference entry points.
    self.lib.llaisysQwen2ModelForward.restype = ctypes.c_void_p
    self.lib.llaisysQwen2ModelForward.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int64),
        ctypes.c_size_t,
        ctypes.c_size_t,
    ]
    if hasattr(self.lib, 'llaisysQwen2Sample'):
        self.lib.llaisysQwen2Sample.restype = ctypes.c_int
        self.lib.llaisysQwen2Sample.argtypes = [ctypes.c_void_p]

    # Stream weights straight out of the memory-mapped safetensors files.
    # File layout: 8-byte little-endian header length, JSON header, then
    # the raw tensor bytes addressed by each entry's data_offsets.
    for file in sorted(model_path.glob("*.safetensors")):
        with open(file, 'rb') as f_obj:
            header_size = struct.unpack('<Q', f_obj.read(8))[0]
            header_data = json.loads(f_obj.read(header_size))
            data_start = 8 + header_size

            with mmap.mmap(f_obj.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                for name_, info in header_data.items():
                    if name_ == "__metadata__":
                        continue
                    dtype_str = info['dtype']
                    if dtype_str not in TYPE_MAP:
                        continue  # unsupported dtype: skip rather than crash
                    dtype = TYPE_MAP[dtype_str]

                    shape = info['shape']
                    start, end = info['data_offsets']

                    # F32 is read natively; F16/BF16 as raw 16-bit words
                    # (the C side reinterprets them by dtype tag).
                    np_dtype = np.float32 if dtype_str == 'F32' else np.uint16
                    itemsize = np.dtype(np_dtype).itemsize
                    tensor_np = np.frombuffer(
                        mm,
                        dtype=np_dtype,
                        count=(end - start) // itemsize,
                        offset=data_start + start,
                    )

                    ndim = len(shape)
                    c_shape = (ctypes.c_size_t * ndim)(*shape)
                    llaisys_qwen2_load_weight(
                        self.model,
                        name_.encode('utf-8'),
                        ctypes.c_void_p(tensor_np.ctypes.data),
                        c_shape,
                        ndim,
                        dtype,
                    )
                    # Drop the zero-copy view before the mmap closes.
                    del tensor_np

def forward(self, input_ids: Sequence[int], start_pos: int):
    """Run one forward pass over `input_ids` at KV-cache position
    `start_pos`; returns the raw C pointer to the logits buffer."""
    n = len(input_ids)

    # Marshal the Python token ids into a C int64 array.
    token_buf = (ctypes.c_int64 * n)(*input_ids)
    token_ptr = ctypes.cast(token_buf, ctypes.POINTER(ctypes.c_int64))

    return self.lib.llaisysQwen2ModelForward(
        self.model,
        token_ptr,
        n,
        start_pos,
    )

def generate(
self,
inputs: Sequence[int],
Expand All @@ -27,7 +147,35 @@ def generate(
top_p: float = 0.8,
temperature: float = 0.8,
):
if max_new_tokens is None:
max_new_tokens=100

tokens=list(inputs)
start_pos=0

eos_token_id=151643

logits_ptr=self.forward(tokens,start_pos)

next_token=self.lib.llaisysQwen2Sample(logits_ptr)
tokens.append(next_token)
if next_token ==eos_token_id:
return tokens

start_pos=len(inputs)

for i in range(max_new_tokens - 1):
input_step=[tokens[-1]]

logits_ptr=self.forward(input_step,start_pos)

next_token=self.lib.llaisysQwen2Sample(logits_ptr)

tokens.append(next_token)
# 与 HF 一样,将 EOS token 也包含在输出序列中
if next_token == eos_token_id:
break

# TODO: Implement generate function
start_pos+=1

return []
return tokens
Binary file added report_linear.pdf
Binary file not shown.
Loading
Loading