Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .tensor import llaisysTensor_t
from .tensor import load_tensor
from .ops import load_ops

from .models import load_models, LlaisysQwen2Meta, LlaisysQwen2Weights

def load_shared_library():
lib_dir = Path(__file__).parent
Expand All @@ -38,6 +38,7 @@ def load_shared_library():
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
load_models(LIB_LLAISYS)


__all__ = [
Expand All @@ -52,4 +53,5 @@ def load_shared_library():
"llaisysMemcpyKind_t",
"MemcpyKind",
"llaisysStream_t",
"LlaisysQwen2Meta", "LlaisysQwen2Weights",
]
72 changes: 72 additions & 0 deletions python/llaisys/libllaisys/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from ctypes import Structure, POINTER, c_size_t, c_float, c_int, c_int64
from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t
from .tensor import llaisysTensor_t


class LlaisysQwen2Meta(Structure):
    """ctypes mirror of the C-side Qwen2 hyper-parameter struct.

    Field order and types are ABI: they must match the C declaration exactly.
    The comments below reflect how the Python loader (models/qwen2.py) fills
    each field from config.json.
    """
    _fields_ = [
        ("dtype", llaisysDataType_t),  # element type code (the loader sets 19, used as BF16 — confirm against the C enum)
        ("nlayer", c_size_t),          # number of transformer layers (num_hidden_layers)
        ("hs", c_size_t),              # hidden size (hidden_size)
        ("nh", c_size_t),              # number of attention heads (num_attention_heads)
        ("nkvh", c_size_t),            # number of key/value heads (num_key_value_heads, GQA)
        ("dh", c_size_t),              # per-head dimension (hs // nh)
        ("di", c_size_t),              # MLP intermediate size (intermediate_size)
        ("maxseq", c_size_t),          # maximum sequence length (loader fixes this at 2048)
        ("voc", c_size_t),             # vocabulary size (vocab_size)
        ("epsilon", c_float),          # RMSNorm epsilon (rms_norm_eps)
        ("theta", c_float),            # RoPE base frequency (rope_theta)
        ("end_token", c_int64),        # EOS token id used to stop generation
    ]


class LlaisysQwen2Weights(Structure):
    """ctypes mirror of the C-side weight table for a Qwen2 model.

    Field order and types are ABI: they must match the C declaration exactly.
    Scalar fields hold a single tensor handle; POINTER fields are C arrays
    with one handle per transformer layer (the loader indexes them by layer).
    """
    _fields_ = [
        ("in_embed", llaisysTensor_t),   # token embedding (model.embed_tokens.weight)
        ("out_embed", llaisysTensor_t),  # output projection (lm_head.weight)
        ("out_norm_w", llaisysTensor_t), # final RMSNorm weight (model.norm.weight)
        # Per-layer arrays (presumably of length nlayer — allocated on the C side):
        ("attn_norm_w", POINTER(llaisysTensor_t)),  # input_layernorm weight
        ("attn_q_w", POINTER(llaisysTensor_t)),     # self_attn.q_proj weight
        ("attn_q_b", POINTER(llaisysTensor_t)),     # self_attn.q_proj bias
        ("attn_k_w", POINTER(llaisysTensor_t)),     # self_attn.k_proj weight
        ("attn_k_b", POINTER(llaisysTensor_t)),     # self_attn.k_proj bias
        ("attn_v_w", POINTER(llaisysTensor_t)),     # self_attn.v_proj weight
        ("attn_v_b", POINTER(llaisysTensor_t)),     # self_attn.v_proj bias
        ("attn_o_w", POINTER(llaisysTensor_t)),     # self_attn.o_proj weight
        ("mlp_norm_w", POINTER(llaisysTensor_t)),   # post_attention_layernorm weight
        ("mlp_gate_w", POINTER(llaisysTensor_t)),   # mlp.gate_proj weight
        ("mlp_up_w", POINTER(llaisysTensor_t)),     # mlp.up_proj weight
        ("mlp_down_w", POINTER(llaisysTensor_t)),   # mlp.down_proj weight
    ]

# Opaque handle to the C-side model object. Python never inspects its layout;
# it only passes pointers to it back and forth across the C API.
class LlaisysQwen2Model(Structure):
    pass  # empty struct used purely as a type placeholder

# Pointer type returned by llaisysQwen2ModelCreate and accepted by the other entry points.
LlaisysQwen2Model_p = POINTER(LlaisysQwen2Model)

def load_models(lib):
    """Declare ctypes argtypes/restype for the Qwen2 C entry points on *lib*.

    Must be called once after the shared library is loaded, before any of
    the Qwen2 functions are invoked.
    """
    # name -> (argtypes, restype)
    signatures = {
        "llaisysQwen2ModelCreate": (
            [
                POINTER(LlaisysQwen2Meta),  # meta
                llaisysDeviceType_t,        # device type
                POINTER(c_int),             # device_ids
                c_int,                      # ndevice
            ],
            LlaisysQwen2Model_p,
        ),
        "llaisysQwen2ModelDestroy": ([LlaisysQwen2Model_p], None),
        "llaisysQwen2ModelWeights": ([LlaisysQwen2Model_p], POINTER(LlaisysQwen2Weights)),
        "llaisysQwen2ModelInfer": (
            [
                LlaisysQwen2Model_p,
                POINTER(c_int64),  # token_ids
                c_size_t,          # ntoken
            ],
            c_int64,
        ),
    }
    for fn_name, (arg_list, ret_type) in signatures.items():
        c_fn = getattr(lib, fn_name)
        c_fn.argtypes = arg_list
        c_fn.restype = ret_type
165 changes: 152 additions & 13 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,172 @@
from typing import Sequence
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType

from ..libllaisys.models import LlaisysQwen2Meta, LlaisysQwen2Weights
from ..tensor import Tensor
import ctypes
from ctypes import POINTER, c_int, c_int64, c_float, c_size_t
from pathlib import Path
import safetensors

import json
import numpy as np
import mmap
import struct

class Qwen2:
    """Qwen2 causal language model driven through the llaisys C API.

    The constructor reads ``config.json`` from *model_path*, creates the
    C-side model via ``llaisysQwen2ModelCreate``, then memory-maps every
    ``*.safetensors`` file and copies each tensor into the C-owned weight
    slots. ``generate`` performs greedy decoding via
    ``llaisysQwen2ModelInfer``.
    """

    def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
        """Load config and weights from *model_path* and build the C model.

        Args:
            model_path: directory containing ``config.json`` and the
                ``*.safetensors`` weight files.
            device: llaisys device the model lives on (default CPU).
        """
        model_path = Path(model_path)

        # 1. Model configuration (HuggingFace-style config.json).
        with open(model_path / "config.json", "r") as f:
            config = json.load(f)

        # 2. Fill the C meta struct.
        self.meta = LlaisysQwen2Meta()
        self.meta.dtype = 19  # BF16 — must match the llaisysDataType_t enum; TODO confirm
        self.meta.nlayer = config["num_hidden_layers"]
        self.meta.hs = config["hidden_size"]
        self.meta.nh = config["num_attention_heads"]
        self.meta.nkvh = config["num_key_value_heads"]
        self.meta.dh = self.meta.hs // self.meta.nh
        self.meta.di = config["intermediate_size"]
        self.meta.maxseq = 2048  # fixed KV-cache / sequence capacity
        self.meta.voc = config["vocab_size"]
        self.meta.epsilon = config["rms_norm_eps"]
        self.meta.theta = config.get("rope_theta", 10000.0)
        # Prefer the configured EOS id; fall back to Qwen2's well-known
        # <|endoftext|> id. Some configs store a list — take the first entry.
        eos = config.get("eos_token_id", 151643)
        self.meta.end_token = eos[0] if isinstance(eos, list) else eos

        # 3. Create the C-side model (single device, id 0).
        device_ids = (c_int * 1)(0)
        self._model_handle = LIB_LLAISYS.llaisysQwen2ModelCreate(
            ctypes.byref(self.meta),
            device.value,
            device_ids,
            1,
        )

        # 4. Dereference the C weights struct so Python can fill each slot.
        self.weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model_handle).contents

        # 5. Copy tensors from disk into the C-owned buffers.
        print("Loading weights...")
        self._load_safetensors_manually(model_path)

    def _load_safetensors_manually(self, model_path: Path):
        """Parse each ``*.safetensors`` file by hand and load weights into C memory.

        safetensors layout: an 8-byte little-endian header length, a JSON
        header mapping tensor name -> {dtype, shape, data_offsets}, then a
        raw data section. The data is mmap'ed and viewed as uint16
        (assumes 2-byte elements, i.e. BF16 — matches ``meta.dtype`` above).
        """
        for file in sorted(model_path.glob("*.safetensors")):
            with open(file, "rb") as f:
                # Header length prefix.
                header_len_bytes = f.read(8)
                if not header_len_bytes:
                    continue  # empty file: nothing to load
                header_len = struct.unpack("<Q", header_len_bytes)[0]

                header = json.loads(f.read(header_len))
                data_start = 8 + header_len  # first byte of the data section

                with mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ) as mm:
                    for name, info in header.items():
                        if name == "__metadata__":
                            continue
                        target_tensor = self._map_weight_name(name)
                        if target_tensor is None:
                            continue  # tensor not used by this model

                        begin, end = info["data_offsets"]
                        # Zero-copy uint16 view over the mmap'ed region.
                        raw_data = np.frombuffer(
                            mm,
                            dtype=np.uint16,
                            count=(end - begin) // 2,
                            offset=data_start + begin,
                        )
                        # The C side copies from this address during load().
                        target_tensor.load(raw_data.ctypes.data)
                        # Drop the view before the mmap context closes.
                        del raw_data

    @staticmethod
    def _weight_key(name):
        """Translate a safetensors tensor name into a weights-struct location.

        Returns ``(field_name, None)`` for global tensors,
        ``(field_name, layer_idx)`` for per-layer tensors, or ``None`` when
        the tensor is not used by the C model.
        """
        if "model.embed_tokens.weight" in name:
            return ("in_embed", None)
        if "lm_head.weight" in name:
            return ("out_embed", None)
        if "model.norm.weight" in name:
            return ("out_norm_w", None)
        if "layers" not in name:
            return None

        # e.g. model.layers.<idx>.<module>.<proj>.<weight|bias>
        parts = name.split(".")
        layer_idx = int(parts[2])
        module = parts[3]

        if module == "input_layernorm":
            return ("attn_norm_w", layer_idx)
        if module == "post_attention_layernorm":
            return ("mlp_norm_w", layer_idx)
        if module == "self_attn":
            field = {
                ("q_proj", "weight"): "attn_q_w",
                ("k_proj", "weight"): "attn_k_w",
                ("v_proj", "weight"): "attn_v_w",
                ("o_proj", "weight"): "attn_o_w",
                ("q_proj", "bias"): "attn_q_b",
                ("k_proj", "bias"): "attn_k_b",
                ("v_proj", "bias"): "attn_v_b",
            }.get((parts[4], parts[5]))
            return (field, layer_idx) if field else None
        if module == "mlp":
            field = {
                "gate_proj": "mlp_gate_w",
                "up_proj": "mlp_up_w",
                "down_proj": "mlp_down_w",
            }.get(parts[4])
            return (field, layer_idx) if field else None
        return None

    def _map_weight_name(self, name):
        """Return the destination Tensor for *name*, or None if unused."""
        key = self._weight_key(name)
        if key is None:
            return None
        field, layer_idx = key
        slot = getattr(self.weights_ptr, field)
        if layer_idx is not None:
            slot = slot[layer_idx]  # index into the per-layer C array
        return Tensor(tensor=slot)

    def generate(
        self,
        inputs: Sequence[int],
        max_new_tokens: int = 128,
        top_k: int = 1,
        top_p: float = 0.8,
        temperature: float = 0.8,
    ):
        """Greedy-decode up to *max_new_tokens* tokens after *inputs*.

        Returns the full token sequence (prompt + generated tokens).
        top_k / top_p / temperature are accepted for interface compatibility
        but not used — the C inference call is greedy.
        """
        # Backward compatibility: older callers passed None for the default.
        if max_new_tokens is None:
            max_new_tokens = 128

        tokens = list(inputs)
        if max_new_tokens <= 0:
            return tokens  # nothing requested

        # 1. Prefill: run the whole prompt once to get the first new token.
        input_arr = (c_int64 * len(tokens))(*tokens)
        next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(
            self._model_handle,
            input_arr,
            len(tokens),
        )
        tokens.append(next_token)

        # 2. Decode loop: feed one token at a time (KV cache lives in C).
        for _ in range(max_new_tokens - 1):
            if next_token == self.meta.end_token:
                break
            input_arr = (c_int64 * 1)(next_token)
            next_token = LIB_LLAISYS.llaisysQwen2ModelInfer(
                self._model_handle,
                input_arr,
                1,
            )
            tokens.append(next_token)

        return tokens

    def __del__(self):
        # Best-effort cleanup; must never raise during interpreter shutdown,
        # when LIB_LLAISYS or its attributes may already be torn down.
        try:
            if hasattr(self, '_model_handle') and self._model_handle:
                if LIB_LLAISYS and hasattr(LIB_LLAISYS, 'llaisysQwen2ModelDestroy'):
                    LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model_handle)
                self._model_handle = None
        except Exception:
            pass
3 changes: 2 additions & 1 deletion python/setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@ install_requires =
torch>=2.4.0
transformers
accelerate
numpy>=1.21.0

[options.package_data]
llaisys =
libllaisys/*.so
libllaisys/*.dll
libllaisys/*.dylib
libllaisys/*.dylib
16 changes: 16 additions & 0 deletions src/core/runtime/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,25 @@

#include "../../device/runtime_api.hpp"
#include "../allocator/naive_allocator.hpp"
#ifdef ENABLE_NVIDIA_API
#include "../../device/nvidia/nvidia_resource.cuh"
#endif

namespace llaisys::core {
// Construct a runtime bound to (device_type, device_id): look up the device's
// runtime API table, create a stream and a naive allocator, and — for NVIDIA
// devices — attach the CUDA device resource.
Runtime::Runtime(llaisysDeviceType_t device_type, int device_id)
    : _device_type(device_type), _device_id(device_id), _is_active(false) {
    _api = llaisys::device::getRuntimeAPI(_device_type);
    _stream = _api->create_stream();
    _allocator = new allocators::NaiveAllocator(_api);
    if (device_type == LLAISYS_DEVICE_NVIDIA) {
#ifdef ENABLE_NVIDIA_API
        // Only compiled into the binary when the build detected a CUDA toolchain.
        _device_resource = new llaisys::device::nvidia::Resource(device_id);
#else
        // Without CUDA support, fail loudly at runtime rather than silently
        // running with no device resource.
        // NOTE(review): std::runtime_error needs <stdexcept> — confirm it is
        // reachable through the existing includes.
        throw std::runtime_error("Llaisys Runtime Error: The framework was compiled without NVIDIA backend support. No CUDA toolkit detected during build.");
#endif
    }
}

Runtime::~Runtime() {
Expand All @@ -19,6 +31,10 @@ Runtime::~Runtime() {
_allocator = nullptr;
_api->destroy_stream(_stream);
_api = nullptr;
if (_device_resource) {
delete _device_resource;
_device_resource = nullptr;
}
}

void Runtime::_activate() {
Expand Down
4 changes: 3 additions & 1 deletion src/core/runtime/runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "../../device/runtime_api.hpp"
#include "../allocator/allocator.hpp"
#include "../../device/device_resource.hpp"

namespace llaisys::core {
class Runtime {
Expand All @@ -11,6 +12,7 @@ class Runtime {
int _device_id;
const LlaisysRuntimeAPI *_api;
MemoryAllocator *_allocator;
llaisys::device::DeviceResource *_device_resource = nullptr;
bool _is_active;
void _activate();
void _deactivate();
Expand All @@ -35,7 +37,7 @@ class Runtime {
bool isActive() const;

const LlaisysRuntimeAPI *api() const;

llaisys::device::DeviceResource *deviceResource() const { return _device_resource; }
storage_t allocateDeviceStorage(size_t size);
;
storage_t allocateHostStorage(size_t size);
Expand Down
2 changes: 1 addition & 1 deletion src/device/device_resource.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class DeviceResource {
: _device_type(device_type),
_device_id(device_id) {
}
~DeviceResource() = default;
virtual ~DeviceResource() = default;

llaisysDeviceType_t getDeviceType() const { return _device_type; }
int getDeviceId() const { return _device_id; };
Expand Down
24 changes: 23 additions & 1 deletion src/device/nvidia/nvidia_resource.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
#include "nvidia_resource.cuh"
#include <stdexcept>
#include <cuda_runtime.h>
#include <cstdio>

namespace llaisys::device::nvidia {

Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {}
// Bind this resource to its CUDA device and create the cuBLAS handle it owns.
// NOTE(review): cudaSetDevice affects only the calling thread's current
// device — confirm later CUDA calls run on the same thread, or re-set it.
Resource::Resource(int device_id) : llaisys::device::DeviceResource(LLAISYS_DEVICE_NVIDIA, device_id) {
    cudaError_t err=cudaSetDevice(device_id);
    if (err != cudaSuccess) {
        throw std::runtime_error("Failed to set CUDA device in Resource constructor");
    }

    // Create the cuBLAS handle; released in ~Resource. If this throws, no
    // handle was created, so there is nothing to leak.
    cublasStatus_t status=cublasCreate(&_cublas_handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        throw std::runtime_error("Failed to create cuBLAS handle in Resource constructor");
    }


}
// Release the cuBLAS handle. The cublasDestroy return value is deliberately
// ignored: destructors must not throw.
Resource::~Resource() {
    if (_cublas_handle) {
        cublasDestroy(_cublas_handle);
        _cublas_handle = nullptr;
    }
}
} // namespace llaisys::device::nvidia
Loading