Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
source .venv/bin/activate
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,4 +87,7 @@ htmlcov/
# Windows
Thumbs.db
ehthumbs.db
desktop.ini
desktop.ini

# model
DeepSeek-R1-Distill-Qwen-1.5B/
26 changes: 17 additions & 9 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
#include "../tensor.h"

__C {
struct LlaisysQwen2Meta {
typedef struct LlaisysQwen2Meta_ {
llaisysDataType_t dtype;
size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc;
float epsilon, theta;
int64_t end_token;
};
}LlaisysQwen2Meta;

struct LlaisysQwen2Weights {
typedef struct LlaisysQwen2Weights_ {
llaisysTensor_t in_embed;
llaisysTensor_t out_embed;
llaisysTensor_t out_norm_w; // a.k.a. model.norm.weight
Expand All @@ -27,16 +27,24 @@ __C {
llaisysTensor_t *mlp_gate_w;
llaisysTensor_t *mlp_up_w;
llaisysTensor_t *mlp_down_w;
};
}LlaisysQwen2Weights;

struct LlaisysQwen2Model;
typedef struct LlaisysQwen2Model_ {
LlaisysQwen2Meta* meta;
LlaisysQwen2Weights* weights = nullptr;
void *impl = nullptr; // Opaque pointer to the actual model implementation (e.g., a C++ class instance).
}LlaisysQwen2Model;

__export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);

__export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model);

__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);
__export LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice);

__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
__export void llaisysQwen2ModelDestroy(LlaisysQwen2Model * model);

__export void llaisysQwen2modelLoadWeight(LlaisysQwen2Model * model, const void *weight_data, const char *weight_name);

__export LlaisysQwen2Weights *llaisysQwen2ModelWeights(LlaisysQwen2Model * model);

__export int64_t llaisysQwen2ModelInfer(LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);
}
#endif // LLAISYS_MODELS_QWEN2_H
4 changes: 4 additions & 0 deletions make.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
xmake
xmake install
pip install ./python

8 changes: 7 additions & 1 deletion python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
from .llaisys_types import llaisysMemcpyKind_t, MemcpyKind
from .llaisys_types import llaisysStream_t
from .tensor import llaisysTensor_t
from .models.qwen2 import Qwen2Meta, LlaisysQwen2Meta_t, LlaisysQwen2Model_t, LlaisysQwen2Weights_t
from .tensor import load_tensor
from .ops import load_ops

from .models.qwen2 import load_qwen2

def load_shared_library():
lib_dir = Path(__file__).parent
Expand All @@ -38,6 +39,7 @@ def load_shared_library():
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
load_qwen2(LIB_LLAISYS)


__all__ = [
Expand All @@ -46,6 +48,10 @@ def load_shared_library():
"llaisysStream_t",
"llaisysTensor_t",
"llaisysDataType_t",
"Qwen2Meta",
"LlaisysQwen2Meta_t",
"LlaisysQwen2Model_t",
"LlaisysQwen2Weights_t",
"DataType",
"llaisysDeviceType_t",
"DeviceType",
Expand Down
10 changes: 10 additions & 0 deletions python/llaisys/libllaisys/llaisys_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,16 @@ class MemcpyKind(IntEnum):

llaisysMemcpyKind_t = ctypes.c_int

'''
struct LlaisysQwen2Meta_ {
llaisysDataType_t dtype;
size_t nlayer, hs, nh, nkvh, dh, di, maxseq, voc;
float epsilon, theta;
int64_t end_token;
}
'''


# Stream type (opaque pointer)
llaisysStream_t = ctypes.c_void_p

Expand Down
Empty file.
59 changes: 59 additions & 0 deletions python/llaisys/libllaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from ctypes import POINTER, c_void_p, c_size_t, c_int, c_char_p, Structure, c_float, c_int64
from ..llaisys_types import *
from ..tensor import llaisysTensor_t

class Qwen2Meta(Structure):
    """ctypes mirror of the C ``struct LlaisysQwen2Meta_``.

    Field order and types must match the C header exactly, since this
    structure is passed by pointer across the FFI boundary.
    """

    # dtype first, then the eight size_t hyperparameters in declaration
    # order, then the two floats and the end-of-sequence token id.
    _fields_ = (
        [("dtype", llaisysDataType_t)]
        + [(field_name, c_size_t)
           for field_name in ("nlayer", "hs", "nh", "nkvh",
                              "dh", "di", "maxseq", "voc")]
        + [("epsilon", c_float), ("theta", c_float), ("end_token", c_int64)]
    )

class LlaisysQwen2Weights(Structure):
    """ctypes mirror of the C ``struct LlaisysQwen2Weights_``.

    The first three entries are single tensor handles; the remaining
    entries are per-layer arrays (pointers to tensor handles). Order
    must match the C header exactly.
    """

    _fields_ = (
        [(field_name, llaisysTensor_t)
         for field_name in ("in_embed", "out_embed", "out_norm_w")]
        + [(field_name, POINTER(llaisysTensor_t))
           for field_name in (
               "attn_norm_w",
               "attn_q_w", "attn_q_b",
               "attn_k_w", "attn_k_b",
               "attn_v_w", "attn_v_b",
               "attn_o_w",
               "mlp_norm_w", "mlp_gate_w", "mlp_up_w", "mlp_down_w",
           )]
    )

# ctypes aliases for the pointer types used in the C API signatures.
LlaisysQwen2Meta_t = POINTER(Qwen2Meta)  # const LlaisysQwen2Meta *
LlaisysQwen2Model_t = c_void_p  # opaque LlaisysQwen2Model * handle
LlaisysQwen2Weights_t = POINTER(LlaisysQwen2Weights)  # LlaisysQwen2Weights *


def load_qwen2(lib):
    """Attach argtypes/restype signatures for the Qwen2 C entry points to *lib*.

    Must be called once on the loaded shared library before any of the
    ``llaisysQwen2*`` functions are invoked.
    """
    # symbol name -> (argtypes, restype); note the lowercase 'm' in
    # llaisysQwen2modelLoadWeight, matching the C header's spelling.
    signatures = {
        "llaisysQwen2ModelCreate": (
            [LlaisysQwen2Meta_t, llaisysDeviceType_t, POINTER(c_int), c_int],
            LlaisysQwen2Model_t,
        ),
        "llaisysQwen2ModelDestroy": ([LlaisysQwen2Model_t], None),
        "llaisysQwen2modelLoadWeight": (
            [LlaisysQwen2Model_t, c_void_p, c_char_p],
            None,
        ),
        "llaisysQwen2ModelInfer": (
            [LlaisysQwen2Model_t, POINTER(c_int64), c_size_t],
            c_int64,
        ),
        "llaisysQwen2ModelWeights": ([LlaisysQwen2Model_t], LlaisysQwen2Weights_t),
    }
    for symbol, (argtypes, restype) in signatures.items():
        fn = getattr(lib, symbol)
        fn.argtypes = argtypes
        fn.restype = restype
2 changes: 1 addition & 1 deletion python/llaisys/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .qwen2 import Qwen2
from .qwen2 import Qwen2
70 changes: 57 additions & 13 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
import ctypes
import numpy as np
import gc
from enum import IntEnum
from typing import Sequence
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType
from ..libllaisys import *
from ..tensor import Tensor
import torch


from pathlib import Path
import safetensors
import json


class Qwen2:

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
# TODO: Implement model constructor

model_path = Path(model_path)

for file in sorted(model_path.glob("*.safetensors")):
data_ = safetensors.safe_open(file, framework="numpy", device="cpu")
for name_ in data_.keys():
## TODO: load the model weights
pass
self.model_path = Path(model_path)
self.device = device
self._load_config()
self._load_weights()

def __delete__(self):
LIB_LLAISYS.llaisysQwen2ModelDestroy(self.model)

def generate(
self,
Expand All @@ -28,6 +33,45 @@ def generate(
temperature: float = 0.8,
):

# TODO: Implement generate function
ptr = np.array(inputs, dtype=np.int64).ctypes.data_as(ctypes.POINTER(ctypes.c_int64))
l = len(inputs)
ret = list(inputs)
id = 0
while id != self.config["eos_token_id"]:
id = int(LIB_LLAISYS.llaisysQwen2ModelInfer(self.model, ptr, ctypes.c_size_t(l)))
ret.append(id)
ptr = ctypes.byref(ctypes.c_int64(id))
l = 1
return ret

def _load_config(self):
config_file = self.model_path / "config.json"
with open(config_file, "r") as f:
self.config = json.load(f)
meta = Qwen2Meta()
meta.dtype = ctypes.c_int(DataType.BF16)
meta.nlayer = ctypes.c_size_t(self.config["num_hidden_layers"])
meta.hs = ctypes.c_size_t(self.config["hidden_size"])
meta.nh = ctypes.c_size_t(self.config["num_attention_heads"])
meta.nkvh = ctypes.c_size_t(self.config["num_key_value_heads"])
meta.dh = ctypes.c_size_t(self.config["hidden_size"] // self.config["num_attention_heads"])
meta.di = ctypes.c_size_t(self.config["intermediate_size"])
meta.maxseq = ctypes.c_size_t(self.config["max_position_embeddings"])
meta.voc = ctypes.c_size_t(self.config["vocab_size"])
meta.epsilon = ctypes.c_float(self.config["rms_norm_eps"])
meta.theta = ctypes.c_float(self.config["rope_theta"])
meta.end_token = ctypes.c_int64(self.config["eos_token_id"])


return []
id = ctypes.c_int(0)
self.model = LIB_LLAISYS.llaisysQwen2ModelCreate(ctypes.byref(meta), self.device, ctypes.byref(id), 1)

def _load_weights(self):
for file in sorted(self.model_path.glob("*.safetensors")):
data_ = safetensors.safe_open(file, framework="torch", device="cpu")
for name_ in data_.keys():
tensor = data_.get_tensor(name_)
name_c = ctypes.c_char_p(name_.encode('utf-8'))
LIB_LLAISYS.llaisysQwen2modelLoadWeight(self.model, ctypes.c_void_p(tensor.data_ptr()), name_c)
del tensor
gc.collect()
24 changes: 24 additions & 0 deletions python/llaisys/tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
DataType,
)
from ctypes import c_size_t, c_int, c_ssize_t, c_void_p
import torch


class Tensor:
Expand Down Expand Up @@ -95,3 +96,26 @@ def slice(self, dim: int, start: int, end: int):
self._tensor, c_size_t(dim), c_size_t(start), c_size_t(end)
)
)

@staticmethod
def from_torch(torch_tensor: torch.Tensor):
assert torch_tensor.is_contiguous(), "Only contiguous tensors are supported"
assert torch_tensor.device.type in ["cpu", "cuda"], "Only CPU and CUDA devices are supported"

device_type = DeviceType.CPU if torch_tensor.device.type == "cpu" else DeviceType.NVIDIA
dtype = DataType.F32
if torch_tensor.dtype == torch.float16:
dtype = DataType.F16
elif torch_tensor.dtype == torch.bfloat16:
dtype = DataType.BF16
else:
raise ValueError(f"Unsupported data type: {torch_tensor.dtype}")
_tensor = Tensor(
shape=torch_tensor.shape,
dtype=dtype,
device=device_type,
device_id=torch_tensor.device.index if torch_tensor.device.type == "cuda" else 0,
)

_tensor.load(torch_tensor.data_ptr())
return _tensor
1 change: 0 additions & 1 deletion src/core/runtime/runtime.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class Runtime {
const LlaisysRuntimeAPI *api() const;

storage_t allocateDeviceStorage(size_t size);
;
storage_t allocateHostStorage(size_t size);
void freeStorage(Storage *storage);

Expand Down
Loading