Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions include/llaisys/build_config.h.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#ifndef LLAISYS_BUILD_CONFIG_H
#define LLAISYS_BUILD_CONFIG_H

${define ENABLE_NVIDIA_API}

#endif
5 changes: 5 additions & 0 deletions include/llaisys/models/qwen2.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,10 @@ __C {
__export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model);

__export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken);

__export int64_t llaisysQwen2ModelInferSample(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken,
float temperature, int top_k, float top_p);

__export void llaisysQwen2ModelResetKVCache(struct LlaisysQwen2Model * model);
}
#endif // LLAISYS_MODELS_QWEN2_H
1 change: 1 addition & 0 deletions include/llaisys/ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ __C {
__export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta);
__export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale);
__export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up);
__export void llaisysSample(llaisysTensor_t out_idx, llaisysTensor_t logits, float temperature, int top_k, float top_p);
}

#endif
3 changes: 3 additions & 0 deletions python/llaisys/libllaisys/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from .tensor import llaisysTensor_t
from .tensor import load_tensor
from .ops import load_ops
from .qwen2 import load_qwen2
from .qwen2 import LlaisysQwen2Meta, LlaisysQwen2Weights, llaisysQwen2Model_t


def load_shared_library():
Expand All @@ -38,6 +40,7 @@ def load_shared_library():
load_runtime(LIB_LLAISYS)
load_tensor(LIB_LLAISYS)
load_ops(LIB_LLAISYS)
load_qwen2(LIB_LLAISYS)


__all__ = [
Expand Down
5 changes: 4 additions & 1 deletion python/llaisys/libllaisys/ops.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .tensor import llaisysTensor_t
from ctypes import c_float
from ctypes import c_float, c_int

def load_ops(lib):
lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
Expand Down Expand Up @@ -34,3 +34,6 @@ def load_ops(lib):

lib.llaisysSwiGLU.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t]
lib.llaisysSwiGLU.restype = None

lib.llaisysSample.argtypes = [llaisysTensor_t, llaisysTensor_t, c_float, c_int, c_float]
lib.llaisysSample.restype = None
72 changes: 72 additions & 0 deletions python/llaisys/libllaisys/qwen2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import ctypes
from ctypes import c_void_p, c_size_t, c_int, c_int64, c_float, Structure, POINTER
from .llaisys_types import llaisysDataType_t, llaisysDeviceType_t
from .tensor import llaisysTensor_t


class LlaisysQwen2Meta(Structure):
    """ctypes mirror of the C `LlaisysQwen2Meta` hyperparameter struct.

    Field order and types are ABI-critical — they must match the C-side
    struct layout exactly; do not reorder or retype fields here.
    """
    _fields_ = [
        ("dtype", llaisysDataType_t),  # tensor element type (e.g. BF16/F16/F32)
        ("nlayer", c_size_t),  # number of transformer layers ("num_hidden_layers")
        ("hs", c_size_t),      # hidden size ("hidden_size")
        ("nh", c_size_t),      # attention heads ("num_attention_heads")
        ("nkvh", c_size_t),    # key/value heads ("num_key_value_heads")
        ("dh", c_size_t),      # per-head dimension (hs // nh)
        ("di", c_size_t),      # MLP intermediate size ("intermediate_size")
        ("maxseq", c_size_t),  # maximum sequence length the model allocates for
        ("voc", c_size_t),     # vocabulary size ("vocab_size")
        ("epsilon", c_float),  # RMSNorm epsilon ("rms_norm_eps")
        ("theta", c_float),    # RoPE base frequency ("rope_theta")
        ("end_token", c_int64),  # EOS token id ("eos_token_id")
    ]


class LlaisysQwen2Weights(Structure):
    """ctypes mirror of the C `LlaisysQwen2Weights` struct.

    Scalar fields are single tensor handles; POINTER fields are C arrays of
    per-layer tensor handles, indexed 0..nlayer-1. Field order is ABI-critical
    and must match the C struct exactly.
    """
    _fields_ = [
        ("in_embed", llaisysTensor_t),    # token embedding ("model.embed_tokens.weight")
        ("out_embed", llaisysTensor_t),   # output projection ("lm_head.weight")
        ("out_norm_w", llaisysTensor_t),  # final norm ("model.norm.weight")
        # Per-layer weights below — one handle per transformer layer.
        ("attn_norm_w", POINTER(llaisysTensor_t)),  # input_layernorm.weight
        ("attn_q_w", POINTER(llaisysTensor_t)),     # self_attn.q_proj.weight
        ("attn_q_b", POINTER(llaisysTensor_t)),     # self_attn.q_proj.bias
        ("attn_k_w", POINTER(llaisysTensor_t)),     # self_attn.k_proj.weight
        ("attn_k_b", POINTER(llaisysTensor_t)),     # self_attn.k_proj.bias
        ("attn_v_w", POINTER(llaisysTensor_t)),     # self_attn.v_proj.weight
        ("attn_v_b", POINTER(llaisysTensor_t)),     # self_attn.v_proj.bias
        ("attn_o_w", POINTER(llaisysTensor_t)),     # self_attn.o_proj.weight
        ("mlp_norm_w", POINTER(llaisysTensor_t)),   # post_attention_layernorm.weight
        ("mlp_gate_w", POINTER(llaisysTensor_t)),   # mlp.gate_proj.weight
        ("mlp_up_w", POINTER(llaisysTensor_t)),     # mlp.up_proj.weight
        ("mlp_down_w", POINTER(llaisysTensor_t)),   # mlp.down_proj.weight
    ]


llaisysQwen2Model_t = c_void_p


def load_qwen2(lib):
    """Attach ctypes argument/return signatures for the Qwen2 C API to *lib*.

    Must be called once on the loaded shared library before any of the
    llaisysQwen2Model* functions are invoked.
    """
    meta_p = POINTER(LlaisysQwen2Meta)
    weights_p = POINTER(LlaisysQwen2Weights)
    device_ids_p = POINTER(c_int)
    tokens_p = POINTER(c_int64)

    create = lib.llaisysQwen2ModelCreate
    create.argtypes = [meta_p, llaisysDeviceType_t, device_ids_p, c_int]
    create.restype = llaisysQwen2Model_t

    destroy = lib.llaisysQwen2ModelDestroy
    destroy.argtypes = [llaisysQwen2Model_t]
    destroy.restype = None

    get_weights = lib.llaisysQwen2ModelWeights
    get_weights.argtypes = [llaisysQwen2Model_t]
    get_weights.restype = weights_p

    infer = lib.llaisysQwen2ModelInfer
    infer.argtypes = [llaisysQwen2Model_t, tokens_p, c_size_t]
    infer.restype = c_int64

    infer_sample = lib.llaisysQwen2ModelInferSample
    infer_sample.argtypes = [
        llaisysQwen2Model_t, tokens_p, c_size_t,
        c_float, c_int, c_float,
    ]
    infer_sample.restype = c_int64

    reset_cache = lib.llaisysQwen2ModelResetKVCache
    reset_cache.argtypes = [llaisysQwen2Model_t]
    reset_cache.restype = None
147 changes: 138 additions & 9 deletions python/llaisys/models/qwen2.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,121 @@
from typing import Sequence
from typing import Sequence, Iterator
from ..libllaisys import LIB_LLAISYS
from ..libllaisys import DeviceType
from ..libllaisys import DeviceType, DataType
from ..libllaisys import LlaisysQwen2Meta, LlaisysQwen2Weights

from pathlib import Path
import ctypes
import json
import safetensors
import torch


class Qwen2:

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
# TODO: Implement model constructor
DTYPE_MAP = {
"bfloat16": DataType.BF16,
"float16": DataType.F16,
"float32": DataType.F32,
}

def __init__(self, model_path, device: DeviceType = DeviceType.CPU):
model_path = Path(model_path)

with open(model_path / "config.json") as f:
config = json.load(f)

torch_dtype = config.get("torch_dtype", "bfloat16")
dtype = self.DTYPE_MAP.get(torch_dtype, DataType.BF16)

nh = config["num_attention_heads"]
nkvh = config["num_key_value_heads"]
hs = config["hidden_size"]
dh = hs // nh

meta = LlaisysQwen2Meta()
meta.dtype = dtype
meta.nlayer = config["num_hidden_layers"]
meta.hs = hs
meta.nh = nh
meta.nkvh = nkvh
meta.dh = dh
meta.di = config["intermediate_size"]
meta.maxseq = min(config.get("max_position_embeddings", 131072), 4096)
meta.voc = config["vocab_size"]
meta.epsilon = config.get("rms_norm_eps", 1e-6)
meta.theta = config.get("rope_theta", 10000.0)
meta.end_token = config.get("eos_token_id", 151643)
if isinstance(meta.end_token, list):
meta.end_token = meta.end_token[0]

self._nlayer = meta.nlayer
self._end_token = meta.end_token
self._device = device

device_ids = (ctypes.c_int * 1)(0)
self._model = LIB_LLAISYS.llaisysQwen2ModelCreate(
ctypes.byref(meta),
ctypes.c_int(device),
device_ids,
ctypes.c_int(1),
)

weights_ptr = LIB_LLAISYS.llaisysQwen2ModelWeights(self._model)
weights = weights_ptr.contents

name_map = self._build_name_map(weights)

for file in sorted(model_path.glob("*.safetensors")):
data_ = safetensors.safe_open(file, framework="numpy", device="cpu")
data_ = safetensors.safe_open(file, framework="pt", device="cpu")
for name_ in data_.keys():
## TODO: load the model weights
pass
if name_ in name_map:
tensor_handle = name_map[name_]
t = data_.get_tensor(name_).contiguous()
LIB_LLAISYS.tensorLoad(tensor_handle, ctypes.c_void_p(t.data_ptr()))

def _build_name_map(self, weights: LlaisysQwen2Weights):
m = {}
m["model.embed_tokens.weight"] = weights.in_embed
m["lm_head.weight"] = weights.out_embed
m["model.norm.weight"] = weights.out_norm_w

for i in range(self._nlayer):
prefix = f"model.layers.{i}"
m[f"{prefix}.input_layernorm.weight"] = weights.attn_norm_w[i]
m[f"{prefix}.self_attn.q_proj.weight"] = weights.attn_q_w[i]
m[f"{prefix}.self_attn.q_proj.bias"] = weights.attn_q_b[i]
m[f"{prefix}.self_attn.k_proj.weight"] = weights.attn_k_w[i]
m[f"{prefix}.self_attn.k_proj.bias"] = weights.attn_k_b[i]
m[f"{prefix}.self_attn.v_proj.weight"] = weights.attn_v_w[i]
m[f"{prefix}.self_attn.v_proj.bias"] = weights.attn_v_b[i]
m[f"{prefix}.self_attn.o_proj.weight"] = weights.attn_o_w[i]
m[f"{prefix}.post_attention_layernorm.weight"] = weights.mlp_norm_w[i]
m[f"{prefix}.mlp.gate_proj.weight"] = weights.mlp_gate_w[i]
m[f"{prefix}.mlp.up_proj.weight"] = weights.mlp_up_w[i]
m[f"{prefix}.mlp.down_proj.weight"] = weights.mlp_down_w[i]

return m

def __del__(self):
if hasattr(self, "_model") and self._model is not None:
LIB_LLAISYS.llaisysQwen2ModelDestroy(self._model)
self._model = None

    def reset_kvcache(self):
        """Reset the native model's KV cache (delegates to the C API)."""
        LIB_LLAISYS.llaisysQwen2ModelResetKVCache(self._model)

def _infer_one(self, token_ids, use_sample, temperature, top_k, top_p):
arr = (ctypes.c_int64 * len(token_ids))(*token_ids)
n = ctypes.c_size_t(len(token_ids))
if use_sample:
return LIB_LLAISYS.llaisysQwen2ModelInferSample(
self._model, arr, n,
ctypes.c_float(temperature),
ctypes.c_int(top_k),
ctypes.c_float(top_p),
)
else:
return LIB_LLAISYS.llaisysQwen2ModelInfer(self._model, arr, n)

def generate(
self,
Expand All @@ -27,7 +125,38 @@ def generate(
top_p: float = 0.8,
temperature: float = 0.8,
):
if max_new_tokens is None:
max_new_tokens = 128

use_sample = not (top_k == 1 and temperature == 1.0)
tokens = list(inputs)

next_token = self._infer_one(tokens, use_sample, temperature, top_k, top_p)
tokens.append(next_token)

for _ in range(max_new_tokens - 1):
if next_token == self._end_token:
break
next_token = self._infer_one([next_token], use_sample, temperature, top_k, top_p)
tokens.append(next_token)

return tokens

def generate_stream(
self,
inputs: Sequence[int],
max_new_tokens: int = 512,
top_k: int = 50,
top_p: float = 0.9,
temperature: float = 0.8,
) -> Iterator[int]:
use_sample = not (top_k == 1 and temperature == 1.0)

# TODO: Implement generate function
next_token = self._infer_one(list(inputs), use_sample, temperature, top_k, top_p)
yield next_token

return []
for _ in range(max_new_tokens - 1):
if next_token == self._end_token:
return
next_token = self._infer_one([next_token], use_sample, temperature, top_k, top_p)
yield next_token
14 changes: 11 additions & 3 deletions python/llaisys/ops.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from .libllaisys import LIB_LLAISYS
from .tensor import Tensor
from ctypes import c_float, c_int
from ctypes import c_float, c_int, c_int64


class Ops:
Expand All @@ -19,9 +19,10 @@ def embedding(out: Tensor, index: Tensor, weight: Tensor):
)

@staticmethod
def linear(out: Tensor, inp: Tensor, weight: Tensor, bias: Tensor):
def linear(out: Tensor, inp: Tensor, weight: Tensor, bias: Tensor = None):
bias_handle = bias.lib_tensor() if bias is not None else None
LIB_LLAISYS.llaisysLinear(
out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), bias.lib_tensor()
out.lib_tensor(), inp.lib_tensor(), weight.lib_tensor(), bias_handle
)

@staticmethod
Expand Down Expand Up @@ -53,3 +54,10 @@ def self_attention(attn_val: Tensor, q: Tensor, k: Tensor, v: Tensor, scale: flo
@staticmethod
def swiglu(out: Tensor, gate: Tensor, up: Tensor):
LIB_LLAISYS.llaisysSwiGLU(out.lib_tensor(), gate.lib_tensor(), up.lib_tensor())

@staticmethod
def sample(out_idx: Tensor, logits: Tensor, temperature: float = 1.0, top_k: int = 50, top_p: float = 0.9):
LIB_LLAISYS.llaisysSample(
out_idx.lib_tensor(), logits.lib_tensor(),
c_float(temperature), c_int(top_k), c_float(top_p)
)
Loading