diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3d31c23b..29c37472 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -7,14 +7,18 @@ on: - 'LICENSE' jobs: - build: - name: Build + build-cuda: + name: Build and test (CUDA, ${{ matrix.target }}) strategy: fail-fast: false matrix: - os: [windows-latest, ubuntu-latest] - type: [release] + include: + - target: ubuntu + os: ubuntu-latest runs-on: ${{ matrix.os }} + env: + TMPDIR: ${{ github.workspace }}/.tmp + PYTHONUTF8: "1" steps: - name: checkout code @@ -24,37 +28,144 @@ jobs: uses: xmake-io/github-action-setup-xmake@v1 with: xmake-version: latest - - - name: Xmake Build & Install - run: | - xmake + + - name: install cuda toolkit + uses: Jimver/cuda-toolkit@v0.2.24 + with: + cuda: "12.8.0" + + - name: prepare tmp (linux) + if: matrix.target == 'ubuntu' + run: | + mkdir -p "$TMPDIR" + shell: bash + + - name: prepare tmp (windows) + if: matrix.target == 'windows' + run: | + New-Item -ItemType Directory -Force -Path $env:TMPDIR | Out-Null + shell: pwsh + + - name: check toolchain (linux) + if: matrix.target == 'ubuntu' + run: | + command -v python + command -v pip + command -v xmake + command -v nvcc + python --version + pip --version + xmake --version + nvcc --version + shell: bash + + - name: check toolchain (windows) + if: matrix.target == 'windows' + run: | + Get-Command python + Get-Command pip + Get-Command xmake + if (-not $env:CUDA_PATH) { + $nvccCmd = Get-Command nvcc -ErrorAction SilentlyContinue + if ($nvccCmd) { + $env:CUDA_PATH = Split-Path (Split-Path $nvccCmd.Source -Parent) -Parent + } + } + if (-not $env:CUDA_PATH) { + throw "CUDA_PATH is not set and nvcc not found in PATH" + } + "$env:CUDA_PATH\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + python --version + pip --version + xmake --version + & "$env:CUDA_PATH\bin\nvcc.exe" --version + shell: pwsh + + - name: detect nvidia gpu (linux) + if: matrix.target == 'ubuntu' + run: 
| + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi || true + echo "HAS_NVIDIA_GPU=1" >> "$GITHUB_ENV" + else + echo "HAS_NVIDIA_GPU=0" >> "$GITHUB_ENV" + fi + shell: bash + + - name: detect nvidia gpu (windows) + if: matrix.target == 'windows' + run: | + if (Get-Command nvidia-smi -ErrorAction SilentlyContinue) { + nvidia-smi + "HAS_NVIDIA_GPU=1" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } else { + "HAS_NVIDIA_GPU=0" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } + shell: pwsh + + - name: Xmake CUDA Build & Install (linux) + if: matrix.target == 'ubuntu' + run: | + xmake f --nv-gpu=y -c -v + xmake -v + xmake install + shell: bash + + - name: Xmake CUDA Build & Install (windows) + if: matrix.target == 'windows' + run: | + if (-not $env:CUDA_PATH) { + throw "CUDA_PATH is required for windows CUDA build" + } + $env:Path = "$env:CUDA_PATH\bin;$env:Path" + xmake f -p windows -a x64 --toolchain=msvc --cuda="$env:CUDA_PATH" --nv-gpu=y -c -v + xmake -v; xmake install + shell: pwsh - - name: Install Python + - name: Install Python (linux) + if: matrix.target == 'ubuntu' run: | cd python - pip install . + pip install ./llaisyscore/ + pip install ./server-project/ cd .. + shell: bash - - name: Assignment-0 + - name: Install Python (windows) + if: matrix.target == 'windows' run: | - python test/test_runtime.py --device cpu + Set-Location python + pip install ./llaisyscore/ + pip install ./server-project/ + Set-Location ..
+ shell: pwsh - - name: Assignment-1 + - name: CUDA runtime api test + if: env.HAS_NVIDIA_GPU == '1' run: | - python test/test_tensor.py - - - name: Assignment-2 + python test/test_runtime.py --device nvidia + + - name: CUDA ops tests + if: env.HAS_NVIDIA_GPU == '1' + run: | + python test/ops/add.py --device nvidia + python test/ops/argmax.py --device nvidia + python test/ops/embedding.py --device nvidia + python test/ops/linear.py --device nvidia + python test/ops/random_sample.py --device nvidia + python test/ops/rms_norm.py --device nvidia + python test/ops/rope.py --device nvidia + python test/ops/self_attention.py --device nvidia + python test/ops/swiglu.py --device nvidia + + - name: CUDA infer test + if: env.HAS_NVIDIA_GPU == '1' run: | - python test/ops/add.py - python test/ops/argmax.py - python test/ops/embedding.py - python test/ops/linear.py - python test/ops/rms_norm.py - python test/ops/rope.py - python test/ops/self_attention.py - python test/ops/swiglu.py + python test/test_infer.py --device nvidia --test - - name: Assignment-3 + - name: skip gpu tests when no gpu + if: env.HAS_NVIDIA_GPU != '1' run: | - python test/test_infer.py --test + echo "No NVIDIA GPU available on this runner, skipped runtime/ops/infer GPU tests." 
+ diff --git a/.gitignore b/.gitignore index e38cf574..1dff4f73 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,8 @@ lib/ *.dll *.dylib *.pyd - +# tmpfile +.tmp/ # MacOS Cache .DS_Store diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..842d644c 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -29,14 +29,20 @@ __C { llaisysTensor_t *mlp_down_w; }; - struct LlaisysQwen2Model; + struct LlaisysQwen2Model + { + struct LlaisysQwen2Meta *meta; + struct LlaisysQwen2Weights *weights; + llaisysDeviceType_t device; + int *device_ids; + int ndevice; + }; __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice); - __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model); - __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); - - __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, + llaisysTensor_t *kcache, llaisysTensor_t *vcache, size_t past_len, + float temperature, int top_k, float top_p, int64_t seed); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h index ddb3be24..822a415f 100644 --- a/include/llaisys/ops.h +++ b/include/llaisys/ops.h @@ -13,6 +13,8 @@ __C { __export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta); __export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale); __export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up); + __export void llaisysRandomSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t logits, + float temperature, int top_k, float 
top_p, int64_t seed); } #endif diff --git a/include/llaisys/tensor.h b/include/llaisys/tensor.h index 76f13fbc..ab37c59b 100644 --- a/include/llaisys/tensor.h +++ b/include/llaisys/tensor.h @@ -63,6 +63,16 @@ __C { size_t dim, size_t start, size_t end); + __export llaisysTensor_t tensorReshape( + llaisysTensor_t tensor, + size_t * shape, + size_t ndim); + + __export llaisysTensor_t tensorTo( + llaisysTensor_t tensor, + llaisysDeviceType_t device_type, + int device_id); + } #endif // LLAISYS_TENSOR_H diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000..9dc4aad0 --- /dev/null +++ b/python/README.md @@ -0,0 +1,25 @@ +# llaisys-server + +Standalone server package for Project #3. + +## Install order + +1. Install core package: + +```bash +python3 -m pip install -e /home/vankari/code/llaisys/python/llaisyscore --user --break-system-packages +``` + +2. Install server package: + +```bash +python3 -m pip install -e /home/vankari/code/llaisys/python/server-project --user --break-system-packages +``` + +## Run server + +```bash +cd /home/vankari/code/llaisys/python +python3 -m uvicorn server.app:app --host 0.0.0.0 --port 8000 +``` +### Attention: `--break-system-packages` was enabled for the public server; it should be disabled on a personal computer. \ No newline at end of file diff --git a/python/llaisys/__init__.py b/python/llaisys/__init__.py index de8d99f4..a16b3fff 100644 --- a/python/llaisys/__init__.py +++ b/python/llaisys/__init__.py @@ -6,6 +6,7 @@ from .tensor import Tensor from .ops import Ops from . import models +from . 
import backend from .models import * __all__ = [ @@ -17,4 +18,5 @@ "Tensor", "Ops", "models", + "backend", ] diff --git a/python/llaisys/backend/__init__.py b/python/llaisys/backend/__init__.py new file mode 100644 index 00000000..44d54020 --- /dev/null +++ b/python/llaisys/backend/__init__.py @@ -0,0 +1,3 @@ +from .inference_backend import InferenceBackend, SessionState + +__all__ = ["InferenceBackend", "SessionState"] diff --git a/python/llaisys/backend/inference_backend.py b/python/llaisys/backend/inference_backend.py new file mode 100644 index 00000000..7ca1be03 --- /dev/null +++ b/python/llaisys/backend/inference_backend.py @@ -0,0 +1,306 @@ +import threading +import time +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional + +from transformers import AutoTokenizer + +import llaisys +from llaisys.models.kvcachepool import KVCachePool + + +@dataclass +class SessionState: + session_id: str + messages: List[Dict[str, str]] = field(default_factory=list) + created_at: int = field(default_factory=lambda: int(time.time())) + updated_at: int = field(default_factory=lambda: int(time.time())) + + +class InferenceBackend: + """Project #3 optional backend for session-aware inference. 
+ + Responsibilities: + - Manage multi-session message history + - Integrate Qwen2 model with external KVCachePool + - Support history edit + regenerate workflow + """ + + def __init__( + self, + model_path: Optional[str] = None, + device: llaisys.DeviceType = llaisys.DeviceType.CPU, + max_sessions: int = 32, + ) -> None: + tokenizer_source = model_path or llaisys.models.Qwen2.DEFAULT_MODEL_ID + self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True) + self._model = llaisys.models.Qwen2(model_path=model_path, device=device) + self._pool = KVCachePool(self._model, max_sessions=max_sessions) + + self._sessions: Dict[str, SessionState] = {} + self._lock = threading.Lock() + + def create_session(self, session_id: Optional[str] = None, messages: Optional[List[Dict[str, str]]] = None) -> SessionState: + with self._lock: + sid = session_id or f"sess-{uuid.uuid4().hex}" + if sid in self._sessions: + raise ValueError(f"session already exists: {sid}") + state = SessionState(session_id=sid, messages=list(messages or [])) + self._sessions[sid] = state + return state + + def list_sessions(self) -> List[SessionState]: + with self._lock: + return list(self._sessions.values()) + + def get_session(self, session_id: str) -> SessionState: + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + return state + + def delete_session(self, session_id: str) -> None: + with self._lock: + state = self._sessions.pop(session_id, None) + if state is None: + return + self._pool.reset_session(session_id) + + def clear_sessions(self) -> None: + with self._lock: + self._sessions.clear() + self._pool.clear() + + def append_message(self, session_id: str, role: str, content: str) -> SessionState: + if role not in ("system", "user", "assistant"): + raise ValueError(f"unsupported role: {role}") + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise 
KeyError(f"session not found: {session_id}") + state.messages.append({"role": role, "content": content}) + state.updated_at = int(time.time()) + return state + + def replace_messages(self, session_id: str, messages: List[Dict[str, str]]) -> SessionState: + """Replace full history for one session. + + This is suitable for edit-history + regenerate workflow. + KV cache validity will be recalculated on next generate call. + """ + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + state.messages = list(messages) + state.updated_at = int(time.time()) + return state + + def _build_prompt(self, messages: List[Dict[str, str]]) -> str: + return self._tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=False, + ) + + @staticmethod + def _normalize_think_for_session(prompt: str, completion_text: str) -> str: + if not completion_text: + return completion_text + prompt_has_think = ("<think>" in prompt) or ("</think>" in prompt) + if not prompt_has_think: + return completion_text + + has_open = "<think>" in completion_text + has_close = "</think>" in completion_text + if has_close and not has_open: + completion_text = f"<think>{completion_text}" + has_open = True + if has_open and not has_close: + completion_text = f"{completion_text}</think>" + return completion_text + + def generate( + self, + session_id: str, + max_tokens: int, + top_k: int, + top_p: float, + temperature: float, + use_cache: bool = True, + append_assistant_message: bool = True, + ) -> Dict[str, Any]: + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + + prompt = self._build_prompt(state.messages) + prompt_ids = self._tokenizer.encode(prompt) + + if not use_cache: + output_ids = self._model.generate( + prompt_ids, + max_new_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=False, + ) + completion_ids = 
output_ids[len(prompt_ids):] + self._pool.reset_session(session_id) + else: + slot = self._pool.prepare_session(session_id, prompt_ids, max_tokens) + + if slot.past_len == 0: + model_input_ids = prompt_ids + model_past_len = 0 + else: + model_input_ids = prompt_ids[slot.past_len:] + model_past_len = slot.past_len + if not model_input_ids: + model_input_ids = [prompt_ids[-1]] + model_past_len = max(0, slot.past_len - 1) + + output_ids = self._model.generate( + model_input_ids, + max_new_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=True, + kcache_array=slot.kcache_array, + vcache_array=slot.vcache_array, + past_len=model_past_len, + ) + + completion_ids = output_ids[len(model_input_ids):] + full_output_ids = list(prompt_ids) + list(completion_ids) + self._pool.commit_session(session_id, full_output_ids) + + if self._model.eos_token_id in completion_ids: + completion_ids = completion_ids[:completion_ids.index(self._model.eos_token_id)] + + completion_text = self._tokenizer.decode(completion_ids, skip_special_tokens=False) + if append_assistant_message: + stored_text = self._normalize_think_for_session(prompt, completion_text) + state.messages.append({"role": "assistant", "content": stored_text}) + state.updated_at = int(time.time()) + + return { + "session_id": session_id, + "prompt_ids": prompt_ids, + "completion_ids": completion_ids, + "completion_text": completion_text, + "created": int(time.time()), + } + + def stream_generate( + self, + session_id: str, + max_tokens: int, + top_k: int, + top_p: float, + temperature: float, + use_cache: bool = True, + append_assistant_message: bool = True, + ) -> Iterator[str]: + if max_tokens <= 0: + raise ValueError("max_tokens must be positive") + if temperature <= 0: + raise ValueError("temperature must be positive") + if top_k == 0: + raise ValueError("top_k cannot be 0") + if top_p < 0 or top_p > 1: + raise ValueError("top_p must be in [0, 1]") + + with self._lock: + state = 
self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + + prompt = self._build_prompt(state.messages) + prompt_ids = self._tokenizer.encode(prompt) + completion_ids: List[int] = [] + + if not use_cache: + generated_ids = list(prompt_ids) + for _ in range(max_tokens): + token_id = self._model._infer_tokens( + generated_ids, + None, + None, + 0, + temperature, + top_k, + top_p, + ) + if token_id == self._model.eos_token_id: + break + completion_ids.append(token_id) + generated_ids.append(token_id) + piece = self._tokenizer.decode([token_id], skip_special_tokens=False) + if piece: + yield piece + self._pool.reset_session(session_id) + else: + slot = self._pool.prepare_session(session_id, prompt_ids, max_tokens) + + if slot.past_len == 0: + prefill_input_ids = prompt_ids + prefill_past_len = 0 + else: + prefill_input_ids = prompt_ids[slot.past_len:] + prefill_past_len = slot.past_len + if not prefill_input_ids: + prefill_input_ids = [prompt_ids[-1]] + prefill_past_len = max(0, slot.past_len - 1) + + token_id = self._model._infer_tokens( + prefill_input_ids, + slot.kcache_array, + slot.vcache_array, + prefill_past_len, + temperature, + top_k, + top_p, + ) + if token_id != self._model.eos_token_id: + completion_ids.append(token_id) + piece = self._tokenizer.decode([token_id], skip_special_tokens=False) + if piece: + yield piece + else: + token_id = None + + cached_token_len = prefill_past_len + len(prefill_input_ids) + for _ in range(max_tokens - 1): + if token_id is None: + break + token_id = self._model._infer_tokens( + [token_id], + slot.kcache_array, + slot.vcache_array, + cached_token_len, + temperature, + top_k, + top_p, + ) + if token_id == self._model.eos_token_id: + break + completion_ids.append(token_id) + cached_token_len += 1 + piece = self._tokenizer.decode([token_id], skip_special_tokens=False) + if piece: + yield piece + + full_output_ids = list(prompt_ids) + completion_ids + 
self._pool.commit_session(session_id, full_output_ids) + + if append_assistant_message: + completion_text = self._tokenizer.decode(completion_ids, skip_special_tokens=False) + stored_text = self._normalize_think_for_session(prompt, completion_text) + state.messages.append({"role": "assistant", "content": stored_text}) + state.updated_at = int(time.time()) diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..0f952d00 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,7 +12,7 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops - +from .models import load_qwen2 def load_shared_library(): lib_dir = Path(__file__).parent @@ -38,7 +38,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) - +load_qwen2(LIB_LLAISYS) __all__ = [ "LIB_LLAISYS", @@ -52,4 +52,6 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights" ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 00000000..dfd480ae --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,75 @@ +from ctypes import c_int, c_int64, c_size_t +from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t +from .tensor import llaisysTensor_t +import ctypes + +class LlaisysQwen2Meta(ctypes.Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", ctypes.c_size_t), + ("hs", ctypes.c_size_t), + ("nh", ctypes.c_size_t), + ("nkvh", ctypes.c_size_t), + ("dh", ctypes.c_size_t), + ("di", ctypes.c_size_t), + ("maxseq", ctypes.c_size_t), + ("voc", ctypes.c_size_t), + ("epsilon", ctypes.c_float), + ("theta", ctypes.c_float), + ("end_token", ctypes.c_int64) + ] + +class LlaisysQwen2Weights(ctypes.Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", 
llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + + ("attn_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_o_w", ctypes.POINTER(llaisysTensor_t)), + + ("mlp_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_gate_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_up_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_down_w", ctypes.POINTER(llaisysTensor_t)), + ] + +class LlaisysQwen2Model(ctypes.Structure): + # field order must match struct LlaisysQwen2Model in include/llaisys/models/qwen2.h + _fields_ = [ + ("meta", ctypes.POINTER(LlaisysQwen2Meta)), + ("weights", ctypes.POINTER(LlaisysQwen2Weights)), + ("device", ctypes.c_int), # llaisysDeviceType_t + ("device_ids", ctypes.POINTER(ctypes.c_int)), + ("ndevice", ctypes.c_int), + ] + +# Load shared library +def load_qwen2(lib): + lib.llaisysQwen2ModelCreate.argtypes = [ctypes.POINTER(LlaisysQwen2Meta), llaisysDeviceType_t, ctypes.POINTER(c_int), c_int] + lib.llaisysQwen2ModelCreate.restype = ctypes.POINTER(LlaisysQwen2Model) + + lib.llaisysQwen2ModelDestroy.argtypes = [ctypes.POINTER(LlaisysQwen2Model)] + lib.llaisysQwen2ModelDestroy.restype = None + + lib.llaisysQwen2ModelWeights.argtypes = [ctypes.POINTER(LlaisysQwen2Model)] + lib.llaisysQwen2ModelWeights.restype = ctypes.POINTER(LlaisysQwen2Weights) + + lib.llaisysQwen2ModelInfer.argtypes = [ + ctypes.POINTER(LlaisysQwen2Model), + ctypes.POINTER(c_int64), + c_size_t, + ctypes.POINTER(llaisysTensor_t), + ctypes.POINTER(llaisysTensor_t), + c_size_t, + ctypes.c_float, + c_int, + ctypes.c_float, + c_int64, + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 \ No newline at end of file diff --git a/python/llaisys/libllaisys/ops.py b/python/llaisys/libllaisys/ops.py index 5be095ef..49b70620 100644 --- a/python/llaisys/libllaisys/ops.py +++ 
b/python/llaisys/libllaisys/ops.py @@ -1,5 +1,5 @@ from .tensor import llaisysTensor_t -from ctypes import c_float +from ctypes import c_float, c_int, c_int64 def load_ops(lib): lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] @@ -34,3 +34,14 @@ def load_ops(lib): lib.llaisysSwiGLU.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysSwiGLU.restype = None + + lib.llaisysRandomSample.argtypes = [ + llaisysTensor_t, + llaisysTensor_t, + llaisysTensor_t, + c_float, + c_int, + c_float, + c_int64, + ] + lib.llaisysRandomSample.restype = None diff --git a/python/llaisys/libllaisys/tensor.py b/python/llaisys/libllaisys/tensor.py index b5805788..ce7e674b 100644 --- a/python/llaisys/libllaisys/tensor.py +++ b/python/llaisys/libllaisys/tensor.py @@ -76,3 +76,14 @@ def load_tensor(lib): c_size_t, # end : exclusive ] lib.tensorSlice.restype = llaisysTensor_t + # Function: tensorToDevice(llaisysTensor_t tensor, + # llaisysDeviceType_t device_type, + # int device_id); + lib.tensorTo.argtypes = [ + llaisysTensor_t, + llaisysDeviceType_t, + c_int, + ] + lib.tensorTo.restype = llaisysTensor_t + lib.tensorReshape.argtypes = [llaisysTensor_t, POINTER(c_size_t), c_size_t] + lib.tensorReshape.restype = llaisysTensor_t \ No newline at end of file diff --git a/python/llaisys/models/__init__.py b/python/llaisys/models/__init__.py index af9918b0..fea68919 100644 --- a/python/llaisys/models/__init__.py +++ b/python/llaisys/models/__init__.py @@ -1 +1,2 @@ from .qwen2 import Qwen2 +from .kvcachepool import KVCachePool, KVCacheSlot, PrefixMatch diff --git a/python/llaisys/models/kvcachepool.py b/python/llaisys/models/kvcachepool.py new file mode 100644 index 00000000..0737f2a1 --- /dev/null +++ b/python/llaisys/models/kvcachepool.py @@ -0,0 +1,487 @@ +from __future__ import annotations + +from dataclasses import dataclass +from collections import OrderedDict +from typing import Dict, Optional, Sequence +import ctypes + +from ..libllaisys 
import LIB_LLAISYS, llaisysTensor_t, DeviceType, DataType, MemcpyKind +from ..runtime import RuntimeAPI + + +@dataclass +class PrefixMatch: + """Prefix match result from pool-level lookup. + + Attributes: + session_id: Matched source session id. + matched_tokens: Number of matched prefix tokens. + """ + session_id: str + matched_tokens: int + + +@dataclass +class KVCacheSlot: + """Per-session KV cache slot. + + Attributes: + session_id: Session identifier. + capacity: Max tokens storable in this slot. + kcache_array: C-level K cache tensor array, one per layer. + vcache_array: C-level V cache tensor array, one per layer. + tokens: Full token history tracked by upper layer. + past_len: Valid reusable prefix length in current caches. + """ + session_id: str + capacity: int + kcache_array: object + vcache_array: object + tokens: tuple[int, ...] + past_len: int + + +class _TrieNode: + """Trie node for token-prefix indexing. + + Attributes: + token: Current node token value, root is None. + children: Mapping token -> child node. + session_indices: Sessions that contain this prefix. + """ + __slots__ = ("token", "children", "session_indices") + + def __init__(self, token: Optional[int] = None): + self.token = token + self.children: dict[int, _TrieNode] = {} + self.session_indices: list[int] = [] + + +class KVCachePool: + """External KV cache manager for multi-session inference. + + Notes: + - This pool owns tensor lifecycle for created caches. + - Prefix matching is based on token history per session. + - Backend should call `prepare_session` before generation and + `commit_session` after generation. + """ + + def __init__(self, model, max_sessions: int = 32): + """Create a KV cache pool. + + Args: + model: Qwen2 model instance that provides tensor/meta attributes. + max_sessions: Maximum active sessions retained in memory. 
+ """ + self._model = model + self._max_sessions = max_sessions + self._slots: "OrderedDict[str, KVCacheSlot]" = OrderedDict() + self._trie_root = _TrieNode() + self._next_session_index = 0 + self._session_to_index: dict[str, int] = {} + self._index_to_session: dict[int, str] = {} + self._active_indices: set[int] = set() + self._runtime = RuntimeAPI(self._model.device) + + @staticmethod + def _common_prefix_len(a: Sequence[int], b: Sequence[int]) -> int: + """Return common-prefix length for two token sequences.""" + limit = min(len(a), len(b)) + for i in range(limit): + if a[i] != b[i]: + return i + return limit + + def _destroy_slot(self, slot: KVCacheSlot) -> None: + """Release all tensor resources owned by one slot.""" + for i in range(self._model.num_hidden_layers): + if slot.kcache_array[i]: + LIB_LLAISYS.tensorDestroy(slot.kcache_array[i]) + if slot.vcache_array[i]: + LIB_LLAISYS.tensorDestroy(slot.vcache_array[i]) + + @staticmethod + def _dtype_nbytes(dtype: DataType) -> int: + if dtype in (DataType.BYTE, DataType.BOOL, DataType.I8, DataType.U8): + return 1 + if dtype in (DataType.I16, DataType.U16, DataType.F16, DataType.BF16): + return 2 + if dtype in (DataType.I32, DataType.U32, DataType.F32): + return 4 + if dtype in (DataType.I64, DataType.U64, DataType.F64): + return 8 + raise ValueError(f"unsupported dtype for KV cache copy: {dtype}") + + def _copy_prefix_cache(self, src_slot: KVCacheSlot, dst_slot: KVCacheSlot, prefix_len: int) -> None: + if prefix_len <= 0: + return + + nbytes = ( + prefix_len + * self._model.num_key_value_heads + * self._model.per_kvhead_dim + * self._dtype_nbytes(self._model.data_type) + ) + memcpy_kind = MemcpyKind.H2H if self._model.device == DeviceType.CPU else MemcpyKind.D2D + + for i in range(self._model.num_hidden_layers): + src_k = LIB_LLAISYS.tensorSlice(src_slot.kcache_array[i], 0, 0, prefix_len) + dst_k = LIB_LLAISYS.tensorSlice(dst_slot.kcache_array[i], 0, 0, prefix_len) + src_v = 
LIB_LLAISYS.tensorSlice(src_slot.vcache_array[i], 0, 0, prefix_len) + dst_v = LIB_LLAISYS.tensorSlice(dst_slot.vcache_array[i], 0, 0, prefix_len) + try: + self._runtime.memcpy_sync( + LIB_LLAISYS.tensorGetData(dst_k), + LIB_LLAISYS.tensorGetData(src_k), + nbytes, + memcpy_kind, + ) + self._runtime.memcpy_sync( + LIB_LLAISYS.tensorGetData(dst_v), + LIB_LLAISYS.tensorGetData(src_v), + nbytes, + memcpy_kind, + ) + finally: + LIB_LLAISYS.tensorDestroy(src_k) + LIB_LLAISYS.tensorDestroy(dst_k) + LIB_LLAISYS.tensorDestroy(src_v) + LIB_LLAISYS.tensorDestroy(dst_v) + + def _get_or_create_session_index(self, session_id: str) -> int: + """Get stable integer index for a session id.""" + idx = self._session_to_index.get(session_id) + if idx is not None: + return idx + idx = self._next_session_index + self._next_session_index += 1 + self._session_to_index[session_id] = idx + self._index_to_session[idx] = session_id + return idx + + @staticmethod + def _add_session_index(node: _TrieNode, session_index: int) -> None: + """Attach one session index to trie node if absent.""" + if session_index not in node.session_indices: + node.session_indices.append(session_index) + + @staticmethod + def _remove_session_index(node: _TrieNode, session_index: int) -> None: + """Detach one session index from trie node if present.""" + if session_index in node.session_indices: + node.session_indices.remove(session_index) + + def _trie_insert(self, tokens: Sequence[int], session_index: int) -> None: + """Insert a token prefix path into trie for one session.""" + node = self._trie_root + for token in tokens: + child = node.children.get(token) + if child is None: + child = _TrieNode(token) + node.children[token] = child + node = child + self._add_session_index(node, session_index) + + def _trie_remove(self, tokens: Sequence[int], session_index: int) -> None: + """Remove one session's token prefix path from trie.""" + if not tokens: + return + + stack: list[tuple[_TrieNode, int, _TrieNode]] = [] + node = 
self._trie_root + for token in tokens: + child = node.children.get(token) + if child is None: + return + stack.append((node, token, child)) + node = child + + for _, _, child in stack: + self._remove_session_index(child, session_index) + + for parent, token, child in reversed(stack): + if child.children or child.session_indices: + break + del parent.children[token] + + def _trie_remove_from_depth(self, tokens: Sequence[int], session_index: int, keep_depth: int) -> None: + """Remove session index from trie nodes after `keep_depth` on one token path. + + Args: + tokens: Existing cached token path. + session_index: Target session index. + keep_depth: Number of leading nodes to keep unchanged. + """ + if not tokens: + return + + stack: list[tuple[_TrieNode, int, _TrieNode]] = [] + node = self._trie_root + for token in tokens: + child = node.children.get(token) + if child is None: + return + stack.append((node, token, child)) + node = child + + for depth, (_, _, child) in enumerate(stack, start=1): + if depth > keep_depth: + self._remove_session_index(child, session_index) + + for parent, token, child in reversed(stack): + if child.children or child.session_indices: + break + del parent.children[token] + + def _trie_best_prefix_len(self, tokens: Sequence[int], exclude_session_id: Optional[str]) -> tuple[int, Optional[str]]: + """Find longest matched prefix and source session by trie traversal.""" + exclude_index = None + if exclude_session_id is not None: + exclude_index = self._session_to_index.get(exclude_session_id) + + best_len = 0 + best_session_id: Optional[str] = None + node = self._trie_root + + for depth, token in enumerate(tokens, start=1): + node = node.children.get(token) + if node is None: + break + + matched_session_id = None + for session_index in node.session_indices: + if session_index == exclude_index: + continue + if session_index not in self._active_indices: + continue + sid = self._index_to_session.get(session_index) + if sid is None or sid not in 
self._slots: + continue + matched_session_id = sid + break + + if matched_session_id is not None: + best_len = depth + best_session_id = matched_session_id + + return best_len, best_session_id + + def _update_slot_tokens(self, session_id: str, new_tokens: Sequence[int]) -> None: + """Update a session slot token history and synchronize trie index.""" + slot = self._slots.get(session_id) + if slot is None: + return + + session_index = self._get_or_create_session_index(session_id) + self._active_indices.add(session_index) + + old_tokens = slot.tokens[:slot.past_len] + if old_tokens: + self._trie_remove(old_tokens, session_index) + + slot.tokens = tuple(new_tokens) + slot.past_len = len(slot.tokens) + + if slot.past_len > 0: + self._trie_insert(slot.tokens[:slot.past_len], session_index) + + def _set_slot_reuse_state(self, session_id: str, new_tokens: Sequence[int], reusable_len: int) -> KVCacheSlot: + slot = self._slots.get(session_id) + if slot is None: + raise KeyError(f"session not found: {session_id}") + + new_tokens = tuple(new_tokens) + reusable_len = max(0, min(reusable_len, len(new_tokens))) + + session_index = self._get_or_create_session_index(session_id) + self._active_indices.add(session_index) + + old_cached_tokens = slot.tokens[:slot.past_len] + if old_cached_tokens: + self._trie_remove(old_cached_tokens, session_index) + + slot.tokens = new_tokens + slot.past_len = reusable_len + + if reusable_len > 0: + self._trie_insert(new_tokens[:reusable_len], session_index) + + return slot + + def _evict_if_needed(self) -> None: + """Evict least-recently-used sessions when exceeding pool capacity.""" + while len(self._slots) > self._max_sessions: + session_id, slot = self._slots.popitem(last=False) + session_index = self._session_to_index.get(session_id) + if session_index is not None: + if slot.past_len > 0: + self._trie_remove(slot.tokens[:slot.past_len], session_index) + self._active_indices.discard(session_index) + self._destroy_slot(slot) + + def 
_allocate_slot(self, session_id: str, capacity: int) -> KVCacheSlot: + """Allocate one session slot and its per-layer KV tensors.""" + array_type = llaisysTensor_t * self._model.num_hidden_layers + kcache_array = array_type() + vcache_array = array_type() + + for i in range(self._model.num_hidden_layers): + shape_arr = (ctypes.c_size_t * 3)( + capacity, + self._model.num_key_value_heads, + self._model.per_kvhead_dim, + ) + kcache_array[i] = LIB_LLAISYS.tensorCreate( + shape_arr, 3, self._model.data_type, self._model.device, self._model.device_id + ) + vcache_array[i] = LIB_LLAISYS.tensorCreate( + shape_arr, 3, self._model.data_type, self._model.device, self._model.device_id + ) + + return KVCacheSlot( + session_id=session_id, + capacity=capacity, + kcache_array=kcache_array, + vcache_array=vcache_array, + tokens=(), + past_len=0, + ) + + def find_best_prefix(self, tokens: Sequence[int], exclude_session_id: Optional[str] = None) -> Optional[PrefixMatch]: + """Find best reusable prefix from pool. + + Args: + tokens: Target token sequence. + exclude_session_id: Optional session id to skip. + + Returns: + PrefixMatch for longest hit, or None if no positive-length match. + """ + matched_len, matched_session = self._trie_best_prefix_len(tokens, exclude_session_id) + if matched_session is None or matched_len == 0: + return None + return PrefixMatch(session_id=matched_session, matched_tokens=matched_len) + + def prepare_session(self, session_id: str, prompt_tokens: Sequence[int], max_new_tokens: int) -> KVCacheSlot: + """Prepare slot for a new inference call. + + This method ensures slot existence/capacity, computes best reusable + prefix from current session and pool-level matches, and returns slot + handles for model inference. + + Args: + session_id: Target session id. + prompt_tokens: Full prompt tokens for this call. + max_new_tokens: Planned generation upper bound. + + Returns: + Prepared KVCacheSlot with updated `past_len`. 
+ """ + prompt_tokens = tuple(prompt_tokens) + if not prompt_tokens: + raise ValueError("prompt_tokens cannot be empty") + if max_new_tokens <= 0: + raise ValueError("max_new_tokens must be positive") + + required_capacity = len(prompt_tokens) + max_new_tokens + slot = self._slots.get(session_id) + existed_before = slot is not None + + if slot is None: + slot = self._allocate_slot(session_id, max(required_capacity, self._model.max_position_embeddings)) + self._slots[session_id] = slot + elif slot.capacity < required_capacity: + self._destroy_slot(slot) + slot = self._allocate_slot(session_id, max(required_capacity, self._model.max_position_embeddings)) + self._slots[session_id] = slot + + self_shared = self._common_prefix_len(slot.tokens[:slot.past_len], prompt_tokens) if existed_before else 0 + + pool_match = self.find_best_prefix(prompt_tokens, exclude_session_id=session_id) + pool_shared = pool_match.matched_tokens if pool_match is not None else 0 + + chosen_shared = self_shared + source_slot: Optional[KVCacheSlot] = None + if pool_shared > chosen_shared and pool_match is not None: + source_slot = self._slots.get(pool_match.session_id) + if source_slot is not None: + chosen_shared = pool_shared + + slot = self._set_slot_reuse_state(session_id, prompt_tokens, chosen_shared) + if source_slot is not None and chosen_shared > 0: + self._copy_prefix_cache(source_slot, slot, chosen_shared) + + self._slots.move_to_end(session_id) + self._evict_if_needed() + return slot + + def modify_session_tokens(self, session_id: str, edited_tokens: Sequence[int]) -> KVCacheSlot: + """Apply history edit for one session and invalidate cache suffix. + + The pool keeps trie/session state for common prefix unchanged and drops + session index on trie nodes after the first edited position. + + Args: + session_id: Target session id. + edited_tokens: New full token history after edit. + + Returns: + Updated KVCacheSlot whose `past_len` is the reusable cache prefix. 
+ + Raises: + KeyError: If target session does not exist. + """ + slot = self._slots.get(session_id) + if slot is None: + raise KeyError(f"session not found: {session_id}") + + new_tokens = tuple(edited_tokens) + old_cached_tokens = slot.tokens[:slot.past_len] + keep_len = self._common_prefix_len(old_cached_tokens, new_tokens) + + session_index = self._get_or_create_session_index(session_id) + self._active_indices.add(session_index) + + if old_cached_tokens: + self._trie_remove_from_depth(old_cached_tokens, session_index, keep_len) + if keep_len > 0: + self._trie_insert(new_tokens[:keep_len], session_index) + + slot.tokens = new_tokens + slot.past_len = keep_len + self._slots.move_to_end(session_id) + return slot + + def commit_session(self, session_id: str, full_tokens: Sequence[int]) -> None: + """Commit generation result and refresh trie index. + + Args: + session_id: Target session id. + full_tokens: Full tokens after generation (prompt + output). + """ + slot = self._slots.get(session_id) + if slot is None: + return + self._update_slot_tokens(session_id, tuple(full_tokens)) + self._slots.move_to_end(session_id) + + def reset_session(self, session_id: str) -> None: + """Remove one session and release its cache resources.""" + slot = self._slots.pop(session_id, None) + if slot is None: + return + session_index = self._session_to_index.get(session_id) + if session_index is not None: + if slot.past_len > 0: + self._trie_remove(slot.tokens[:slot.past_len], session_index) + self._active_indices.discard(session_index) + self._destroy_slot(slot) + + def clear(self) -> None: + """Clear all sessions and release all cache resources.""" + for slot in self._slots.values(): + self._destroy_slot(slot) + self._slots.clear() + self._trie_root = _TrieNode() + self._active_indices.clear() diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..fc436818 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,33 
+1,331 @@ -from typing import Sequence -from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType - +from typing import Sequence, Optional, Union from pathlib import Path +import json +import ctypes + +from huggingface_hub import snapshot_download import safetensors +import torch + +from ..libllaisys import LIB_LLAISYS, DeviceType, DataType, llaisysTensor_t +from ..libllaisys.models import load_qwen2, LlaisysQwen2Meta +load_qwen2(LIB_LLAISYS) class Qwen2: + """Qwen2 language model implementation.""" + + DEFAULT_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + def __init__( + self, + model_path: Optional[Union[str, Path]] = None, + device: DeviceType = DeviceType.CPU, + max_seq_len: Optional[int] = None + ): + """Initialize Qwen2 model. + + Args: + model_path: Path to model directory. If None, downloads default model. + device: Device type for inference. + max_seq_len: Maximum sequence length. If None, uses model's default. + Set this to a larger value (e.g., 8192, 16384) for longer contexts. + + Raises: + ValueError: If unsupported device is specified. + FileNotFoundError: If required model files are missing. + """ + if device not in (DeviceType.CPU, DeviceType.NVIDIA): + raise ValueError(f"Unsupported device: {device}. 
Only CPU and NVIDIA are supported.") + + self.model_path = self._resolve_model_path(model_path) + self._validate_model_files() + + self.device = device + self.device_id = 0 + + config = self._load_config() + self._init_model_params(config) + + # Override max sequence length if specified + if max_seq_len is not None: + if max_seq_len <= 0: + raise ValueError(f"max_seq_len must be positive, got {max_seq_len}") + print(f"[Qwen2] Overriding max_position_embeddings: {self.max_position_embeddings} → {max_seq_len}") + self.max_position_embeddings = max_seq_len + + self.data_type = DataType.F32 if device == DeviceType.CPU else DataType.BF16 + + self._create_model() + + self._load_weights() + + + def _resolve_model_path(self, model_path: Optional[Union[str, Path]]) -> Path: + """Resolve model path, downloading if necessary.""" + if model_path is not None and Path(model_path).exists(): + return Path(model_path) + return Path(snapshot_download(self.DEFAULT_MODEL_ID)) + + def _validate_model_files(self) -> None: + """Validate that required model files exist.""" + required_files = ["config.json", "model.safetensors"] + for file_name in required_files: + file_path = self.model_path / file_name + if not file_path.exists(): + raise FileNotFoundError(f"Required file {file_path} not found!") + + def _load_config(self) -> dict: + """Load model configuration from config.json.""" + config_path = self.model_path / "config.json" + try: + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + raise ValueError(f"Failed to load config from {config_path}: {e}") + + def _init_model_params(self, config: dict) -> None: + """Initialize model parameters from configuration.""" + # Required parameters + required_params = [ + "hidden_size", "intermediate_size", "max_position_embeddings", + "num_attention_heads", "num_hidden_layers", "num_key_value_heads", + "rms_norm_eps", "rope_theta", "vocab_size", "eos_token_id" + ] + + for param 
in required_params: + if param not in config: + raise ValueError(f"Missing required parameter: {param}") + + self.eos_token_id = config["eos_token_id"] + self.hidden_size = config["hidden_size"] + self.intermediate_size = config["intermediate_size"] + self.max_position_embeddings = config["max_position_embeddings"] + self.num_attention_heads = config["num_attention_heads"] + self.num_hidden_layers = config["num_hidden_layers"] + self.num_key_value_heads = config["num_key_value_heads"] + self.rms_norm_eps = config["rms_norm_eps"] + self.rope_theta = config["rope_theta"] + self.torch_dtype = config.get("torch_dtype", "bfloat16") + self.vocab_size = config["vocab_size"] + + # Derived parameters + self.per_head_dim = self.hidden_size // self.num_attention_heads + self.per_kvhead_dim = self.per_head_dim # For Qwen2, dv = d + + def _create_model(self) -> None: + """Create the model instance.""" + meta = LlaisysQwen2Meta( + dtype=self.data_type, + nlayer=self.num_hidden_layers, + hs=self.hidden_size, + nh=self.num_attention_heads, + nkvh=self.num_key_value_heads, + dh=self.per_head_dim, + di=self.intermediate_size, + maxseq=self.max_position_embeddings, + voc=self.vocab_size, + epsilon=self.rms_norm_eps, + theta=self.rope_theta, + end_token=self.eos_token_id + ) - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor + device_ids = (ctypes.c_int * 1)(0) + self.model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(meta), + ctypes.c_int(self.device), + device_ids, + ctypes.c_int(1) + ) - model_path = Path(model_path) + if not self.model: + raise RuntimeError("Failed to create Qwen2 model.") - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + def _load_weights(self) -> None: + """Load model weights from safetensors files.""" + weights = 
LIB_LLAISYS.llaisysQwen2ModelWeights(self.model) + if not weights: + raise RuntimeError("Failed to get Qwen2 weights.") + def maybe_cast_tensor(tensor): + """Cast tensor to appropriate dtype if needed.""" + if self.device == DeviceType.CPU: + return tensor.to(torch.float32).contiguous() + elif self.device == DeviceType.NVIDIA: + return tensor.to(torch.bfloat16).contiguous() + return tensor + + + for file in sorted(self.model_path.glob("*.safetensors")): + data = safetensors.safe_open(file, framework="torch", device="cpu") + self._load_embedding_layers(data, weights, maybe_cast_tensor) + self._load_attention_layers(data, weights, maybe_cast_tensor) + self._load_mlp_layers(data, weights, maybe_cast_tensor) + + + def _load_embedding_layers(self, data, weights, cast_fn): + """Load embedding and output layers.""" + embedding_mappings = [ + ("model.embed_tokens.weight", "in_embed"), + ("lm_head.weight", "out_embed"), + ("model.norm.weight", "out_norm_w") + ] + + for tensor_name, field_name in embedding_mappings: + tensor = cast_fn(data.get_tensor(tensor_name)) + LIB_LLAISYS.tensorLoad(getattr(weights.contents, field_name), tensor.data_ptr()) + + def _load_attention_layers(self, data, weights, cast_fn): + """Load self-attention layer weights.""" + attention_mappings = [ + ("input_layernorm.weight", "attn_norm_w"), + ("self_attn.q_proj.weight", "attn_q_w"), + ("self_attn.q_proj.bias", "attn_q_b"), + ("self_attn.k_proj.weight", "attn_k_w"), + ("self_attn.k_proj.bias", "attn_k_b"), + ("self_attn.v_proj.weight", "attn_v_w"), + ("self_attn.v_proj.bias", "attn_v_b"), + ("self_attn.o_proj.weight", "attn_o_w"), + ] + + for base_name, field_name in attention_mappings: + self._load_layer_array(data, weights, field_name, base_name, cast_fn) + + def _load_mlp_layers(self, data, weights, cast_fn): + """Load MLP layer weights.""" + mlp_mappings = [ + ("post_attention_layernorm.weight", "mlp_norm_w"), + ("mlp.gate_proj.weight", "mlp_gate_w"), + ("mlp.up_proj.weight", "mlp_up_w"), + 
("mlp.down_proj.weight", "mlp_down_w"), + ] + + for base_name, field_name in mlp_mappings: + self._load_layer_array(data, weights, field_name, base_name, cast_fn) + + def _load_layer_array(self, data, weights, field_name, base_name, cast_fn): + """Load weights for a layer array.""" + arr_ptr = getattr(weights.contents, field_name) + arr_type = llaisysTensor_t * self.num_hidden_layers + arr = ctypes.cast(arr_ptr, ctypes.POINTER(arr_type)).contents + + for i in range(self.num_hidden_layers): + tensor_name = f"model.layers.{i}.{base_name}" + tensor = cast_fn(data.get_tensor(tensor_name)) + LIB_LLAISYS.tensorLoad(arr[i], tensor.data_ptr()) def generate( self, inputs: Sequence[int], - max_new_tokens: int = None, + max_new_tokens: int = 128, top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, - ): + use_cache: bool = True, + kcache_array=None, + vcache_array=None, + past_len: int = 0, + seed: Optional[int] = None, + ) -> Sequence[int]: + """Generate tokens using the model. + + Args: + inputs: Input token IDs. + - use_cache=True with external cache: incremental input (current request tail). + - use_cache=False: full input sequence. + max_new_tokens: Maximum number of new tokens to generate + top_k: Top-k sampling parameter + top_p: Top-p (nucleus) sampling parameter + temperature: Sampling temperature + use_cache: Whether to use KV cache for efficiency. + kcache_array: External KV cache k tensor array managed outside model. + vcache_array: External KV cache v tensor array managed outside model. + past_len: Number of valid tokens already written in external KV cache. + seed: Optional random seed for reproducible sampling. If set, + each decode step uses `seed + step_index`. + + Returns: + Token IDs for current request + current output. + - use_cache=True: returns `inputs + new_tokens`. + - use_cache=False: returns `inputs[past_len:] + new_tokens`. 
+ """ + if not inputs: + raise ValueError("Input tokens cannot be empty") + if max_new_tokens <= 0: + raise ValueError("max_new_tokens must be positive") + if temperature <= 0: + raise ValueError("temperature must be positive") + if top_k == 0: + raise ValueError("top_k cannot be 0") + if top_p < 0 or top_p > 1: + raise ValueError("top_p must be in [0, 1]") + if past_len < 0: + raise ValueError("past_len must be non-negative") + if seed is not None and seed < 0: + raise ValueError("seed must be non-negative") + + generated = list(inputs) + use_external_cache = bool(use_cache and kcache_array is not None and vcache_array is not None) + null_cache = ctypes.POINTER(llaisysTensor_t)() + active_kcache = kcache_array if use_external_cache else null_cache + active_vcache = vcache_array if use_external_cache else null_cache + + if not use_external_cache and past_len > len(generated): + raise ValueError( + f"past_len={past_len} cannot exceed input length={len(generated)} for the current call" + ) + + output = list(generated) if use_external_cache else list(generated[past_len:]) + cached_token_len = past_len + sample_step = 0 + + # Prefill phase + prefill_seed = (seed + sample_step) if seed is not None else -1 + next_token = self._infer_tokens(generated, active_kcache, active_vcache, + past_len if use_external_cache else 0, temperature, top_k, top_p, + prefill_seed) + output.append(next_token) + generated.append(next_token) + cached_token_len +=len(output)-1 + sample_step += 1 + + + # Decode phase + for _ in range(max_new_tokens - 1): + if next_token == self.eos_token_id: + break + + if use_external_cache: + step_seed = (seed + sample_step) if seed is not None else -1 + next_token = self._infer_tokens([next_token], active_kcache, active_vcache, cached_token_len, + temperature, top_k, top_p, step_seed) + cached_token_len += 1 + else: + step_seed = (seed + sample_step) if seed is not None else -1 + next_token = self._infer_tokens(generated, active_kcache, active_vcache, 0, + 
temperature, top_k, top_p, step_seed) + + generated.append(next_token) + output.append(next_token) + sample_step += 1 + + return output - # TODO: Implement generate function + def _infer_tokens(self, tokens: Sequence[int], kcache_array, vcache_array, + past_len: int, temperature: float, top_k: int, top_p: float, seed: int = 42) -> int: + """Perform inference on token sequence.""" + ntokens = len(tokens) + TokenArrayType = ctypes.c_int64 * ntokens + input_token_array = TokenArrayType(*tokens) - return [] + return LIB_LLAISYS.llaisysQwen2ModelInfer( + self.model, + input_token_array, + ctypes.c_size_t(ntokens), + kcache_array, + vcache_array, + ctypes.c_size_t(past_len), + ctypes.c_float(temperature), + ctypes.c_int(top_k), + ctypes.c_float(top_p), + ctypes.c_int64(seed), + ) \ No newline at end of file diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc..c2084eab 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -1,6 +1,6 @@ from .libllaisys import LIB_LLAISYS from .tensor import Tensor -from ctypes import c_float, c_int +from ctypes import c_float, c_int, c_int64 class Ops: @@ -53,3 +53,17 @@ def self_attention(attn_val: Tensor, q: Tensor, k: Tensor, v: Tensor, scale: flo @staticmethod def swiglu(out: Tensor, gate: Tensor, up: Tensor): LIB_LLAISYS.llaisysSwiGLU(out.lib_tensor(), gate.lib_tensor(), up.lib_tensor()) + + @staticmethod + def random_sample(sample_idx: Tensor, sample_val: Tensor, logits: Tensor, temperature: float, top_k: int, + top_p: float, seed: int | None = None): + seed_arg = -1 if seed is None else int(seed) + LIB_LLAISYS.llaisysRandomSample( + sample_idx.lib_tensor(), + sample_val.lib_tensor(), + logits.lib_tensor(), + c_float(temperature), + c_int(top_k), + c_float(top_p), + c_int64(seed_arg), + ) diff --git a/python/llaisys/tensor.py b/python/llaisys/tensor.py index 1466d851..5a208ffd 100644 --- a/python/llaisys/tensor.py +++ b/python/llaisys/tensor.py @@ -95,3 +95,14 @@ def slice(self, dim: int, start: int, 
end: int): self._tensor, c_size_t(dim), c_size_t(start), c_size_t(end) ) ) + def reshape(self, *shape: int): + _shape = (c_size_t * len(shape))(*shape) + return Tensor( + tensor=LIB_LLAISYS.tensorReshape(self._tensor, _shape, c_size_t(len(shape))) + ) + def to(self, device: DeviceType, device_id: int = 0): + return Tensor( + tensor=LIB_LLAISYS.tensorTo( + self._tensor, llaisysDeviceType_t(device), c_int(device_id) + ) + ) \ No newline at end of file diff --git a/python/pyproject.toml b/python/llaisyscore/pyproject.toml similarity index 100% rename from python/pyproject.toml rename to python/llaisyscore/pyproject.toml diff --git a/python/setup.cfg b/python/llaisyscore/setup.cfg similarity index 77% rename from python/setup.cfg rename to python/llaisyscore/setup.cfg index b35fc65f..0d8e2175 100644 --- a/python/setup.cfg +++ b/python/llaisyscore/setup.cfg @@ -7,12 +7,20 @@ license = MIT [options] packages = find: +package_dir = + = .. include_package_data = True zip_safe = False install_requires = torch>=2.4.0 transformers accelerate + huggingface-hub + +[options.packages.find] +where = .. +include = + llaisys* [options.package_data] llaisys = diff --git a/python/server-project/pyproject.toml b/python/server-project/pyproject.toml new file mode 100644 index 00000000..8fe2f47a --- /dev/null +++ b/python/server-project/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/python/server-project/setup.cfg b/python/server-project/setup.cfg new file mode 100644 index 00000000..97fe2dca --- /dev/null +++ b/python/server-project/setup.cfg @@ -0,0 +1,22 @@ +[metadata] +name = llaisys-server +version = 0.1.0 +description = Chat server project for LLAISYS +author = Pan Zezhong +license = MIT + +[options] +packages = find: +package_dir = + = .. +include_package_data = True +zip_safe = False +install_requires = + llaisys>=0.1.0 + fastapi + uvicorn + +[options.packages.find] +where = .. 
+include = + server* diff --git a/python/server/README.md b/python/server/README.md new file mode 100644 index 00000000..80a8eba3 --- /dev/null +++ b/python/server/README.md @@ -0,0 +1,68 @@ +# LLAISYS Chat Server (Project #3) + +A minimal Python chat server built on top of `llaisys.models.Qwen2`. + +## Install deps + +```bash +python3 -m pip install fastapi uvicorn +``` + +## Start server + +```bash +cd python +uvicorn server.app:app --host 0.0.0.0 --port 8000 +``` + +Open chat UI: + +```bash +http://127.0.0.1:8000/ +``` + +Optional environment variables: + +- `LLAISYS_MODEL_PATH`: local model path (default: `/home/vankari/code/DeepSeek-R1-Distill-Qwen-1.5B/`; if path does not exist, falls back to HF model) +- `LLAISYS_DEVICE`: `cpu` or `nvidia` (default: `cpu`) +- `LLAISYS_MAX_NEW_TOKENS` (default: `128`) +- `LLAISYS_TOP_K` (default: `50`) +- `LLAISYS_TOP_P` (default: `0.8`) +- `LLAISYS_TEMPERATURE` (default: `0.8`) +- `LLAISYS_USE_CACHE` (default: `true`) +- `LLAISYS_SERVE_MODEL_NAME` (default: `llaisys-qwen2`) + +## API + +### Health + +```bash +curl http://127.0.0.1:8000/healthz +``` + +### Chat completion + +```bash +curl -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llaisys-qwen2", + "messages": [{"role": "user", "content": "你好,介绍一下你自己"}], + "max_tokens": 64, + "top_k": 50, + "top_p": 0.8, + "temperature": 0.8, + "stream": false + }' +``` + +### Streaming completion (SSE) + +```bash +curl -N -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "讲个笑话"}], + "stream": true + }' +``` diff --git a/python/server/__init__.py b/python/server/__init__.py new file mode 100644 index 00000000..1ef54839 --- /dev/null +++ b/python/server/__init__.py @@ -0,0 +1 @@ +"""LLAISYS chat server package.""" diff --git a/python/server/app.py b/python/server/app.py new file mode 100644 index 00000000..673902b6 --- /dev/null +++ 
b/python/server/app.py @@ -0,0 +1,255 @@ +import json +import re +import time +import uuid +from pathlib import Path +from typing import Dict, Iterator, List, Optional + +from fastapi import FastAPI, HTTPException +from fastapi.responses import FileResponse, StreamingResponse +from pydantic import BaseModel + +from .config import CONFIG +from .engine import ENGINE +from .schemas import ( + ChatCompletionRequest, + ChatCompletionResponse, + ChatChoice, + ChatResponseMessage, + Usage, +) + +app = FastAPI(title="LLAISYS Chat Server", version="0.1.0") +UI_INDEX = Path(__file__).resolve().parent / "ui" / "index.html" + + +class SessionCreateRequest(BaseModel): + session_id: Optional[str] = None + + +@app.get("/") +def chat_ui(): + if not UI_INDEX.exists(): + raise HTTPException(status_code=404, detail="UI page not found") + return FileResponse(UI_INDEX) + + +@app.get("/healthz") +def healthz(): + return {"status": "ok", "device": CONFIG.device, "model": CONFIG.default_model_name} + + +def _chunk_payload(chunk_id: str, model_name: str, content: str, finish_reason=None): + return { + "id": chunk_id, + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": model_name, + "choices": [ + { + "index": 0, + "delta": {"content": content} if content else {}, + "finish_reason": finish_reason, + } + ], + } + +def _normalize_think_text(text: str, prompt_has_think_tag: bool) -> str: + if not text: + return text + has_open = re.search(r"", text) is not None + has_close = re.search(r"", text) is not None + if prompt_has_think_tag and has_close and not has_open: + text = f"{text}" + has_open = True + if prompt_has_think_tag and has_open and not has_close: + text = f"{text}" + return text + + +def _prompt_has_think_tag(messages: List[Dict[str, str]]) -> bool: + prompt = ENGINE._tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=False, + ) + return "" in prompt or "" in prompt + + +def _filter_newlines(text: str) -> str: 
+ if not text: + return text + return text.replace("\n", "") + + +def _longest_suffix_prefix(value: str, token: str) -> int: + upper = min(len(value), len(token) - 1) + for n in range(upper, 0, -1): + if value.endswith(token[:n]): + return n + return 0 + + +def _normalize_stream_piece(piece: str, prompt_has_think_tag: bool, seen_open_tag: bool): + start_tag = "" + end_tag = "" + source = piece + out_parts = [] + + while source: + start_idx = source.find(start_tag) + end_idx = source.find(end_tag) + + next_idx = -1 + tag_type = "" + if start_idx != -1 and (end_idx == -1 or start_idx < end_idx): + next_idx = start_idx + tag_type = "start" + elif end_idx != -1: + next_idx = end_idx + tag_type = "end" + + if next_idx == -1: + keep = max( + _longest_suffix_prefix(source, start_tag), + _longest_suffix_prefix(source, end_tag), + ) + if keep > 0: + out_parts.append(source[:-keep]) + return "".join(out_parts), source[-keep:], seen_open_tag + out_parts.append(source) + return "".join(out_parts), "", seen_open_tag + + out_parts.append(source[:next_idx]) + source = source[next_idx + (len(start_tag) if tag_type == "start" else len(end_tag)) :] + + if tag_type == "start": + if not seen_open_tag: + out_parts.append(start_tag) + seen_open_tag = True + else: + if prompt_has_think_tag and not seen_open_tag: + out_parts.append(start_tag) + seen_open_tag = True + out_parts.append(end_tag) + + return "".join(out_parts), "", seen_open_tag + + +def _stream_completion(req: ChatCompletionRequest, model_name: str) -> Iterator[str]: + chunk_id = f"chatcmpl-{uuid.uuid4().hex}" + messages = [m.model_dump() for m in req.messages] + prompt_has_think_tag = _prompt_has_think_tag(messages) + seen_open_tag = False + carry = "" + + def flush_payload(content: str) -> Iterator[str]: + content = _filter_newlines(content) + if not content: + return + payload = _chunk_payload(chunk_id, model_name, content) + yield f"data: {json.dumps(payload, ensure_ascii=False)}\n\n" + + if prompt_has_think_tag: + 
seen_open_tag = True + yield from flush_payload("") + + for piece in ENGINE.stream_generate( + messages=messages, + max_tokens=req.max_tokens or CONFIG.default_max_new_tokens, + top_k=req.top_k if req.top_k is not None else CONFIG.default_top_k, + top_p=req.top_p if req.top_p is not None else CONFIG.default_top_p, + temperature=req.temperature if req.temperature is not None else CONFIG.default_temperature, + use_cache=req.use_cache if req.use_cache is not None else CONFIG.default_use_cache, + session_id=req.session_id, + ): + if not piece: + continue + merged = carry + piece + normalized_piece, carry, seen_open_tag = _normalize_stream_piece( + merged, + prompt_has_think_tag, + seen_open_tag, + ) + if normalized_piece: + yield from flush_payload(normalized_piece) + + if carry: + final_piece, _, _ = _normalize_stream_piece(carry, prompt_has_think_tag, seen_open_tag) + if final_piece: + yield from flush_payload(final_piece) + + done_payload = _chunk_payload(chunk_id, model_name, "", finish_reason="stop") + yield f"data: {json.dumps(done_payload, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + +@app.post("/v1/chat/completions") +def chat_completions(req: ChatCompletionRequest): + if not req.messages: + raise HTTPException(status_code=400, detail="messages cannot be empty") + + messages = [m.model_dump() for m in req.messages] + model_name = req.model or CONFIG.default_model_name + if req.stream: + return StreamingResponse(_stream_completion(req, model_name), media_type="text/event-stream") + + result = ENGINE.generate( + messages=messages, + max_tokens=req.max_tokens or CONFIG.default_max_new_tokens, + top_k=req.top_k if req.top_k is not None else CONFIG.default_top_k, + top_p=req.top_p if req.top_p is not None else CONFIG.default_top_p, + temperature=req.temperature if req.temperature is not None else CONFIG.default_temperature, + use_cache=req.use_cache if req.use_cache is not None else CONFIG.default_use_cache, + session_id=req.session_id, + ) + + 
completion_text = _normalize_think_text(result["completion_text"], _prompt_has_think_tag(messages)).strip() + completion_text = _filter_newlines(completion_text) + + completion_id = f"chatcmpl-{uuid.uuid4().hex}" + response = ChatCompletionResponse( + id=completion_id, + object="chat.completion", + created=result["created"], + model=model_name, + choices=[ + ChatChoice( + index=0, + message=ChatResponseMessage(role="assistant", content=completion_text), + finish_reason="stop", + ) + ], + usage=Usage( + prompt_tokens=len(result["prompt_ids"]), + completion_tokens=len(result["completion_ids"]), + total_tokens=len(result["prompt_ids"]) + len(result["completion_ids"]), + ), + ) + return response + + +@app.post("/v1/sessions") +def create_session(req: SessionCreateRequest): + session_id = ENGINE.create_session(req.session_id) + return {"session_id": session_id} + + +@app.get("/v1/sessions") +def list_sessions(): + return {"sessions": ENGINE.list_sessions()} + + +@app.delete("/v1/sessions/{session_id}") +def delete_session(session_id: str): + ENGINE.delete_session(session_id) + return {"deleted": session_id} + + +@app.get("/v1/sessions/{session_id}") +def get_session(session_id: str): + try: + messages = ENGINE.get_session_messages(session_id) + except KeyError: + raise HTTPException(status_code=404, detail="session not found") + return {"session_id": session_id, "messages": messages} diff --git a/python/server/config.py b/python/server/config.py new file mode 100644 index 00000000..4a1e316f --- /dev/null +++ b/python/server/config.py @@ -0,0 +1,38 @@ +import os +from pathlib import Path + + +def _env_bool(name: str, default: bool) -> bool: + value = os.getenv(name) + if value is None: + return default + return value.strip().lower() in {"1", "true", "yes", "on"} + + +class ServerConfig: + def __init__(self) -> None: + self.default_local_model_path = "/home/vankari/code/DeepSeek-R1-Distill-Qwen-1.5B/" + self.configured_model_path = os.getenv("LLAISYS_MODEL_PATH", 
self.default_local_model_path) + self.model_path = self._resolve_existing_model_path(self.configured_model_path) + self.device = os.getenv("LLAISYS_DEVICE", "cpu").strip().lower() + self.default_max_new_tokens = int(os.getenv("LLAISYS_MAX_NEW_TOKENS", "128")) + self.default_top_k = int(os.getenv("LLAISYS_TOP_K", "10")) + self.default_top_p = float(os.getenv("LLAISYS_TOP_P", "0.8")) + self.default_temperature = float(os.getenv("LLAISYS_TEMPERATURE", "0.8")) + self.default_use_cache = _env_bool("LLAISYS_USE_CACHE", True) + self.default_model_name = os.getenv("LLAISYS_SERVE_MODEL_NAME", "llaisys-qwen2") + + if self.device not in {"cpu", "nvidia"}: + raise ValueError("LLAISYS_DEVICE must be 'cpu' or 'nvidia'") + + @staticmethod + def _resolve_existing_model_path(path_value: str | None) -> str | None: + if path_value is None: + return None + candidate = Path(path_value).expanduser() + if candidate.exists() and candidate.is_dir(): + return str(candidate) + return None + + +CONFIG = ServerConfig() diff --git a/python/server/engine.py b/python/server/engine.py new file mode 100644 index 00000000..7d18bb28 --- /dev/null +++ b/python/server/engine.py @@ -0,0 +1,100 @@ +import threading +import time +from typing import List, Dict, Any, Iterator, Optional + +import llaisys + +from .config import CONFIG + + +class ChatEngine: + def __init__(self) -> None: + self._lock = threading.Lock() + self._backend = llaisys.backend.InferenceBackend( + model_path=CONFIG.model_path, + device=llaisys.DeviceType.CPU if CONFIG.device == "cpu" else llaisys.DeviceType.NVIDIA, + ) + self._default_session_id = "engine-default" + self._backend.create_session(self._default_session_id) + self._tokenizer = self._backend._tokenizer + + def _reset_default_session(self, messages: List[Dict[str, str]]) -> None: + self._backend.delete_session(self._default_session_id) + self._backend.create_session(self._default_session_id, messages=list(messages)) + + def _ensure_named_session(self, session_id: str, 
messages: List[Dict[str, str]]) -> None: + try: + self._backend.get_session(session_id) + self._backend.replace_messages(session_id, messages) + except KeyError: + self._backend.create_session(session_id, messages=list(messages)) + + def _build_prompt(self, messages: List[Dict[str, str]]) -> str: + return self._tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False) + + def generate(self, messages: List[Dict[str, str]], max_tokens: int, top_k: int, top_p: float, temperature: float, use_cache: bool, + session_id: Optional[str] = None) -> Dict[str, Any]: + with self._lock: + active_session_id = session_id or self._default_session_id + if session_id: + self._ensure_named_session(active_session_id, messages) + else: + self._reset_default_session(messages) + result = self._backend.generate( + session_id=active_session_id, + max_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=use_cache, + append_assistant_message=True, + ) + + return { + "prompt_ids": result["prompt_ids"], + "completion_ids": result["completion_ids"], + "completion_text": result["completion_text"], + "created": int(time.time()), + } + + def stream_generate(self, messages: List[Dict[str, str]], max_tokens: int, top_k: int, top_p: float, temperature: float, use_cache: bool, + session_id: Optional[str] = None) -> Iterator[str]: + with self._lock: + active_session_id = session_id or self._default_session_id + if session_id: + self._ensure_named_session(active_session_id, messages) + else: + self._reset_default_session(messages) + for token_id in self._backend.stream_generate( + session_id=active_session_id, + max_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=use_cache, + append_assistant_message=True, + ): + yield token_id + + def get_session_messages(self, session_id: str) -> List[Dict[str, str]]: + with self._lock: + state = self._backend.get_session(session_id) + return 
list(state.messages) + + def create_session(self, session_id: Optional[str] = None) -> str: + with self._lock: + state = self._backend.create_session(session_id=session_id) + return state.session_id + + def list_sessions(self) -> List[str]: + with self._lock: + return [item.session_id for item in self._backend.list_sessions()] + + def delete_session(self, session_id: str) -> None: + if session_id == self._default_session_id: + return + with self._lock: + self._backend.delete_session(session_id) + + +ENGINE = ChatEngine() diff --git a/python/server/schemas.py b/python/server/schemas.py new file mode 100644 index 00000000..682ad8d5 --- /dev/null +++ b/python/server/schemas.py @@ -0,0 +1,45 @@ +from typing import List, Optional +from pydantic import BaseModel, Field + + +class ChatMessage(BaseModel): + role: str + content: str + + +class ChatCompletionRequest(BaseModel): + model: Optional[str] = None + session_id: Optional[str] = None + messages: List[ChatMessage] + max_tokens: Optional[int] = Field(default=None, ge=1) + top_k: Optional[int] = None + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + temperature: Optional[float] = Field(default=None, gt=0.0) + stream: bool = False + use_cache: Optional[bool] = None + + +class ChatResponseMessage(BaseModel): + role: str + content: str + + +class ChatChoice(BaseModel): + index: int + message: ChatResponseMessage + finish_reason: str + + +class Usage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class ChatCompletionResponse(BaseModel): + id: str + object: str + created: int + model: str + choices: List[ChatChoice] + usage: Usage diff --git a/python/server/ui/index.html b/python/server/ui/index.html new file mode 100644 index 00000000..79a2a5a7 --- /dev/null +++ b/python/server/ui/index.html @@ -0,0 +1,783 @@ + + + + + + LLAISYS Chat UI + + + + + + + LLAISYS Chat + + + 新会话 + 删除会话 + + 就绪 + + + + + + 发送 + 清空对话 + + + 流式显示 + + + + + + + + + diff --git a/report/image.png 
b/report/image.png new file mode 100644 index 00000000..c7d534df Binary files /dev/null and b/report/image.png differ diff --git a/report/report_chatbot.md b/report/report_chatbot.md new file mode 100644 index 00000000..77b19092 --- /dev/null +++ b/report/report_chatbot.md @@ -0,0 +1,146 @@ +# 本项目基于llaisys实现了一个聊天AI(方向3) +## 项目依赖(与llaisys保持一致) +- 编译工具:[Xmake](https://xmake.io/) +- C++编译器:MSVC(Windows)或Clang或GCC +- Python >= 3.9 +## 项目运行 +- fork 本仓库 +- 编译运行 + ```bash + # 编译c++代码 + xmake + # 安装llaisys共享库 + xmake install + # 安装core包 + python3 -m pip install -e ./python/llaisyscore --user --break-system-packages + # 安装server包 + python3 -m pip install -e ./python/server-project --user --break-system-packages + ``` +- 从[huggingface](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)手动下载权重并指定权重加载目录(或使用snapshot download下载) +## 实现功能 +1. 实现随机采样算子randomsample(sample_val, sample_idx ,logits, temperature, top_k, top_p) + - temperature:按如下公式 $$ y = t-softmax(x,temperature),\ y_i = \frac{\frac{x_i}{temperature}}{\sum_j \frac{x_j}{temperature}} $$ scale logits的系数 + - top_k:按概率排序(t-softmax归一后的输出)后选择最高k个token作为candidate + - top_p:经过top_k选取后,再对logits进行一次普通softmax,指定一个累积概率的threshold,累积概率低于这个threshold的token选入candidate + - output:对经过top_p选取后的logits再使用一次普通softmax,得到最后的token分布,按照分布采样的结果的val和idx写入相应张量 + +2. 基于llaisys实现 Project #3 聊天服务(FastAPI + Web UI) + - 后端入口:`python/server/app.py` + - 推理引擎:`python/server/engine.py` + - 会话与缓存后端:`python/llaisys/backend/inference_backend.py` + - KV-Cache池:`python/llaisys/models/kvcachepool.py` + +## project3 核心实现说明 +1. OpenAI风格 chat-completion API + - `POST /v1/chat/completions`:支持非流式与流式(SSE)返回 + - `GET /healthz`:健康检查 + - `GET /`:返回前端聊天页面 + +2. 会话管理能力 + - `POST /v1/sessions`:创建会话 + - `GET /v1/sessions`:列出会话 + - `GET /v1/sessions/{session_id}`:读取会话消息 + - `DELETE /v1/sessions/{session_id}`:删除会话 + - `InferenceBackend`中维护`SessionState`,支持`append_message`和`replace_messages`,满足“编辑历史并重生成”需求 + +3. 
KVCache复用与前缀匹配 + - 基于trie + session slots实现的Cache pool,前者维护session的tokens prefix,后者维护session的KVCache(并非ref count based的trie,所以prepare的时候会额外多一个复制prefix的开销) + - 在`KVCachePool.prepare_session`阶段计算当前prompt与历史缓存的最长公共前缀 + - 复用可复用前缀,减少重复prefill + - 生成完成后通过`commit_session`提交最新token历史 + - 删除会话时同步释放对应cache资源 + +4. 非流式与流式推理路径 + - 非流式:调用`model.generate(...)`,按输入长度切分`prompt_ids`和`completion_ids` + - 流式:使用`model._infer_tokens(...)`逐token迭代 + - 首次执行prefill + - 后续每步单token decode + - 过滤EOS,'\n'等特殊token,避免前端显示结束标记 + +5. 前端UI能力(`python/server/ui/index.html`) + - 多会话切换 + - 用户消息“编辑重生成” + - 流式开关 + - 对`...`进行分段展示(思考区/回复区) + +6. 输出规范化 + - 对流式与非流式输出进行一致性处理 + - 处理``标签开闭不对称问题 + - 过滤换行符(`\n`)以匹配当前前端展示要求 + +## 测试与验证 + - 随机采样算子测试文件位于`test/ops/random_sample.py`,主要对random_sample的极端p值(p<=0, p>=1),极端k值(k=1),以及(固定temperature,top_k,top_p组合采样,与pytorch实现进行容差校验) + - 后端会话与流式行为:`test/test_inference_backend.py` + - 覆盖会话创建/删除/替换历史 + - 覆盖流式路径“首步prefill + 后续单token解码” + - 服务端接口行为:`test/test_server.py` + - 覆盖`healthz`、`/`、`/v1/chat/completions`(流式/非流式) + - 覆盖`/v1/sessions`增删查 + - 配置行为:`test/test_server_config.py` + - 覆盖模型路径环境变量存在/不存在时的分支 + +## 演示效果 +1. 启动服务 + ```bash + cd ./python + python3 -m uvicorn server.app:app --host 0.0.0.0 --port {port} + ``` +2. 
按下面的调用示例或者直接在webgui进行交互 + - 非流式调用示例 + ```bash + curl -X POST http://127.0.0.1:{port}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "你好,介绍一下你自己"}], + "stream": false, + "max_tokens": 64 + }' + ``` + + - 流式调用示例(SSE) + ```bash + curl -N -X POST http://127.0.0.1:{port}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "讲个笑话"}], + "stream": true + }' + ``` + + - 会话接口示例 + ```bash + # 创建会话 + curl -X POST http://127.0.0.1:{port}/v1/sessions -H "Content-Type: application/json" -d '{"session_id":"demo-session"}' + # 查询会话列表 + curl http://127.0.0.1:{port}/v1/sessions + # 删除会话 + curl -X DELETE http://127.0.0.1:{port}/v1/sessions/demo-session + # 非流式 chat-completions(指定 session_id) + curl -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "session_id":"{session_id }", + "messages":[{"role":"user","content":"你好,介绍一下你自己"}], + "stream":false, + "max_tokens":64 + }' + # 流式 chat-completions(指定 session_id) + curl -N -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "session_id":"demo-session-001", + "messages":[{"role":"user","content":"讲个笑话"}], + "stream":true, + "max_tokens":64 + }' + ``` +3. webui展示 +  + +## 说明 +- 当前项目3实现聚焦“单实例可用聊天服务 + 会话管理 + KV缓存复用 + 流式输出”。未实现多用户管理及连续批处理等功能。 +## todo +- 项目4跟进 +- 项目1跟进 +- CUDA linear算子优化 + diff --git a/report/report_cuda.md b/report/report_cuda.md new file mode 100644 index 00000000..4427368a --- /dev/null +++ b/report/report_cuda.md @@ -0,0 +1,457 @@ +# LLAISYS CUDA 适配报告 + +## 1. 项目目标与范围 + +本报告记录了 LLAISYS 在 NVIDIA CUDA 平台上的适配工作,覆盖内容包括: + +- 前置依赖安装 +- CUDA 构建与安装流程 +- CUDA Runtime API 与算子实现 +- Qwen2 GPU 推理接入与 CPU 兼容性说明 +- 复现与验证方法 + +本次适配遵循原则: + +1. 不改变现有 CPU 推理接口语义。 +2. CUDA 路径以算子原生 kernel 为主,不采用“CPU 计算后拷回 GPU”的默认策略。 +3. Python 层 API 与现有调用方式保持兼容。 + +--- + +## 2. 
前置依赖安装 + +### 2.1 系统与工具链 + +建议环境(本次验证环境): + +- Linux x86_64 +- NVIDIA GPU(A100,SM80) +- CUDA Toolkit 12.x(本仓库验证为 12.8) +- GCC / G++(支持 C++17) +- Xmake +- Python 3.9+ + +### 2.2 Python 依赖 + +在仓库根目录执行: + +```bash +python3 -m pip install -e ./python/llaisyscore --user --break-system-packages +python3 -m pip install -e ./python/server-project --user --break-system-packages +``` + +如需推理测试(含 HF 对照),还需确保环境中可用: + +- torch +- transformers +- accelerate +- safetensors +- huggingface-hub + +--- + +## 3. 编译与安装 + +### 3.1 CUDA 开关编译 + +```bash +xmake f --nv-gpu=y -cv +xmake -v +xmake install +``` + +其中 `xmake install` 会将 `libllaisys.so` 同步到 `python/llaisys/libllaisys/`。 + +### 3.2 构建系统关键配置 + +文件:`xmake.lua` + +```lua +option("nv-gpu") + set_default(false) + set_showmenu(true) + set_description("Whether to compile implementations for Nvidia GPU") +option_end() + +if has_config("nv-gpu") then + add_defines("ENABLE_NVIDIA_API") + includes("xmake/nvidia.lua") +end +``` + +文件:`xmake/nvidia.lua` + +```lua +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17") + set_warnings("all", "error") + set_policy("build.cuda.devlink", true) + + add_files("../src/device/nvidia/*.cu") + + add_cugencodes("compute_80") + add_cuflags("--use_fast_math") + add_cuflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC") + + add_links("cudart") + add_linkdirs("/usr/local/cuda/lib64") + +target_end() +``` + +说明: + +- `build.cuda.devlink` + `-Xcompiler=-fPIC` 解决了 CUDA device link 与共享库链接问题。 +- `compute_80` 与当前测试硬件匹配。 + +--- + +## 4. 
CUDA Runtime API 实现 + +文件:`src/device/nvidia/nvidia_runtime_api.cu` + +主要实现了: + +- `getDeviceCount` / `setDevice` +- stream 创建、销毁、同步 +- `cudaMalloc/cudaFree`、`cudaMallocHost/cudaFreeHost` +- `memcpySync/memcpyAsync` 的 H2H/H2D/D2H/D2D 映射 + +代码片段(仓库实现): + +```cpp +int getDeviceCount() { + int count = 0; + CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; +} + +void *mallocDevice(size_t size) { + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; +} + +void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { + cudaMemcpyKind cuda_kind = cudaMemcpyDefault; + switch (kind) { + case LLAISYS_MEMCPY_H2H: + cuda_kind = cudaMemcpyHostToHost; + break; + case LLAISYS_MEMCPY_H2D: + cuda_kind = cudaMemcpyHostToDevice; + break; + case LLAISYS_MEMCPY_D2H: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case LLAISYS_MEMCPY_D2D: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + ASSERT(false, "Unsupported memcpy kind in CUDA memcpySync"); + } + CUDA_CHECK(cudaMemcpy(dst, src, size, cuda_kind)); +} +``` + +--- + +## 5. 
CUDA 算子实现 + +### 5.1 通用 dtype 读写工具 + +文件:`src/utils/cuda_type_utils.cuh` + +该工具统一支持 F32/F16/BF16 的 device 端读取与写回,避免各算子重复实现类型分发。 + +```cpp +__device__ __forceinline__ float load_as_float(const void *ptr, size_t idx, llaisysDataType_t dtype) { + if (dtype == LLAISYS_DTYPE_F32) return load_as_float_f32(ptr, idx); + if (dtype == LLAISYS_DTYPE_F16) return load_as_float_f16(ptr, idx); + return load_as_float_bf16(ptr, idx); +} +``` + +### 5.2 add(F32/F16/BF16) + +文件:`src/ops/add/cuda/add_cuda.cu` + +```cpp +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) return; + c[idx] = a[idx] + b[idx]; +} +``` + +对于 FP16/BF16 使用对应半精度运算指令。 + +### 5.3 embedding + +文件:`src/ops/embedding/cuda/embedding_cuda.cu` + +```cpp +__global__ void embedding_kernel(void *out, const int64_t *index, const void *weight, + llaisysDataType_t dtype, size_t num, size_t dim) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = num * dim; + if (idx >= total) return; + + size_t row = idx / dim; + size_t col = idx % dim; + int64_t token = index[row]; + size_t src_idx = static_cast(token) * dim + col; + float val = load_as_float(weight, src_idx, dtype); + store_from_float(out, idx, val, dtype); +} +``` + +同时在 `src/ops/embedding/op.cpp` 中补充了 GPU index 越界校验时的 D2H 安全路径(避免直接把 device 指针当 host 指针解引用)。 + +### 5.4 linear + +文件:`src/ops/linear/cuda/linear_cuda.cu` + +```cpp +__global__ void linear_kernel(void *out, const void *in, const void *weight, const void *bias, + llaisysDataType_t type, size_t batch_size, size_t in_features, + size_t out_features, bool has_bias) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch_size * out_features; + if (idx >= total) return; + + size_t b = idx / out_features; + size_t o = idx % out_features; + + float sum = 0.0f; + for (size_t i = 0; i < in_features; ++i) { + float x = load_as_float(in, b * in_features + i, type); + float w 
= load_as_float(weight, o * in_features + i, type); + sum += x * w; + } + if (has_bias) sum += load_as_float(bias, o, type); + store_from_float(out, idx, sum, type); +} +``` + +### 5.5 argmax + +文件:`src/ops/argmax/cuda/argmax_cuda.cu` + +```cpp +__global__ void argmax_kernel(int64_t *mi, void *mv, const void *v, llaisysDataType_t type, size_t numel) { + if (blockIdx.x != 0 || threadIdx.x != 0) return; + + size_t best_idx = 0; + float best_val = load_as_float(v, 0, type); + for (size_t i = 1; i < numel; ++i) { + float cur = load_as_float(v, i, type); + if (cur > best_val) { + best_val = cur; + best_idx = i; + } + } + mi[0] = static_cast(best_idx); + store_from_float(mv, 0, best_val, type); +} +``` + +### 5.6 rms_norm + +文件:`src/ops/rms_norm/cuda/rms_norm_cuda.cu` + +```cpp +__global__ void rms_norm_kernel(void *out, const void *in, const void *weight, + llaisysDataType_t type, size_t feature_dim, float eps) { + size_t row = blockIdx.x; + extern __shared__ float sdata[]; + + float local_sum = 0.0f; + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float val = load_as_float(in, row * feature_dim + col, type); + local_sum += val * val; + } + sdata[threadIdx.x] = local_sum; + __syncthreads(); + + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + __syncthreads(); + } + + float inv_rms = rsqrtf(sdata[0] / static_cast(feature_dim) + eps); + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float x = load_as_float(in, row * feature_dim + col, type); + float w = load_as_float(weight, col, type); + store_from_float(out, row * feature_dim + col, x * w * inv_rms, type); + } +} +``` + +### 5.7 rope + +文件:`src/ops/rope/cuda/rope_cuda.cu` + +```cpp +__global__ void rope_kernel(void *out, const void *in, const int64_t *pos_ids, llaisysDataType_t type, + size_t seq_len, size_t n_heads, size_t head_dim, float theta) { + // ... 
计算 phi/cos/sin 后旋转写回 +} +``` + +### 5.8 self_attention + +文件:`src/ops/self_attention/cuda/self_attention_cuda.cu` + +```cpp +__global__ void self_attention_kernel(void *attn_val, const void *q, const void *k, const void *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, + size_t nh, size_t nkvh, size_t d, size_t dv, float scale) { + // 单 kernel 内完成 causal masked attention 的分数、归一化与加权求和 +} +``` + +### 5.9 swiglu + +文件:`src/ops/swiglu/cuda/swiglu_cuda.cu` + +```cpp +__global__ void swiglu_kernel(void *out, const void *gate, const void *up, + llaisysDataType_t type, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) return; + float g = load_as_float(gate, idx, type); + float u = load_as_float(up, idx, type); + float sig = g / (1.0f + expf(-g)); + store_from_float(out, idx, u * sig, type); +} +``` + +### 5.10 random_sample(temperature/top-k/top-p) + +文件:`src/ops/random_sample/cuda/random_sample_cuda.cu` + +实现流程: + +1. logits 温度缩放 +2. device 侧排序(thrust) +3. 按 top-k/top-p 截断 +4. 使用 `curand` 采样 + +```cpp +curandStatePhilox4_32_10_t rng_state; +curand_init(seed, 0ULL, 0ULL, &rng_state); +float r = curand_uniform(&rng_state); +``` + +并通过 host 侧原子计数 + 时间戳组合 seed,避免固定采样序列。 + +--- + +## 6. Qwen2 CUDA 推理支持 + +### 6.1 Python 侧模型设备接入 + +文件:`python/llaisys/models/qwen2.py` + +- 支持 `DeviceType.NVIDIA` 初始化 +- CUDA 路径权重 dtype 使用 BF16 +- 权重加载时保持 host tensor,使用 `tensorLoad` 进行 H2D + +关键代码: + +```python +def maybe_cast_tensor(tensor): + if self.device == DeviceType.CPU: + return tensor.to(torch.float32).contiguous() + elif self.device == DeviceType.NVIDIA: + return tensor.to(torch.bfloat16).contiguous() + return tensor +``` + +此处避免 `.cuda()`,防止把 device 指针走到 `tensorLoad(H2D)` 的 host 输入路径造成不兼容。 + +### 6.2 端到端验证结果 + +已通过: + +- `python test/test_infer.py --device nvidia --max_steps 128` +- `python test/test_infer.py --device nvidia --max_steps 8 --test` + +其中 `--test` 模式已与 HF 在确定性配置下对齐并 `Test passed`。 + +--- + +## 7. 
复现方法(可直接执行) + +### 7.1 编译与安装 + +```bash +xmake f --nv-gpu=y -cv +xmake -v +xmake install +``` + +### 7.2 运行时测试 + +```bash +python test/test_runtime.py --device nvidia +``` + +### 7.3 算子测试(NVIDIA) + +```bash +for f in test/ops/*.py; do + echo "===== RUN $f --device nvidia =====" + python "$f" --device nvidia + echo + done +``` + +### 7.4 Tensor 双设备迁移补充测试 + +```bash +python test/test_tensor.py --device nvidia +python test/test_tensor_dual_device.py --device_id 0 +``` + +### 7.5 Qwen2 推理测试 + +```bash +python test/test_infer.py --device nvidia --max_steps 128 +python test/test_infer.py --device nvidia --max_steps 8 --test +``` + +--- + +## 8. 兼容性说明 + +1. CPU 路径保持可用,CUDA 仅在 `--nv-gpu=y` + `ENABLE_NVIDIA_API` 条件下启用。 +2. Python API 兼容既有调用方式,不要求业务侧改接口。 +3. 采样接口未新增外部 seed 参数;CUDA 侧 seed 为实现细节,不影响现有测试接口。 + +--- + +## 9. 已知限制与后续优化方向 + +1. 当前部分 CUDA kernel 仍是基础实现(可进一步做块内优化、向量化、融合)。 +2. 与 HF 相比 decode 吞吐仍有差距,后续可优先优化: + - 减少每 token 动态分配 + - 优化采样(减少全排序开销) + - 注意力 kernel 融合/flash 化 + - 进一步减少同步与跨设备拷贝 + +--- + +## 10. 本次交付结论 + +本次已经完成从 Runtime 到算子到模型推理的 CUDA 全链路接入: + +- 构建可编译、可安装 +- `test_runtime` 与 `test/ops` 在 NVIDIA 设备通过 +- `Qwen2` 在 NVIDIA 设备可完成推理,并通过 `test_infer --test` 的一致性验证 + +即:LLAISYS 已具备可复现的 CUDA 推理能力,且与 CPU 实现保持接口兼容。 diff --git a/src/core/context/context.cpp b/src/core/context/context.cpp index 44894b9e..63756fab 100644 --- a/src/core/context/context.cpp +++ b/src/core/context/context.cpp @@ -52,7 +52,7 @@ Context::~Context() { void Context::setDevice(llaisysDeviceType_t device_type, int device_id) { // If doest not match the current runtime. 
if (_current_runtime == nullptr || _current_runtime->deviceType() != device_type || _current_runtime->deviceId() != device_id) { - auto runtimes = _runtime_map[device_type]; + auto &runtimes = _runtime_map[device_type]; CHECK_ARGUMENT((size_t)device_id < runtimes.size() && device_id >= 0, "invalid device id"); if (_current_runtime != nullptr) { _current_runtime->_deactivate(); diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab92826..c99b886f 100644 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,119 @@ #include "../runtime_api.hpp" +#include + #include #include +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err__ = (call); \ + ASSERT(err__ == cudaSuccess, "CUDA Runtime API call failed"); \ + } while (0) + namespace llaisys::device::nvidia { namespace runtime_api { int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device) { + CUDA_CHECK(cudaSetDevice(device)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream = nullptr; + CUDA_CHECK(cudaStreamCreate(&stream)); + return reinterpret_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (!stream) { + return; + } + CUDA_CHECK(cudaStreamDestroy(reinterpret_cast(stream))); } void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (!stream) { + CUDA_CHECK(cudaDeviceSynchronize()); + return; + } + CUDA_CHECK(cudaStreamSynchronize(reinterpret_cast(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (!ptr) { + return; + } + CUDA_CHECK(cudaFree(ptr)); } 
void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (!ptr) { + return; + } + CUDA_CHECK(cudaFreeHost(ptr)); } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + cudaMemcpyKind cuda_kind = cudaMemcpyDefault; + switch (kind) { + case LLAISYS_MEMCPY_H2H: + cuda_kind = cudaMemcpyHostToHost; + break; + case LLAISYS_MEMCPY_H2D: + cuda_kind = cudaMemcpyHostToDevice; + break; + case LLAISYS_MEMCPY_D2H: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case LLAISYS_MEMCPY_D2D: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + ASSERT(false, "Unsupported memcpy kind in CUDA memcpySync"); + } + CUDA_CHECK(cudaMemcpy(dst, src, size, cuda_kind)); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + cudaMemcpyKind cuda_kind = cudaMemcpyDefault; + switch (kind) { + case LLAISYS_MEMCPY_H2H: + cuda_kind = cudaMemcpyHostToHost; + break; + case LLAISYS_MEMCPY_H2D: + cuda_kind = cudaMemcpyHostToDevice; + break; + case LLAISYS_MEMCPY_D2H: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case LLAISYS_MEMCPY_D2D: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + ASSERT(false, "Unsupported memcpy kind in CUDA memcpyAsync"); + } + CUDA_CHECK(cudaMemcpyAsync(dst, src, size, cuda_kind, reinterpret_cast(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32..ddcc9772 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -7,6 +7,7 @@ #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" #include "../ops/rearrange/op.hpp" +#include "../ops/random_sample/op.hpp" #include "../ops/rms_norm/op.hpp" #include 
"../ops/rope/op.hpp" #include "../ops/self_attention/op.hpp" @@ -23,7 +24,10 @@ __C { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias) { - llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); + if(bias == nullptr) { + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, nullptr); + }else + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); } void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in) { llaisys::ops::rearrange(out->tensor, in->tensor); @@ -40,4 +44,9 @@ __C { void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up) { llaisys::ops::swiglu(out->tensor, gate->tensor, up->tensor); } + void llaisysRandomSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t logits, + float temperature, int top_k, float top_p, int64_t seed) { + llaisys::ops::random_sample(sample_idx->tensor, sample_val->tensor, logits->tensor, temperature, top_k, top_p, + seed); + } } diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 00000000..441c8d69 --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,510 @@ +#include "llaisys/models/qwen2.h" +#include "llaisys/ops.h" +#include "llaisys_tensor.hpp" + +#include +#include +#include +#include +#include + + +__C { + struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) { + // Validate input parameters + if (!meta || !device_ids || ndevice <= 0) { + std::cerr << "Invalid parameters for Qwen2 model creation" << std::endl; + return nullptr; + } + + // Allocate main model structure + auto model = static_cast(std::calloc(1, sizeof(LlaisysQwen2Model))); + if (!model) { + std::cerr << "Failed to allocate LlaisysQwen2Model" << std::endl; + return nullptr; + } + + // Initialize metadata + 
model->meta = static_cast(std::malloc(sizeof(LlaisysQwen2Meta))); + if (!model->meta) { + std::cerr << "Failed to allocate model metadata" << std::endl; + free(model); + return nullptr; + } + std::memcpy(model->meta, meta, sizeof(LlaisysQwen2Meta)); + + // Initialize weights structure + model->weights = static_cast(std::calloc(1, sizeof(LlaisysQwen2Weights))); + if (!model->weights) { + std::cerr << "Failed to allocate model weights" << std::endl; + free(model->meta); + free(model); + return nullptr; + } + + // Initialize device information + model->device = device; + model->ndevice = ndevice; + model->device_ids = static_cast(std::malloc(sizeof(int) * ndevice)); + if (!model->device_ids) { + std::cerr << "Failed to allocate device IDs" << std::endl; + free(model->weights); + free(model->meta); + free(model); + return nullptr; + } + std::memcpy(model->device_ids, device_ids, sizeof(int) * ndevice); + + + + // Input Embedding + size_t shape_in_embed[2] = { meta->voc, meta->hs }; + model->weights->in_embed = tensorCreate(shape_in_embed, 2, meta->dtype, device, device_ids[0]); + + // Helper functions for tensor array allocation + auto alloc_layer_array_2d = [&](llaisysTensor_t *&ptr, size_t dim0, size_t dim1) -> bool { + ptr = static_cast(std::malloc(sizeof(llaisysTensor_t) * meta->nlayer)); + if (!ptr) return false; + + for (size_t i = 0; i < meta->nlayer; ++i) { + size_t shape[2] = { dim0, dim1 }; + ptr[i] = tensorCreate(shape, 2, meta->dtype, model->device, model->device_ids[0]); + if (!ptr[i]) { + // Clean up previously allocated tensors on failure + for (size_t j = 0; j < i; ++j) { + tensorDestroy(ptr[j]); + } + free(ptr); + ptr = nullptr; + return false; + } + } + return true; + }; + + auto alloc_layer_array_1d = [&](llaisysTensor_t *&ptr, size_t dim0) -> bool { + ptr = static_cast(std::malloc(sizeof(llaisysTensor_t) * meta->nlayer)); + if (!ptr) return false; + + for (size_t i = 0; i < meta->nlayer; ++i) { + size_t shape[1] = { dim0 }; + ptr[i] = 
tensorCreate(shape, 1, meta->dtype, model->device, model->device_ids[0]); + if (!ptr[i]) { + // Clean up previously allocated tensors on failure + for (size_t j = 0; j < i; ++j) { + tensorDestroy(ptr[j]); + } + free(ptr); + ptr = nullptr; + return false; + } + } + return true; + }; + + // Self-Attention + alloc_layer_array_1d(model->weights->attn_norm_w, meta->hs); // [1536] + alloc_layer_array_2d(model->weights->attn_q_w, meta->hs, meta->nh * meta->dh); // [1536, 1536] + alloc_layer_array_1d(model->weights->attn_q_b, meta->nh * meta->dh); // [1536] + alloc_layer_array_2d(model->weights->attn_k_w, meta->nkvh * meta->dh, meta->hs); // [256, 1536] + alloc_layer_array_1d(model->weights->attn_k_b, meta->nkvh * meta->dh); // [256] + alloc_layer_array_2d(model->weights->attn_v_w, meta->nkvh * meta->dh, meta->hs); // [256, 1536] + alloc_layer_array_1d(model->weights->attn_v_b, meta->nkvh * meta->dh); // [256] + alloc_layer_array_2d(model->weights->attn_o_w, meta->nh * meta->dh, meta->hs); // [1536, 1536] + + // MLP + alloc_layer_array_1d(model->weights->mlp_norm_w, meta->hs); // [1536] + alloc_layer_array_2d(model->weights->mlp_gate_w, meta->di, meta->hs); // [8960, 1536] + alloc_layer_array_2d(model->weights->mlp_up_w, meta->di, meta->hs); // [8960, 1536] + alloc_layer_array_2d(model->weights->mlp_down_w, meta->hs, meta->di); // [1536, 8960] + + // Output Layer Norm + size_t shape_out_norm[1] = { meta->hs }; + model->weights->out_norm_w = tensorCreate(shape_out_norm, 1, meta->dtype, model->device, model->device_ids[0]); + + // Output Embedding + size_t shape_out_embed[2] = { meta->voc, meta->hs }; + model->weights->out_embed = tensorCreate(shape_out_embed, 2, meta->dtype, device, device_ids[0]); + + return model; + } + + void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) { + if (!model) return; + + tensorDestroy(model->weights->in_embed); + tensorDestroy(model->weights->out_embed); + tensorDestroy(model->weights->out_norm_w); + + // attn_norm_w (1d tensor 
array) + if (model->weights->attn_norm_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_norm_w[i]); + } + free(model->weights->attn_norm_w); + } + + // attn_q_w (2d tensor array) + if (model->weights->attn_q_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_q_w[i]); + } + free(model->weights->attn_q_w); + } + + // attn_q_b (1d tensor array) + if (model->weights->attn_q_b) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_q_b[i]); + } + free(model->weights->attn_q_b); + } + + // attn_k_w (2d tensor array) + if (model->weights->attn_k_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_k_w[i]); + } + free(model->weights->attn_k_w); + } + + // attn_k_b (1d tensor array) + if (model->weights->attn_k_b) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_k_b[i]); + } + free(model->weights->attn_k_b); + } + + // attn_v_w (2d tensor array) + if (model->weights->attn_v_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_v_w[i]); + } + free(model->weights->attn_v_w); + } + + // attn_v_b (1d tensor array) + if (model->weights->attn_v_b) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_v_b[i]); + } + free(model->weights->attn_v_b); + } + + // attn_o_w (2d tensor array) + if (model->weights->attn_o_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_o_w[i]); + } + free(model->weights->attn_o_w); + } + + // mlp_norm_w (1d tensor array) + if (model->weights->mlp_norm_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->mlp_norm_w[i]); + } + free(model->weights->mlp_norm_w); + } + + // mlp_gate_w (2d tensor array) + if (model->weights->mlp_gate_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + 
tensorDestroy(model->weights->mlp_gate_w[i]); + } + free(model->weights->mlp_gate_w); + } + + // mlp_up_w (2d tensor array) + if (model->weights->mlp_up_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->mlp_up_w[i]); + } + free(model->weights->mlp_up_w); + } + + // mlp_down_w (2d tensor array) + if (model->weights->mlp_down_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->mlp_down_w[i]); + } + free(model->weights->mlp_down_w); + } + + if (model->device_ids) { + free(model->device_ids); + } + + if (model->meta) { + free(model->meta); + } + + free(model); + } + + struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) { + return model->weights; + } + + int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, + llaisysTensor_t *kcache, llaisysTensor_t *vcache, size_t past_len, + float temperature, int top_k, float top_p, int64_t seed) { + if (!model || !token_ids || ntoken == 0) return -1; + + // If kv_cache != nullptr, it means KV Cache is used for performance. + bool kv_cache_used = (kcache != nullptr && vcache != nullptr); + // kcache [max_seq, nkvh, d], vcache [max_seq, nkvh, dv] + + size_t seqlen = ntoken; + size_t hs = model->meta->hs; // hidden size + size_t nh = model->meta->nh; // num heads + size_t dh = model->meta->dh; // head dim + size_t nkvh = model->meta->nkvh; // num key-value heads + size_t di = model->meta->di; // mlp intermediate dim + size_t voc = model->meta->voc; // vocab size + size_t nlayer = model->meta->nlayer; + float scale = 1.0f / std::sqrt(static_cast(dh)); // scale for attention + float rms_eps = model->meta->epsilon; // epsilon for RMSNorm + float rope_theta = model->meta->theta; // theta for RoPE + + // 1. 
intput token_ids -> tensor + size_t input_tensor_shape[1] = {seqlen}; + + llaisysTensor_t input_tensor = tensorCreate(input_tensor_shape, 1, LLAISYS_DTYPE_I64, model->device, model->device_ids[0]); + + tensorLoad(input_tensor, token_ids); + + // 2. Embedding lookup: [seqlen] -> [seqlen, hs] + size_t output_embedding_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t output_embedding_tensor = tensorCreate(output_embedding_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysEmbedding(output_embedding_tensor, input_tensor, model->weights->in_embed); + + + + // output_hidden_layer_tensor is used to store the output of the hidden layer + llaisysTensor_t output_hidden_layer_tensor = output_embedding_tensor; + size_t position_shape[1] = {seqlen}; + llaisysTensor_t position_ids = tensorCreate(position_shape, 1, LLAISYS_DTYPE_I64, model->device, model->device_ids[0]); + int64_t* pos_data_cpu = new int64_t[seqlen]; + for (size_t i = 0; i < seqlen; i++) { + if(kv_cache_used) + pos_data_cpu[i] = (int64_t)(past_len + i); // When using KV cache, position ids continue from past_len + else + pos_data_cpu[i] = (int64_t) i; + } + tensorLoad(position_ids, pos_data_cpu); + delete[] pos_data_cpu; + + + + // 3. 
Transformer hidden layers + for (size_t i = 0; i < nlayer; i++) { + // 3.1 LayerNorm before Self-attention + llaisysTensor_t attn_norm_w = model->weights->attn_norm_w[i]; + size_t output_input_layernorm_shape[2] = {seqlen, hs}; + llaisysTensor_t output_input_layernorm_tensor = tensorCreate(output_input_layernorm_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRmsNorm(output_input_layernorm_tensor, output_hidden_layer_tensor, attn_norm_w, rms_eps); + + + + // 3.2 Q projection + llaisysTensor_t attn_q_w = model->weights->attn_q_w[i]; + llaisysTensor_t attn_q_b = model->weights->attn_q_b[i]; + size_t q_tensor_shape[2] = {seqlen, nh * dh}; + llaisysTensor_t q_tensor = tensorCreate(q_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(q_tensor, output_input_layernorm_tensor, attn_q_w, attn_q_b); + + size_t q_tensor_reshape_shape[3] = {seqlen, nh, dh}; + q_tensor = tensorReshape(q_tensor, q_tensor_reshape_shape, 3); + llaisysTensor_t q_rope_tensor = tensorCreate(q_tensor_reshape_shape, 3, model->meta->dtype, model->device, model->device_ids[0]); + llaisysROPE(q_rope_tensor, q_tensor, position_ids, rope_theta); + + + + // 3.3 K projection + llaisysTensor_t attn_k_w = model->weights->attn_k_w[i]; + llaisysTensor_t attn_k_b = model->weights->attn_k_b[i]; + size_t k_tensor_shape[2] = {seqlen, nkvh * dh}; + llaisysTensor_t k_tensor = tensorCreate(k_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(k_tensor, output_input_layernorm_tensor, attn_k_w, attn_k_b); + + size_t k_tensor_reshape_shape[3] = {seqlen, nkvh, dh}; + k_tensor = tensorReshape(k_tensor, k_tensor_reshape_shape, 3); + llaisysTensor_t k_rope_tensor; + if (kv_cache_used) { + // When using KV cache, we need to update the kcache tensor + // kcache shape: [max_seq, nkvh, dh] + // Slice the kcache to get the current position to update + k_rope_tensor = tensorSlice(kcache[i], 0, past_len, past_len + seqlen); // 
Write to kcache + } else { + k_rope_tensor = tensorCreate(k_tensor_reshape_shape, 3, model->meta->dtype, model->device, model->device_ids[0]); + } + llaisysROPE(k_rope_tensor, k_tensor, position_ids, rope_theta); + + + + // 3.4 V projection + llaisysTensor_t attn_v_w = model->weights->attn_v_w[i]; + llaisysTensor_t attn_v_b = model->weights->attn_v_b[i]; + size_t v_tensor_shape[2] = {seqlen, nkvh * dh}; + llaisysTensor_t v_tensor; + if (kv_cache_used) { + // When using KV cache, we need to update the vcache tensor + // vcache shape: [max_seq, nkvh, dh] + // Slice the vcache to get the current position to update + v_tensor = tensorView(tensorSlice(vcache[i], 0, past_len, past_len + seqlen), v_tensor_shape, 2); + } else { + v_tensor = tensorCreate(v_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + } + llaisysLinear(v_tensor, output_input_layernorm_tensor, attn_v_w, attn_v_b); + + + // 3.5 Self-attention with Flash Attention + size_t output_self_attn_multihead_tensor_shape[3] = {seqlen, nh, dh}; + llaisysTensor_t output_self_attn_tensor = tensorCreate(output_self_attn_multihead_tensor_shape, 3, model->meta->dtype, model->device, model->device_ids[0]); + + // Prepare tensors for attention (with or without KV cache) + llaisysTensor_t k_for_attn, v_for_attn; + size_t kv_seq_len; + + if (kv_cache_used) { + // Use KV cache to speed up inference + // kcache [max_seq, nkvh, d], vcache [max_seq, nkvh, dv] + kv_seq_len = past_len + seqlen; + k_for_attn = tensorSlice(kcache[i], 0, 0, kv_seq_len); + v_for_attn = tensorSlice(vcache[i], 0, 0, kv_seq_len); + } else { + size_t v_tensor_reshape_shape[3] = {seqlen, nkvh, dh}; + v_for_attn = tensorReshape(v_tensor, v_tensor_reshape_shape, 3); + k_for_attn = k_rope_tensor; + kv_seq_len = seqlen; + } + + + llaisysSelfAttention(output_self_attn_tensor, q_rope_tensor, k_for_attn, v_for_attn, scale); + + size_t output_self_attn_tensor_shape[2] = {seqlen, nh * dh}; + output_self_attn_tensor = 
tensorReshape(output_self_attn_tensor, output_self_attn_tensor_shape, 2); + + + // 3.6 Self-attention output projection + llaisysTensor_t attn_o_w = model->weights->attn_o_w[i]; + size_t o_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t o_tensor = tensorCreate(o_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + + llaisysLinear(o_tensor, output_self_attn_tensor, attn_o_w, nullptr); + + // 3.7 Residual connection after attn + size_t output_res1_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t output_res1_tensor = tensorCreate(output_res1_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysAdd(output_res1_tensor, output_hidden_layer_tensor, o_tensor); + + + // 3.8 Post-attention LayerNorm + llaisysTensor_t post_attn_norm_w = model->weights->mlp_norm_w[i]; + size_t output_post_self_attn_layernorm_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t output_post_self_attn_layernorm_tensor = tensorCreate(output_post_self_attn_layernorm_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRmsNorm(output_post_self_attn_layernorm_tensor, output_res1_tensor, post_attn_norm_w, rms_eps); + + + // 3.9 MLP (Gate, Up, Down) + llaisysTensor_t mlp_gate_w = model->weights->mlp_gate_w[i]; + llaisysTensor_t mlp_up_w = model->weights->mlp_up_w[i]; + llaisysTensor_t mlp_down_w = model->weights->mlp_down_w[i]; + + size_t mlp_gate_tensor_shape[2] = {seqlen, di}; + llaisysTensor_t mlp_gate_tensor = tensorCreate(mlp_gate_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(mlp_gate_tensor, output_post_self_attn_layernorm_tensor, mlp_gate_w, nullptr); + + + size_t mlp_up_tensor_shape[2] = {seqlen, di}; + llaisysTensor_t mlp_up_tensor = tensorCreate(mlp_up_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(mlp_up_tensor, output_post_self_attn_layernorm_tensor, mlp_up_w, nullptr); + + + size_t swiglu_tensor_shape[2] = {seqlen, 
di}; + llaisysTensor_t swiglu_tensor = tensorCreate(swiglu_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysSwiGLU(swiglu_tensor, mlp_gate_tensor, mlp_up_tensor); + + + size_t mlp_down_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t mlp_down_tensor = tensorCreate(mlp_down_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(mlp_down_tensor, swiglu_tensor, mlp_down_w, nullptr); + + + + // 3.10 Residual connection after MLP + llaisysAdd(output_hidden_layer_tensor, output_res1_tensor, mlp_down_tensor); + + + + + + if(!kv_cache_used){ + tensorDestroy(k_rope_tensor); + tensorDestroy(v_tensor); + } + + // release intermediate tensors + tensorDestroy(output_input_layernorm_tensor); + tensorDestroy(q_tensor); + tensorDestroy(q_rope_tensor); + tensorDestroy(k_tensor); + tensorDestroy(output_self_attn_tensor); + tensorDestroy(o_tensor); + tensorDestroy(output_res1_tensor); + tensorDestroy(output_post_self_attn_layernorm_tensor); + tensorDestroy(mlp_gate_tensor); + tensorDestroy(mlp_up_tensor); + tensorDestroy(swiglu_tensor); + tensorDestroy(mlp_down_tensor); + } + // 4. Output LayerNorm + llaisysTensor_t final_layernorm_w = model->weights->out_norm_w; + size_t output_final_layernorm_shape[2] = {seqlen, hs}; + llaisysTensor_t output_final_layernorm_tensor = tensorCreate(output_final_layernorm_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRmsNorm(output_final_layernorm_tensor, output_hidden_layer_tensor, final_layernorm_w, rms_eps); + + + + // 5. 
Output [seqlen, voc] + size_t output_tensor_shape[2] = {seqlen, voc}; + llaisysTensor_t output_tensor = tensorCreate(output_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(output_tensor, output_final_layernorm_tensor, model->weights->out_embed, nullptr); + + + + // [seqlen, voc] -> [1, voc] + size_t output_tensor_slice_reshape_shape[1] = {voc}; + llaisysTensor_t output_tensor_slice = tensorSlice(output_tensor, 0, seqlen-1, seqlen); // only need the last token's logits + + + size_t index_shape[1] = {1}; + llaisysTensor_t index_tensor = tensorCreate(index_shape, 1, LLAISYS_DTYPE_I64, model->device, model->device_ids[0]); + llaisysTensor_t value_tensor = tensorCreate(index_shape, 1, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRandomSample(index_tensor, value_tensor, tensorView(output_tensor_slice, output_tensor_slice_reshape_shape, 1), + temperature, top_k, top_p, seed); + + + // For NVIDIA device, need to copy data back to CPU to read it + int64_t index; + if (model->device == LLAISYS_DEVICE_NVIDIA) { + // Use C++ API to convert tensor to CPU + auto index_cpu_tensor = index_tensor->tensor->to(LLAISYS_DEVICE_CPU); + index = *((int64_t *) index_cpu_tensor->data()); + } else { + index = *((int64_t *) tensorGetData(index_tensor)); + } + + tensorDestroy(position_ids); + tensorDestroy(input_tensor); + tensorDestroy(output_embedding_tensor); + tensorDestroy(output_final_layernorm_tensor); + tensorDestroy(output_tensor); + tensorDestroy(index_tensor); + tensorDestroy(value_tensor); + + return index; + } +} \ No newline at end of file diff --git a/src/llaisys/tensor.cc b/src/llaisys/tensor.cc index 5e6e5012..0e6e6f48 100644 --- a/src/llaisys/tensor.cc +++ b/src/llaisys/tensor.cc @@ -93,4 +93,17 @@ __C { size_t end) { return new LlaisysTensor{tensor->tensor->slice(dim, start, end)}; } + llaisysTensor_t tensorReshape( + llaisysTensor_t tensor, + size_t * shape, + size_t ndim) { + std::vector shape_vec(shape, 
shape + ndim); + return new LlaisysTensor{tensor->tensor->reshape(shape_vec)}; + } + llaisysTensor_t tensorTo( + llaisysTensor_t tensor, + llaisysDeviceType_t device_type, + int device_id) { + return new LlaisysTensor{tensor->tensor->to(device_type, device_id)}; + } } diff --git a/src/ops/add/cuda/add_cuda.cu b/src/ops/add/cuda/add_cuda.cu new file mode 100644 index 00000000..cfd855d9 --- /dev/null +++ b/src/ops/add/cuda/add_cuda.cu @@ -0,0 +1,66 @@ +#include "add_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include +#include + +namespace llaisys::ops::cuda { + +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + c[idx] = a[idx] + b[idx]; +} + +__global__ void add_kernel_fp16(__half *c, const __half *a, const __half *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + c[idx] = __hadd(a[idx], b[idx]); +} + +__global__ void add_kernel_bf16(__nv_bfloat16 *c, const __nv_bfloat16 *a, const __nv_bfloat16 *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } +#if __CUDA_ARCH__ >= 800 + c[idx] = __hadd(a[idx], b[idx]); +#else + c[idx] = __float2bfloat16(__bfloat162float(a[idx]) + __bfloat162float(b[idx])); +#endif +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t dtype, size_t numel) { + dim3 block(256); + dim3 grid((numel + block.x - 1) / block.x); + + switch (dtype) { + case LLAISYS_DTYPE_F32: + add_kernel<<>>(reinterpret_cast(c), reinterpret_cast(a), + reinterpret_cast(b), numel); + break; + case LLAISYS_DTYPE_F16: + add_kernel_fp16<<>>(reinterpret_cast<__half *>(c), reinterpret_cast(a), + reinterpret_cast(b), numel); + break; + case LLAISYS_DTYPE_BF16: + add_kernel_bf16<<>>(reinterpret_cast<__nv_bfloat16 *>(c), + reinterpret_cast(a), + reinterpret_cast(b), 
numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } + + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "add cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/add/cuda/add_cuda.cuh b/src/ops/add/cuda/add_cuda.cuh new file mode 100644 index 00000000..9802a736 --- /dev/null +++ b/src/ops/add/cuda/add_cuda.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t dtype, size_t numel); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d..c39021ec 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,9 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/add_cuda.cuh" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,7 +28,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); + return cuda::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); return; #endif default: diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 00000000..e1e97e88 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,45 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void argmax_(int64_t *mi, T *mv, const T *v, size_t numel) { + mi[0]=0; + if constexpr (std::is_same_v || std::is_same_v) { + mv[0] = llaisys::utils::cast(llaisys::utils::cast(v[0])); + + for (size_t i = 1; i < numel; i++) { + if(llaisys::utils::cast(mv[0]) < llaisys::utils::cast(v[i])){ + mi[0] = static_cast(i); + mv[0] = v[i]; + } + } + + }else{ + mv[0] = v[0]; + for (size_t i = 1; i < numel; i++) { + if(mv[0] < v[i]){ + mi[0] = static_cast(i); + mv[0] = v[i]; + 
} + } + } +} +namespace llaisys::ops::cpu { +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(reinterpret_cast(mi), reinterpret_cast(mv), reinterpret_cast(v), numel); + case LLAISYS_DTYPE_BF16: + return argmax_(reinterpret_cast(mi), reinterpret_cast(mv), + reinterpret_cast(v), numel); + case LLAISYS_DTYPE_F16: + return argmax_(reinterpret_cast(mi), reinterpret_cast(mv), + reinterpret_cast(v), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 00000000..3ba45392 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/argmax/cuda/argmax_cuda.cu b/src/ops/argmax/cuda/argmax_cuda.cu new file mode 100644 index 00000000..d0ddec2a --- /dev/null +++ b/src/ops/argmax/cuda/argmax_cuda.cu @@ -0,0 +1,35 @@ +#include "argmax_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void argmax_kernel(int64_t *mi, void *mv, const void *v, llaisysDataType_t type, size_t numel) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + size_t best_idx = 0; + float best_val = load_as_float(v, 0, type); + for (size_t i = 1; i < numel; ++i) { + float cur = load_as_float(v, i, type); + if (cur > best_val) { + best_val = cur; + best_idx = i; + } + } + + mi[0] = static_cast(best_idx); + store_from_float(mv, 0, best_val, type); +} + +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t numel) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || 
type == LLAISYS_DTYPE_BF16, + "argmax cuda only supports F32/F16/BF16"); + argmax_kernel<<<1, 1>>>(reinterpret_cast(mi), mv, v, type, numel); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "argmax cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/argmax/cuda/argmax_cuda.cuh b/src/ops/argmax/cuda/argmax_cuda.cuh new file mode 100644 index 00000000..fc733aec --- /dev/null +++ b/src/ops/argmax/cuda/argmax_cuda.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..1c78dcc9 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,56 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/argmax_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/argmax_cuda.cuh" +#endif namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + + // 验证vals是1D张量 + ASSERT(vals->shape().size() == 1, "Argmax: vals must be a 1D tensor"); + + //验证vals连续性 + ASSERT(vals->isContiguous(),"Argmax: vals must be contiguous."); + + // 验证max_idx是包含单个元素的1D张量 + ASSERT(max_idx->shape().size() == 1 && max_idx->shape()[0] == 1, + "Argmax: max_idx must be a 1D tensor with single element"); + + // 验证max_val是包含单个元素的1D张量 + ASSERT(max_val->shape().size() == 1 && max_val->shape()[0] == 1, + "Argmax: max_val must be a 1D tensor with single element"); + + // 验证max_idx是Int64类型 + ASSERT(max_idx->dtype() == LLAISYS_DTYPE_I64, + "Argmax: max_idx must be Int64 type"); + + // 验证max_val类型与vals相同 + ASSERT(max_val->dtype() == vals->dtype(), + "Argmax: max_val must have same type as vals"); + + // always support cpu calculation + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + 
return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 00000000..3400ab62 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,28 @@ +#include "embedding_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void embedding_(T *out, const int64_t *index, const T *weight, size_t num , size_t dim ) { + for(size_t i = 0;i < num; i++) + for(size_t j = 0;j < dim ; j++) + out[i * dim + j] = weight[index[i] * dim + j]; +} +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num , size_t dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), num , dim); + case LLAISYS_DTYPE_BF16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num , dim); + case LLAISYS_DTYPE_F16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num , dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 00000000..29788657 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 
+1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num , size_t dim); +} \ No newline at end of file diff --git a/src/ops/embedding/cuda/embedding_cuda.cu b/src/ops/embedding/cuda/embedding_cuda.cu new file mode 100644 index 00000000..b936ff18 --- /dev/null +++ b/src/ops/embedding/cuda/embedding_cuda.cu @@ -0,0 +1,36 @@ +#include "embedding_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void embedding_kernel(void *out, const int64_t *index, const void *weight, llaisysDataType_t dtype, size_t num, + size_t dim) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = num * dim; + if (idx >= total) { + return; + } + + size_t row = idx / dim; + size_t col = idx % dim; + int64_t token = index[row]; + size_t src_idx = static_cast(token) * dim + col; + float val = load_as_float(weight, src_idx, dtype); + store_from_float(out, idx, val, dtype); +} + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t dtype, size_t num, + size_t dim) { + ASSERT(dtype == LLAISYS_DTYPE_F32 || dtype == LLAISYS_DTYPE_F16 || dtype == LLAISYS_DTYPE_BF16, + "embedding cuda only supports F32/F16/BF16"); + size_t total = num * dim; + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + embedding_kernel<<>>(out, reinterpret_cast(index), weight, dtype, num, dim); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "embedding cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/embedding/cuda/embedding_cuda.cuh b/src/ops/embedding/cuda/embedding_cuda.cuh new file mode 100644 index 00000000..bb0d7556 --- /dev/null +++ b/src/ops/embedding/cuda/embedding_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace 
llaisys::ops::cuda { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t dtype, size_t num, + size_t dim); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..ce471fca 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,70 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/embedding_cpu.hpp" +#include +#ifdef ENABLE_NVIDIA_API +#include "cuda/embedding_cuda.cuh" +#endif namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); -} + CHECK_SAME_DEVICE(out, index, weight); + + ASSERT(index->shape().size() == 1, "Embedding: index must be 1D tensor"); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "embedding: index must be Int64 type"); + + + ASSERT(weight->shape().size() == 2, "Embedding: weight must be 2D tensor"); + + + ASSERT(out->shape().size() == 2, "Embedding: out must be 2D tensor"); + + + ASSERT(out->shape()[0] == index->shape()[0], "Embedding: out and index first dimension must match"); + ASSERT(out->shape()[1] == weight->shape()[1], "Embedding: out and weight second dimension must match"); + + size_t vocab_size = weight->shape()[0]; + std::vector host_index(index->numel()); + if (index->deviceType() == LLAISYS_DEVICE_CPU) { + auto index_data = reinterpret_cast(index->data()); + for (size_t i = 0; i < index->numel(); i++) { + host_index[i] = index_data[i]; + } + } else { + llaisys::core::context().setDevice(index->deviceType(), index->deviceId()); + llaisys::core::context().runtime().api()->memcpy_sync( + host_index.data(), + index->data(), + index->numel() * sizeof(int64_t), + LLAISYS_MEMCPY_D2H); + } + + for (size_t i = 0; i < host_index.size(); i++) { + ASSERT(host_index[i] >= 0 && static_cast(host_index[i]) < vocab_size, + "Embedding: index out of bounds"); + } + + // always support cpu calculation + if (out->deviceType() == LLAISYS_DEVICE_CPU) { 
+ return cpu::embedding(out->data(), index->data(), weight->data(), + out->dtype(), index->numel(), weight->shape()[1]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), + out->dtype(), index->numel(), weight->shape()[1]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::embedding(out->data(), index->data(), weight->data(), out->dtype(), index->numel(), weight->shape()[1]); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} + } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 00000000..2b1b19e3 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,75 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, + size_t batch_size, size_t in_features, size_t out_features) { + + for (size_t b = 0; b < batch_size; b++) { + for (size_t o = 0; o < out_features; o++) { + if constexpr (std::is_same_v || std::is_same_v) { + float sum_float = 0.0f; + + for (size_t i = 0; i < in_features; i++) { + float x_float = llaisys::utils::cast(in[b * in_features + i]); + float w_float = llaisys::utils::cast(weight[o * in_features + i]); + sum_float += x_float * w_float; + } + + if (bias != nullptr) { + sum_float += llaisys::utils::cast(bias[o]); + } + + out[b * out_features + o] = llaisys::utils::cast(sum_float); + } else { + + T sum = T(0); + + for (size_t i = 0; i < in_features; i++) { + sum += in[b * in_features + i] * weight[o * in_features + i]; + } + + if (bias != nullptr) { + sum += bias[o]; + } + + out[b * out_features + o] = sum; + } + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t 
batch_size, size_t in_features, size_t out_features) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + bias ? reinterpret_cast(bias) : nullptr, + batch_size, in_features, out_features + ); + case LLAISYS_DTYPE_BF16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + bias ? reinterpret_cast(bias) : nullptr, + batch_size, in_features, out_features + ); + case LLAISYS_DTYPE_F16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + bias ? reinterpret_cast(bias) : nullptr, + batch_size, in_features, out_features + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +}//// namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 00000000..fb491ccd --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,10 @@ + +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features); +} \ No newline at end of file diff --git a/src/ops/linear/cuda/linear_cuda.cu b/src/ops/linear/cuda/linear_cuda.cu new file mode 100644 index 00000000..e3ceee00 --- /dev/null +++ b/src/ops/linear/cuda/linear_cuda.cu @@ -0,0 +1,43 @@ +#include "linear_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void linear_kernel(void *out, const void *in, const void *weight, const void *bias, llaisysDataType_t type, + size_t batch_size, size_t in_features, size_t out_features, bool has_bias) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch_size * out_features; + if (idx >= total) { + return; + } + + size_t b 
= idx / out_features; + size_t o = idx % out_features; + + float sum = 0.0f; + for (size_t i = 0; i < in_features; ++i) { + float x = load_as_float(in, b * in_features + i, type); + float w = load_as_float(weight, o * in_features + i, type); + sum += x * w; + } + if (has_bias) { + sum += load_as_float(bias, o, type); + } + store_from_float(out, idx, sum, type); +} + +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t batch_size, size_t in_features, size_t out_features) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + "linear cuda only supports F32/F16/BF16"); + size_t total = batch_size * out_features; + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + linear_kernel<<>>(out, in, weight, bias, type, batch_size, in_features, out_features, bias != nullptr); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "linear cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/linear/cuda/linear_cuda.cuh b/src/ops/linear/cuda/linear_cuda.cuh new file mode 100644 index 00000000..c46077ba --- /dev/null +++ b/src/ops/linear/cuda/linear_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t batch_size, size_t in_features, size_t out_features); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..ba6e9222 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,75 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/linear_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/linear_cuda.cuh" +#endif namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + 
CHECK_SAME_DEVICE(out, in, weight); + if (bias) { + CHECK_SAME_DEVICE(out, bias); + } + + // Check data types + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + if (bias) { + CHECK_SAME_DTYPE(out->dtype(), bias->dtype()); + } + + // Check contiguity + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "Linear: input tensors must be contiguous"); + if (bias) { + ASSERT(bias->isContiguous(), "Linear: bias must be contiguous"); + } + + // Check dimensions + ASSERT(in->ndim() == 2, "Linear: input must be 2D tensor"); + ASSERT(weight->ndim() == 2, "Linear: weight must be 2D tensor"); + ASSERT(out->ndim() == 2, "Linear: output must be 2D tensor"); + if (bias) { + ASSERT(bias->ndim() == 1, "Linear: bias must be 1D tensor"); + } + + // Get dimensions + size_t batch_size = in->shape()[0]; + size_t in_features = in->shape()[1]; + size_t out_features = weight->shape()[0]; + + // Check shapes are compatible + ASSERT(weight->shape()[1] == in_features, "Linear: weight input dimension must match input features"); + ASSERT(out->shape()[0] == batch_size, "Linear: output batch size must match input batch size"); + ASSERT(out->shape()[1] == out_features, "Linear: output features must match weight output features"); + if (bias) { + ASSERT(bias->shape()[0] == out_features, "Linear: bias size must match output features"); + } + + // always support cpu calculation + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear(out->data(), in->data(), weight->data(), + bias ? bias->data() : nullptr, + out->dtype(), batch_size, in_features, out_features); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), weight->data(), + bias ? 
bias->data() : nullptr, + out->dtype(), batch_size, in_features, out_features); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::linear(out->data(), in->data(), weight->data(), bias ? bias->data() : nullptr, + out->dtype(), batch_size, in_features, out_features); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/random_sample/cpu/random_sample_cpu.cpp b/src/ops/random_sample/cpu/random_sample_cpu.cpp new file mode 100644 index 00000000..b43a84a0 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.cpp @@ -0,0 +1,109 @@ +#include "random_sample_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include + +template +void random_sample_(int64_t *sample_idx, T *sample_val, const T *logits, size_t vocab_size, float temperature, + int top_k, float top_p, int64_t seed) { + if (top_p <= 0.0f) { + top_k = 1; + } + + std::vector scaled_logits(vocab_size); + float inv_temperature = 1.0f / std::max(temperature, 1e-6f); + + for (size_t i = 0; i < vocab_size; ++i) { + scaled_logits[i] = llaisys::utils::cast(logits[i]) * inv_temperature; + } + + std::vector sorted_indices(vocab_size); + std::iota(sorted_indices.begin(), sorted_indices.end(), 0); + std::sort(sorted_indices.begin(), sorted_indices.end(), + [&](size_t a, size_t b) { return scaled_logits[a] > scaled_logits[b]; }); + + if (top_k <= 0 || static_cast(top_k) > vocab_size) { + top_k = static_cast(vocab_size); + } + sorted_indices.resize(static_cast(top_k)); + + std::vector candidate_probs(sorted_indices.size()); + float max_logit = scaled_logits[sorted_indices[0]]; + float sum_exp = 0.0f; + for (size_t i = 0; i < sorted_indices.size(); ++i) { + float p = std::exp(scaled_logits[sorted_indices[i]] - max_logit); + candidate_probs[i] = p; + sum_exp += p; + } + for (float &p : candidate_probs) { + p /= sum_exp; + } + + float top_p_clamped = std::min(1.0f, std::max(0.0f, top_p)); + size_t 
keep_count = sorted_indices.size(); + if (top_p_clamped > 0.0f && top_p_clamped < 1.0f) { + float cumulative = 0.0f; + keep_count = 0; + for (size_t i = 0; i < candidate_probs.size(); ++i) { + cumulative += candidate_probs[i]; + keep_count = i + 1; + if (cumulative >= top_p_clamped) { + break; + } + } + keep_count = std::max(1, keep_count); + } + + sorted_indices.resize(keep_count); + candidate_probs.resize(keep_count); + + float renorm_sum = 0.0f; + for (float p : candidate_probs) { + renorm_sum += p; + } + for (float &p : candidate_probs) { + p /= renorm_sum; + } + + std::discrete_distribution distribution(candidate_probs.begin(), candidate_probs.end()); + size_t sampled_local_idx = 0; + if (seed >= 0) { + std::mt19937_64 generator(static_cast(seed)); + sampled_local_idx = distribution(generator); + } else { + static thread_local std::mt19937_64 generator(std::random_device{}()); + sampled_local_idx = distribution(generator); + } + size_t sampled_vocab_idx = sorted_indices[sampled_local_idx]; + + sample_idx[0] = static_cast(sampled_vocab_idx); + sample_val[0] = logits[sampled_vocab_idx]; +} + +namespace llaisys::ops::cpu { +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, float top_p, int64_t seed) { + switch (type) { + case LLAISYS_DTYPE_F32: + return random_sample_(reinterpret_cast(sample_idx), reinterpret_cast(sample_val), + reinterpret_cast(logits), vocab_size, temperature, top_k, top_p, seed); + case LLAISYS_DTYPE_BF16: + return random_sample_(reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(logits), vocab_size, temperature, top_k, + top_p, seed); + case LLAISYS_DTYPE_F16: + return random_sample_(reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(logits), vocab_size, temperature, top_k, + top_p, seed); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace 
llaisys::ops::cpu diff --git a/src/ops/random_sample/cpu/random_sample_cpu.hpp b/src/ops/random_sample/cpu/random_sample_cpu.hpp new file mode 100644 index 00000000..88fc3bcc --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, float top_p, int64_t seed); +} diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cu b/src/ops/random_sample/cuda/random_sample_cuda.cu new file mode 100644 index 00000000..07883cd1 --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample_cuda.cu @@ -0,0 +1,155 @@ +#include "random_sample_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace llaisys::ops::cuda { + +__global__ void init_indices_kernel(int64_t *indices, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + indices[idx] = static_cast(idx); +} + +__global__ void scale_logits_kernel(float *scaled, const void *logits, llaisysDataType_t type, size_t vocab_size, + float inv_temperature) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= vocab_size) { + return; + } + float x = load_as_float(logits, idx, type); + scaled[idx] = x * inv_temperature; +} + +__global__ void random_sample_kernel(int64_t *sample_idx, void *sample_val, const void *logits, + const float *sorted_scaled_logits, const int64_t *sorted_indices, + llaisysDataType_t type, size_t top_k, float top_p, + unsigned long long seed) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + if (top_k == 0) { + sample_idx[0] = 0; + store_from_float(sample_val, 0, load_as_float(logits, 0, type), type); + return; + } + + top_p = fminf(1.0f, 
fmaxf(0.0f, top_p)); + if (top_p <= 0.0f) { + top_k = 1; + top_p = 1.0f; + } + + const float max_logit = sorted_scaled_logits[0]; + float sum_exp = 0.0f; + for (size_t i = 0; i < top_k; ++i) { + sum_exp += expf(sorted_scaled_logits[i] - max_logit); + } + + size_t keep_count = top_k; + if (top_p < 1.0f) { + float cumulative = 0.0f; + keep_count = 0; + for (size_t i = 0; i < top_k; ++i) { + float p = expf(sorted_scaled_logits[i] - max_logit) / sum_exp; + cumulative += p; + keep_count = i + 1; + if (cumulative >= top_p) { + break; + } + } + if (keep_count == 0) { + keep_count = 1; + } + } + + float renorm = 0.0f; + for (size_t i = 0; i < keep_count; ++i) { + renorm += expf(sorted_scaled_logits[i] - max_logit) / sum_exp; + } + + curandStatePhilox4_32_10_t rng_state; + curand_init(seed, 0ULL, 0ULL, &rng_state); + float r = curand_uniform(&rng_state); + + float running = 0.0f; + size_t sampled_i = keep_count - 1; + for (size_t i = 0; i < keep_count; ++i) { + float p = (expf(sorted_scaled_logits[i] - max_logit) / sum_exp) / renorm; + running += p; + if (r <= running) { + sampled_i = i; + break; + } + } + + const int64_t sampled_vocab_idx = sorted_indices[sampled_i]; + sample_idx[0] = sampled_vocab_idx; + const float sampled_val = load_as_float(logits, static_cast(sampled_vocab_idx), type); + store_from_float(sample_val, 0, sampled_val, type); +} + +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, float top_p, int64_t seed) { + static std::atomic seed_counter{0}; + unsigned long long sample_seed = 0; + if (seed >= 0) { + sample_seed = static_cast(seed); + } else { + sample_seed = + static_cast( + std::chrono::high_resolution_clock::now().time_since_epoch().count()) ^ + (seed_counter.fetch_add(1, std::memory_order_relaxed) + 0x9e3779b97f4a7c15ULL); + } + + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + 
"random_sample cuda only supports F32/F16/BF16"); + + const float inv_temperature = 1.0f / fmaxf(temperature, 1e-6f); + size_t k = (top_k <= 0 || static_cast(top_k) > vocab_size) ? vocab_size : static_cast(top_k); + + float *d_scaled = nullptr; + int64_t *d_indices = nullptr; + auto err = cudaMalloc(&d_scaled, sizeof(float) * vocab_size); + ASSERT(err == cudaSuccess, "random_sample cuda malloc d_scaled failed"); + err = cudaMalloc(&d_indices, sizeof(int64_t) * vocab_size); + ASSERT(err == cudaSuccess, "random_sample cuda malloc d_indices failed"); + + dim3 block(256); + dim3 grid((vocab_size + block.x - 1) / block.x); + init_indices_kernel<<>>(d_indices, vocab_size); + err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "random_sample init_indices kernel launch failed"); + + scale_logits_kernel<<>>(d_scaled, logits, type, vocab_size, inv_temperature); + err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "random_sample scale_logits kernel launch failed"); + + thrust::device_ptr scaled_ptr(d_scaled); + thrust::device_ptr indices_ptr(d_indices); + thrust::sort_by_key(thrust::device, scaled_ptr, scaled_ptr + vocab_size, indices_ptr, thrust::greater()); + + random_sample_kernel<<<1, 1>>>(reinterpret_cast(sample_idx), sample_val, logits, d_scaled, d_indices, + type, k, top_p, sample_seed); + err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "random_sample sample kernel launch failed"); + + cudaFree(d_scaled); + cudaFree(d_indices); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cuh b/src/ops/random_sample/cuda/random_sample_cuda.cuh new file mode 100644 index 00000000..d3e2ec5a --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, 
float top_p, int64_t seed); +} diff --git a/src/ops/random_sample/op.cpp b/src/ops/random_sample/op.cpp new file mode 100644 index 00000000..540bf698 --- /dev/null +++ b/src/ops/random_sample/op.cpp @@ -0,0 +1,51 @@ +#include "op.hpp" + +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/random_sample_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/random_sample_cuda.cuh" +#endif + +namespace llaisys::ops { +void random_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t logits, float temperature, int top_k, float top_p, + int64_t seed) { + CHECK_SAME_DEVICE(sample_idx, sample_val, logits); + CHECK_SAME_DTYPE(sample_val->dtype(), logits->dtype()); + + ASSERT(logits->ndim() == 1, "RandomSample: logits must be a 1D tensor"); + ASSERT(logits->numel() > 0, "RandomSample: logits cannot be empty"); + ASSERT(sample_idx->ndim() == 1 && sample_idx->shape()[0] == 1, + "RandomSample: sample_idx must be a 1D tensor with single element"); + ASSERT(sample_val->ndim() == 1 && sample_val->shape()[0] == 1, + "RandomSample: sample_val must be a 1D tensor with single element"); + + ASSERT(logits->isContiguous() && sample_idx->isContiguous() && sample_val->isContiguous(), + "RandomSample: tensors must be contiguous"); + ASSERT(sample_idx->dtype() == LLAISYS_DTYPE_I64, "RandomSample: sample_idx must be Int64 type"); + ASSERT(logits->dtype() == LLAISYS_DTYPE_F32 || logits->dtype() == LLAISYS_DTYPE_F16 || logits->dtype() == LLAISYS_DTYPE_BF16, + "RandomSample: logits dtype must be Float32/Float16/BFloat16"); + + if (logits->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::random_sample(sample_idx->data(), sample_val->data(), logits->data(), logits->dtype(), logits->numel(), + temperature, top_k, top_p, seed); + } + + llaisys::core::context().setDevice(logits->deviceType(), logits->deviceId()); + + switch (logits->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::random_sample(sample_idx->data(), sample_val->data(), logits->data(), 
logits->dtype(), logits->numel(), + temperature, top_k, top_p, seed); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::random_sample(sample_idx->data(), sample_val->data(), logits->data(), logits->dtype(), logits->numel(), + temperature, top_k, top_p, seed); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} +} // namespace llaisys::ops diff --git a/src/ops/random_sample/op.hpp b/src/ops/random_sample/op.hpp new file mode 100644 index 00000000..81025545 --- /dev/null +++ b/src/ops/random_sample/op.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include "../../tensor/tensor.hpp" + +namespace llaisys::ops { +void random_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t logits, float temperature, int top_k, float top_p, + int64_t seed); +} diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 00000000..0c59afbd --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,70 @@ +#include "rms_norm_cpu.hpp" + +#include "../../../utils.hpp" + +#include +template +void rms_norm_(T *out, const T *in, const T *weight, + size_t batch_size, size_t feature_dim, float eps) { + if constexpr (std::is_same_v || std::is_same_v) + for(size_t i = 0 ; i < batch_size ; i++){ + float sum = 0.0f; + auto base = in + i * feature_dim ; + auto baseout = out +i *feature_dim ; + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = llaisys::utils::cast(base[j]); + sum += u*u; + } + float factor = std::sqrt(sum / static_cast(feature_dim) +eps); + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = llaisys::utils::cast(base[j]); + baseout[j] = llaisys::utils::cast( llaisys::utils::cast(weight[j])*u / factor); + } + }else{ + for(size_t i = 0 ; i < batch_size ; i++){ + float sum = 0.0f; + auto base = in + i * feature_dim ; + auto baseout = out +i *feature_dim ; + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = static_cast(base[j]); + sum += u*u; + } + float factor = std::sqrt(sum / 
static_cast(feature_dim) +eps); + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = static_cast(base[j]); + baseout[j] = static_cast(static_cast(weight[j])*u / factor ); + } + } + } + +} +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t batch_size, size_t feature_dim, float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + batch_size, feature_dim, eps + ); + case LLAISYS_DTYPE_BF16: + return rms_norm_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + batch_size, feature_dim, eps + ); + case LLAISYS_DTYPE_F16: + return rms_norm_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + batch_size, feature_dim, eps + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + } +} \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 00000000..d4a5fa07 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t batch_size, size_t feature_dim, float eps); +} \ No newline at end of file diff --git a/src/ops/rms_norm/cuda/rms_norm_cuda.cu b/src/ops/rms_norm/cuda/rms_norm_cuda.cu new file mode 100644 index 00000000..acd56845 --- /dev/null +++ b/src/ops/rms_norm/cuda/rms_norm_cuda.cu @@ -0,0 +1,48 @@ +#include "rms_norm_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void rms_norm_kernel(void *out, const void *in, const void *weight, llaisysDataType_t type, size_t feature_dim, + float eps) { + size_t row = blockIdx.x; + extern __shared__ float sdata[]; + + 
float local_sum = 0.0f; + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float val = load_as_float(in, row * feature_dim + col, type); + local_sum += val * val; + } + sdata[threadIdx.x] = local_sum; + __syncthreads(); + + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + __syncthreads(); + } + + float inv_rms = rsqrtf(sdata[0] / static_cast<float>(feature_dim) + eps); + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float x = load_as_float(in, row * feature_dim + col, type); + float w = load_as_float(weight, col, type); + store_from_float(out, row * feature_dim + col, x * w * inv_rms, type); + } +} + +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, llaisysDataType_t type, size_t batch_size, + size_t feature_dim, float eps) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + "rms_norm cuda only supports F32/F16/BF16"); + dim3 block(256); + dim3 grid(batch_size); + size_t smem = block.x * sizeof(float); + rms_norm_kernel<<<grid, block, smem>>>(out, in, weight, type, feature_dim, eps); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "rms_norm cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/rms_norm/cuda/rms_norm_cuda.cuh b/src/ops/rms_norm/cuda/rms_norm_cuda.cuh new file mode 100644 index 00000000..57180f82 --- /dev/null +++ b/src/ops/rms_norm/cuda/rms_norm_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include <cstddef> + +namespace llaisys::ops::cuda { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, llaisysDataType_t type, size_t batch_size, + size_t feature_dim, float eps); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..327526db 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,54 @@ #include "op.hpp" 
+#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/rms_norm_cuda.cuh" +#endif + namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + + + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + + + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "RMS Norm: all tensors must be contiguous"); + + + ASSERT(in->ndim() == 2, "RMS Norm: input must be 2D tensor"); + ASSERT(weight->ndim() == 1, "RMS Norm: weight must be 1D tensor"); + ASSERT(out->ndim() == 2, "RMS Norm: output must be 2D tensor"); + + + size_t batch_size = in->shape()[0]; + size_t feature_dim = in->shape()[1]; + ASSERT(out->shape()[0] == batch_size, "RMS Norm: output batch size must match input"); + ASSERT(out->shape()[1] == feature_dim, "RMS Norm: output feature dim must match input"); + ASSERT(weight->shape()[0] == feature_dim, "RMS Norm: weight size must match feature dim"); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), + out->dtype(), batch_size, feature_dim, eps); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), + out->dtype(), batch_size, feature_dim, eps); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch_size, feature_dim, eps); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 00000000..195fd12a --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,88 @@ +#include "rope_cpu.hpp" + +#include 
"../../../utils.hpp" + +#include +#include + + +template +void rope_(T *out, const T *in, const int64_t *pos_ids, + size_t seq_len, size_t n_heads, size_t head_dim, float theta) { + auto half_dim = head_dim / 2; + if constexpr (std::is_same_v || std::is_same_v) { + for(size_t s = 0 ; s < seq_len ; s++){ + + auto position = pos_ids[s]; + + for(size_t h = 0 ; h < n_heads ; h++){ + + auto in_a = in + s* n_heads * head_dim + h * head_dim ; + auto in_b = in + s* n_heads * head_dim + h * head_dim + half_dim; + auto out_a = out + s* n_heads * head_dim + h * head_dim; + auto out_b = out + s* n_heads * head_dim + h * head_dim + half_dim; + + for(size_t d = 0 ; d < half_dim ; d++){ + + float phi = position /(std::pow(theta , 2.0f * static_cast(d) /static_cast(head_dim) )) ; + float cosval = std::cos(phi); + float sinval = std::sin(phi); + out_a[d] = llaisys::utils::cast(llaisys::utils::cast(in_a[d]) *cosval - llaisys::utils::cast(in_b[d]) * sinval); + out_b[d] = llaisys::utils::cast(llaisys::utils::cast(in_b[d]) *cosval + llaisys::utils::cast(in_a[d]) * sinval); + } + } + } + } else{ + for(size_t s = 0 ; s < seq_len ; s++){ + + auto position = pos_ids[s]; + + for(size_t h = 0 ; h < n_heads ; h++){ + + auto in_a = in + s* n_heads * head_dim + h * head_dim ; + auto in_b = in + s* n_heads * head_dim + h * head_dim + half_dim; + auto out_a = out + s* n_heads * head_dim + h * head_dim; + auto out_b = out + s* n_heads * head_dim + h * head_dim + half_dim; + + for(size_t d = 0 ; d < half_dim ; d++){ + + float phi = position /(std::pow(theta , 2.0f * static_cast(d) /static_cast(head_dim) )) ; + float cosval = std::cos(phi); + float sinval = std::sin(phi); + out_a[d] = static_cast(static_cast(in_a[d]) *cosval - static_cast(in_b[d]) * sinval); + out_b[d] = static_cast(static_cast(in_b[d]) *cosval + static_cast(in_a[d]) * sinval); + } + } + } + } +} +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t 
seq_len, size_t n_heads, size_t head_dim, float theta) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seq_len, n_heads, head_dim, theta + ); + case LLAISYS_DTYPE_BF16: + return rope_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seq_len, n_heads, head_dim, theta + ); + case LLAISYS_DTYPE_F16: + return rope_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seq_len, n_heads, head_dim, theta + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 00000000..705c248d --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim, float theta); +} \ No newline at end of file diff --git a/src/ops/rope/cuda/rope_cuda.cu b/src/ops/rope/cuda/rope_cuda.cu new file mode 100644 index 00000000..ddc4a5c9 --- /dev/null +++ b/src/ops/rope/cuda/rope_cuda.cu @@ -0,0 +1,49 @@ +#include "rope_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include + +namespace llaisys::ops::cuda { + +__global__ void rope_kernel(void *out, const void *in, const int64_t *pos_ids, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, float theta) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t half_dim = head_dim / 2; + size_t total = seq_len * n_heads * half_dim; + if (idx >= total) { + return; + } + + size_t d = idx % half_dim; + size_t h = (idx / half_dim) % n_heads; + size_t s = idx / (half_dim * n_heads); + + float pos = static_cast(pos_ids[s]); + float phi = pos / powf(theta, 2.0f * static_cast(d) / 
static_cast<float>(head_dim)); + float c = cosf(phi); + float sn = sinf(phi); + + size_t base = (s * n_heads + h) * head_dim; + float a = load_as_float(in, base + d, type); + float b = load_as_float(in, base + half_dim + d, type); + + store_from_float(out, base + d, a * c - b * sn, type); + store_from_float(out, base + half_dim + d, b * c + a * sn, type); +} + +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, float theta) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + "rope cuda only supports F32/F16/BF16"); + size_t total = seq_len * n_heads * (head_dim / 2); + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + rope_kernel<<<grid, block>>>(out, in, reinterpret_cast<const int64_t *>(pos_ids), type, seq_len, n_heads, head_dim, + theta); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "rope cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/rope/cuda/rope_cuda.cuh b/src/ops/rope/cuda/rope_cuda.cuh new file mode 100644 index 00000000..38e3aae8 --- /dev/null +++ b/src/ops/rope/cuda/rope_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include <cstddef> + +namespace llaisys::ops::cuda { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, float theta); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..02ced154 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,59 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/rope_cuda.cuh" +#endif + namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + + CHECK_SAME_DTYPE(out->dtype(), 
in->dtype()); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be int64"); + + + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), + "RoPE: all tensors must be contiguous"); + + + ASSERT(in->ndim() == 3, "RoPE: input must be 3D tensor [seq_len, n_heads, head_dim]"); + ASSERT(out->ndim() == 3, "RoPE: output must be 3D tensor [seq_len, n_heads, head_dim]"); + ASSERT(pos_ids->ndim() == 1, "RoPE: pos_ids must be 1D tensor [seq_len]"); + + + size_t seq_len = in->shape()[0]; + size_t n_heads = in->shape()[1]; + size_t head_dim = in->shape()[2]; + + ASSERT(out->shape()[0] == seq_len, "RoPE: output seq_len must match input"); + ASSERT(out->shape()[1] == n_heads, "RoPE: output n_heads must match input"); + ASSERT(out->shape()[2] == head_dim, "RoPE: output head_dim must match input"); + ASSERT(pos_ids->shape()[0] == seq_len, "RoPE: pos_ids length must match seq_len"); + ASSERT(head_dim % 2 == 0, "RoPE: head_dim must be even"); + + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), pos_ids->data(), + out->dtype(), seq_len, n_heads, head_dim, theta); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), pos_ids->data(), + out->dtype(), seq_len, n_heads, head_dim, theta); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::rope(out->data(), in->data(), pos_ids->data(), out->dtype(), seq_len, n_heads, head_dim, theta); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 00000000..7b46a057 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,136 @@ +#include "self_attention_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include 
+#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + size_t seqlen, size_t totlen, size_t nh, size_t nkvh, size_t d, size_t dv, float scale) { + auto shared = nh/nkvh ; + if constexpr( std::is_same_v || std::is_same_v) + for(size_t s = 0 ; s(baseq[d_])*llaisys::utils::cast(basek[d_]); + } + attn[t] *= scale; + if(t > s + totlen - seqlen){ + attn[t] = -std::numeric_limits::infinity(); + } + } + float max_attn = -std::numeric_limits::infinity(); + for (size_t t= 0; t < totlen; t++) { + if (attn[t] > max_attn) { + max_attn = attn[t]; + } + } + float sum_exp = 0.0f; + for (size_t t = 0; t < totlen; t++) { + attn[t] = std::exp(attn[t] - max_attn); + sum_exp += attn[t]; + } + for (size_t t = 0; t < totlen; t++) { + attn[t] /= sum_exp; + } + for (size_t dv_ = 0;dv_ < dv;dv_++){ + float val = 0; + for(size_t t = 0; t < totlen; t++){ + auto basev = v+ t* nkvh * dv + kvhead * dv; + val+=llaisys::utils::cast( basev[dv_])*attn[t]; + } + baseout[dv_]=llaisys::utils::cast(val); + } + delete[]attn; + } + } + else{ + for(size_t s = 0 ; s(baseq[d_])*static_cast(basek[d_]); + } + attn[t]*= scale; + if(t > s + totlen - seqlen){ + attn[t] = -std::numeric_limits::infinity(); + } + } + float max_attn = -std::numeric_limits::infinity(); + for (size_t t= 0; t < totlen; t++) { + if (attn[t] > max_attn) { + max_attn = attn[t]; + } + } + float sum_exp = 0.0f; + for (size_t t = 0; t < totlen; t++) { + attn[t] = std::exp(attn[t] - max_attn); + sum_exp += attn[t]; + } + for (size_t t = 0; t < totlen; t++) { + attn[t] /= sum_exp; + } + for (size_t dv_ = 0;dv_ < dv;dv_++){ + float val = 0; + for(size_t t = 0; t < totlen; t++){ + auto basev = v+ t* nkvh * dv + kvhead * dv; + val+=static_cast( basev[dv_])*attn[t]; + } + baseout[dv_]=llaisys::utils::cast(val); + } + delete[]attn; + } + } + } + } + + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, 
size_t seqlen, size_t totlen, size_t nh, size_t nkvh, + size_t d, size_t dv , float scale) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, totlen, nh, nkvh, d, dv , scale + ); + case LLAISYS_DTYPE_BF16: + return self_attention_( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, totlen, nh, nkvh, d, dv , scale + ); + case LLAISYS_DTYPE_F16: + return self_attention_( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, totlen, nh, nkvh, d, dv , scale + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 00000000..f7b5be9a --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, + size_t d, size_t dv, float scale); +} diff --git a/src/ops/self_attention/cuda/self_attention_cuda.cu b/src/ops/self_attention/cuda/self_attention_cuda.cu new file mode 100644 index 00000000..be9dfa0a --- /dev/null +++ b/src/ops/self_attention/cuda/self_attention_cuda.cu @@ -0,0 +1,91 @@ +#include "self_attention_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include + +namespace llaisys::ops::cuda { + +__global__ void self_attention_kernel(void *attn_val, const void *q, const void *k, const void *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, + size_t d, size_t dv, float scale) { 
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = seqlen * nh * dv; + if (idx >= total) { + return; + } + + size_t dv_i = idx % dv; + size_t h = (idx / dv) % nh; + size_t s = idx / (dv * nh); + + size_t shared = nh / nkvh; + size_t kvh = h / shared; + + float max_score = -INFINITY; + for (size_t t = 0; t < totlen; ++t) { + float score = 0.0f; + size_t qbase = (s * nh + h) * d; + size_t kbase = (t * nkvh + kvh) * d; + for (size_t di = 0; di < d; ++di) { + score += load_as_float(q, qbase + di, type) * load_as_float(k, kbase + di, type); + } + score *= scale; + if (t > s + totlen - seqlen) { + score = -INFINITY; + } + if (score > max_score) { + max_score = score; + } + } + + float sum_exp = 0.0f; + for (size_t t = 0; t < totlen; ++t) { + float score = 0.0f; + size_t qbase = (s * nh + h) * d; + size_t kbase = (t * nkvh + kvh) * d; + for (size_t di = 0; di < d; ++di) { + score += load_as_float(q, qbase + di, type) * load_as_float(k, kbase + di, type); + } + score *= scale; + if (t > s + totlen - seqlen) { + score = -INFINITY; + } + sum_exp += expf(score - max_score); + } + + float out_val = 0.0f; + for (size_t t = 0; t < totlen; ++t) { + float score = 0.0f; + size_t qbase = (s * nh + h) * d; + size_t kbase = (t * nkvh + kvh) * d; + for (size_t di = 0; di < d; ++di) { + score += load_as_float(q, qbase + di, type) * load_as_float(k, kbase + di, type); + } + score *= scale; + if (t > s + totlen - seqlen) { + score = -INFINITY; + } + float p = expf(score - max_score) / sum_exp; + size_t vbase = (t * nkvh + kvh) * dv; + out_val += p * load_as_float(v, vbase + dv_i, type); + } + + store_from_float(attn_val, idx, out_val, type); +} + +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, size_t d, size_t dv, + float scale) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + 
"self_attention cuda only supports F32/F16/BF16"); + size_t total = seqlen * nh * dv; + dim3 block(128); + dim3 grid((total + block.x - 1) / block.x); + self_attention_kernel<<>>(attn_val, q, k, v, type, seqlen, totlen, nh, nkvh, d, dv, scale); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "self_attention cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/self_attention/cuda/self_attention_cuda.cuh b/src/ops/self_attention/cuda/self_attention_cuda.cuh new file mode 100644 index 00000000..cf2a90bc --- /dev/null +++ b/src/ops/self_attention/cuda/self_attention_cuda.cuh @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, size_t d, size_t dv, + float scale); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..3aed732c 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,68 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/self_attention_cuda.cuh" +#endif namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + + // Check data types + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + + // Check contiguity + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "Self-Attention: all tensors must be contiguous"); + + // Check dimensions + ASSERT(q->ndim() == 3, "Self-Attention: q must be 3D tensor [seqlen, nh, d]"); + ASSERT(k->ndim() == 3, "Self-Attention: k must be 3D tensor [totlen, nkvh, d]"); + 
ASSERT(v->ndim() == 3, "Self-Attention: v must be 3D tensor [totlen, nkvh, dv]"); + ASSERT(attn_val->ndim() == 3, "Self-Attention: attn_val must be 3D tensor [seqlen, nh, dv]"); + + // Get dimensions + size_t seqlen = q->shape()[0]; + size_t nh = q->shape()[1]; // number of query heads + size_t d = q->shape()[2]; // head dimension + size_t dv = v->shape()[2]; + size_t totlen = k->shape()[0]; // key/value sequence length + size_t nkvh = k->shape()[1]; // number of key/value heads + + // Check shapes are compatible + ASSERT(attn_val->shape()[0] == seqlen, "Self-Attention: attn_val seq_len must match q"); + ASSERT(attn_val->shape()[1] == nh, "Self-Attention: attn_val heads must match q"); + ASSERT(attn_val->shape()[2] == dv, "Self-Attention: attn_val head_dim must match v"); + + ASSERT(k->shape()[2] == d, "Self-Attention: k head_dim must match q"); + ASSERT(v->shape()[0] == totlen, "Self-Attention: v seq_len must match k"); + ASSERT(v->shape()[1] == nkvh, "Self-Attention: v heads must match k"); + + // Check Group Query Attention compatibility + ASSERT(nh % nkvh == 0, "Self-Attention: query heads must be divisible by key/value heads"); + + // always support cpu calculation + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, totlen, nh, nkvh, d,dv, scale); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, totlen, nh, nkvh, d,dv, scale); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::self_attention(attn_val->data(), q->data(), k->data(), v->data(), attn_val->dtype(), seqlen, + totlen, nh, nkvh, d, dv, scale); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git 
a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp
new file mode 100644
index 00000000..d2b452bd
--- /dev/null
+++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp
@@ -0,0 +1,52 @@
+#include "swiglu_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include
+
+// SwiGLU: out_i = up_i * silu(gate_i)
+// where silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
+// (the formula x / (1 + exp(-x)) is SiLU/Swish, not sigmoid, which is 1 / (1 + exp(-x)))
+template