diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3d31c23b..29c37472 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -7,14 +7,18 @@ on: - 'LICENSE' jobs: - build: - name: Build + build-cuda: + name: Build and test (CUDA, ${{ matrix.target }}) strategy: fail-fast: false matrix: - os: [windows-latest, ubuntu-latest] - type: [release] + include: + - target: ubuntu + os: ubuntu-latest runs-on: ${{ matrix.os }} + env: + TMPDIR: ${{ github.workspace }}/.tmp + PYTHONUTF8: "1" steps: - name: checkout code @@ -24,37 +28,144 @@ jobs: uses: xmake-io/github-action-setup-xmake@v1 with: xmake-version: latest - - - name: Xmake Build & Install - run: | - xmake + + - name: install cuda toolkit + uses: Jimver/cuda-toolkit@v0.2.24 + with: + cuda: "12.8.0" + + - name: prepare tmp (linux) + if: matrix.target == 'ubuntu' + run: | + mkdir -p "$TMPDIR" + shell: bash + + - name: prepare tmp (windows) + if: matrix.target == 'windows' + run: | + New-Item -ItemType Directory -Force -Path $env:TMPDIR | Out-Null + shell: pwsh + + - name: check toolchain (linux) + if: matrix.target == 'ubuntu' + run: | + command -v python + command -v pip + command -v xmake + command -v nvcc + python --version + pip --version + xmake --version + nvcc --version + shell: bash + + - name: check toolchain (windows) + if: matrix.target == 'windows' + run: | + Get-Command python + Get-Command pip + Get-Command xmake + if (-not $env:CUDA_PATH) { + $nvccCmd = Get-Command nvcc -ErrorAction SilentlyContinue + if ($nvccCmd) { + $env:CUDA_PATH = Split-Path (Split-Path $nvccCmd.Source -Parent) -Parent + } + } + if (-not $env:CUDA_PATH) { + throw "CUDA_PATH is not set and nvcc not found in PATH" + } + "$env:CUDA_PATH\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + python --version + pip --version + xmake --version + & "$env:CUDA_PATH\bin\nvcc.exe" --version + shell: pwsh + + - name: detect nvidia gpu (linux) + if: matrix.target == 'ubuntu' + run: 
| + if command -v nvidia-smi >/dev/null 2>&1; then + nvidia-smi || true + echo "HAS_NVIDIA_GPU=1" >> "$GITHUB_ENV" + else + echo "HAS_NVIDIA_GPU=0" >> "$GITHUB_ENV" + fi + shell: bash + + - name: detect nvidia gpu (windows) + if: matrix.target == 'windows' + run: | + if (Get-Command nvidia-smi -ErrorAction SilentlyContinue) { + nvidia-smi + "HAS_NVIDIA_GPU=1" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } else { + "HAS_NVIDIA_GPU=0" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append + } + shell: pwsh + + - name: Xmake CUDA Build & Install (linux) + if: matrix.target == 'ubuntu' + run: | + xmake f --nv-gpu=y -c -v + xmake -v + xmake install + shell: bash + + - name: Xmake CUDA Build & Install (windows) + if: matrix.target == 'windows' + run: | + if (-not $env:CUDA_PATH) { + throw "CUDA_PATH is required for windows CUDA build" + } + $env:Path = "$env:CUDA_PATH\bin;$env:Path" + xmake f -p windows -a x64 --toolchain=msvc --cuda="$env:CUDA_PATH" --nv-gpu=y -c -v + xmake -v; xmake install + shell: pwsh - - name: Install Python + - name: Install Python (linux) + if: matrix.target == 'ubuntu' run: | cd python - pip install . + pip install ./llaisyscore/ + pip install ./server-project/ cd .. + shell: bash - - name: Assignment-0 + - name: Install Python (windows) + if: matrix.target == 'windows' run: | - python test/test_runtime.py --device cpu + Set-Location python + pip install ./llaisyscore/ + pip install ./server-project/ + Set-Location ..
+ shell: pwsh - - name: Assignment-1 + - name: CUDA runtime api test + if: env.HAS_NVIDIA_GPU == '1' run: | - python test/test_tensor.py - - - name: Assignment-2 + python test/test_runtime.py --device nvidia + + - name: CUDA ops tests + if: env.HAS_NVIDIA_GPU == '1' + run: | + python test/ops/add.py --device nvidia + python test/ops/argmax.py --device nvidia + python test/ops/embedding.py --device nvidia + python test/ops/linear.py --device nvidia + python test/ops/random_sample.py --device nvidia + python test/ops/rms_norm.py --device nvidia + python test/ops/rope.py --device nvidia + python test/ops/self_attention.py --device nvidia + python test/ops/swiglu.py --device nvidia + + - name: CUDA infer test + if: env.HAS_NVIDIA_GPU == '1' run: | - python test/ops/add.py - python test/ops/argmax.py - python test/ops/embedding.py - python test/ops/linear.py - python test/ops/rms_norm.py - python test/ops/rope.py - python test/ops/self_attention.py - python test/ops/swiglu.py + python test/test_infer.py --device nvidia --test - - name: Assignment-3 + - name: skip gpu tests when no gpu + if: env.HAS_NVIDIA_GPU != '1' run: | - python test/test_infer.py --test + echo "No NVIDIA GPU available on this runner, skipped runtime/ops/infer GPU tests." 
+ diff --git a/.gitignore b/.gitignore index e38cf574..1dff4f73 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,8 @@ lib/ *.dll *.dylib *.pyd - +# tmpfile +.tmp/ # MacOS Cache .DS_Store diff --git a/include/llaisys/models/qwen2.h b/include/llaisys/models/qwen2.h index 7054626d..842d644c 100644 --- a/include/llaisys/models/qwen2.h +++ b/include/llaisys/models/qwen2.h @@ -29,14 +29,20 @@ __C { llaisysTensor_t *mlp_down_w; }; - struct LlaisysQwen2Model; + struct LlaisysQwen2Model + { + struct LlaisysQwen2Meta *meta; + struct LlaisysQwen2Weights *weights; + llaisysDeviceType_t device; + int *device_ids; + int ndevice; + }; __export struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice); - __export void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model); - __export struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model); - - __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken); + __export int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, + llaisysTensor_t *kcache, llaisysTensor_t *vcache, size_t past_len, + float temperature, int top_k, float top_p, int64_t seed); } #endif // LLAISYS_MODELS_QWEN2_H diff --git a/include/llaisys/ops.h b/include/llaisys/ops.h index ddb3be24..822a415f 100644 --- a/include/llaisys/ops.h +++ b/include/llaisys/ops.h @@ -13,6 +13,8 @@ __C { __export void llaisysROPE(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t pos_ids, float theta); __export void llaisysSelfAttention(llaisysTensor_t attn_val, llaisysTensor_t q, llaisysTensor_t k, llaisysTensor_t v, float scale); __export void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up); + __export void llaisysRandomSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t logits, + float temperature, int top_k, float 
top_p, int64_t seed); } #endif diff --git a/include/llaisys/tensor.h b/include/llaisys/tensor.h index 76f13fbc..ab37c59b 100644 --- a/include/llaisys/tensor.h +++ b/include/llaisys/tensor.h @@ -63,6 +63,16 @@ __C { size_t dim, size_t start, size_t end); + __export llaisysTensor_t tensorReshape( + llaisysTensor_t tensor, + size_t * shape, + size_t ndim); + + __export llaisysTensor_t tensorTo( + llaisysTensor_t tensor, + llaisysDeviceType_t device_type, + int device_id); + } #endif // LLAISYS_TENSOR_H diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000..9dc4aad0 --- /dev/null +++ b/python/README.md @@ -0,0 +1,25 @@ +# llaisys-server + +Standalone server package for Project #3. + +## Install order + +1. Install core package: + +```bash +python3 -m pip install -e /home/vankari/code/llaisys/python/llaisyscore --user --break-system-packages +``` + +2. Install server package: + +```bash +python3 -m pip install -e /home/vankari/code/llaisys/python/server-project --user --break-system-packages +``` + +## Run server + +```bash +cd /home/vankari/code/llaisys/python +python3 -m uvicorn server.app:app --host 0.0.0.0 --port 8000 +``` +### Attention: `--break-system-packages` was enabled for the public server; it should be disabled on a personal computer. \ No newline at end of file diff --git a/python/llaisys/__init__.py b/python/llaisys/__init__.py index de8d99f4..a16b3fff 100644 --- a/python/llaisys/__init__.py +++ b/python/llaisys/__init__.py @@ -6,6 +6,7 @@ from .tensor import Tensor from .ops import Ops from . import models +from . 
import backend from .models import * __all__ = [ @@ -17,4 +18,5 @@ "Tensor", "Ops", "models", + "backend", ] diff --git a/python/llaisys/backend/__init__.py b/python/llaisys/backend/__init__.py new file mode 100644 index 00000000..44d54020 --- /dev/null +++ b/python/llaisys/backend/__init__.py @@ -0,0 +1,3 @@ +from .inference_backend import InferenceBackend, SessionState + +__all__ = ["InferenceBackend", "SessionState"] diff --git a/python/llaisys/backend/inference_backend.py b/python/llaisys/backend/inference_backend.py new file mode 100644 index 00000000..7ca1be03 --- /dev/null +++ b/python/llaisys/backend/inference_backend.py @@ -0,0 +1,306 @@ +import threading +import time +import uuid +from dataclasses import dataclass, field +from typing import Any, Dict, Iterator, List, Optional + +from transformers import AutoTokenizer + +import llaisys +from llaisys.models.kvcachepool import KVCachePool + + +@dataclass +class SessionState: + session_id: str + messages: List[Dict[str, str]] = field(default_factory=list) + created_at: int = field(default_factory=lambda: int(time.time())) + updated_at: int = field(default_factory=lambda: int(time.time())) + + +class InferenceBackend: + """Project #3 optional backend for session-aware inference. 
+ + Responsibilities: + - Manage multi-session message history + - Integrate Qwen2 model with external KVCachePool + - Support history edit + regenerate workflow + """ + + def __init__( + self, + model_path: Optional[str] = None, + device: llaisys.DeviceType = llaisys.DeviceType.CPU, + max_sessions: int = 32, + ) -> None: + tokenizer_source = model_path or llaisys.models.Qwen2.DEFAULT_MODEL_ID + self._tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True) + self._model = llaisys.models.Qwen2(model_path=model_path, device=device) + self._pool = KVCachePool(self._model, max_sessions=max_sessions) + + self._sessions: Dict[str, SessionState] = {} + self._lock = threading.Lock() + + def create_session(self, session_id: Optional[str] = None, messages: Optional[List[Dict[str, str]]] = None) -> SessionState: + with self._lock: + sid = session_id or f"sess-{uuid.uuid4().hex}" + if sid in self._sessions: + raise ValueError(f"session already exists: {sid}") + state = SessionState(session_id=sid, messages=list(messages or [])) + self._sessions[sid] = state + return state + + def list_sessions(self) -> List[SessionState]: + with self._lock: + return list(self._sessions.values()) + + def get_session(self, session_id: str) -> SessionState: + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + return state + + def delete_session(self, session_id: str) -> None: + with self._lock: + state = self._sessions.pop(session_id, None) + if state is None: + return + self._pool.reset_session(session_id) + + def clear_sessions(self) -> None: + with self._lock: + self._sessions.clear() + self._pool.clear() + + def append_message(self, session_id: str, role: str, content: str) -> SessionState: + if role not in ("system", "user", "assistant"): + raise ValueError(f"unsupported role: {role}") + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise 
KeyError(f"session not found: {session_id}") + state.messages.append({"role": role, "content": content}) + state.updated_at = int(time.time()) + return state + + def replace_messages(self, session_id: str, messages: List[Dict[str, str]]) -> SessionState: + """Replace full history for one session. + + This is suitable for edit-history + regenerate workflow. + KV cache validity will be recalculated on next generate call. + """ + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + state.messages = list(messages) + state.updated_at = int(time.time()) + return state + + def _build_prompt(self, messages: List[Dict[str, str]]) -> str: + return self._tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=False, + ) + + @staticmethod + def _normalize_think_for_session(prompt: str, completion_text: str) -> str: + if not completion_text: + return completion_text + prompt_has_think = ("<think>" in prompt) or ("</think>" in prompt) + if not prompt_has_think: + return completion_text + + has_open = "<think>" in completion_text + has_close = "</think>" in completion_text + if has_close and not has_open: + completion_text = f"<think>{completion_text}" + has_open = True + if has_open and not has_close: + completion_text = f"{completion_text}</think>" + return completion_text + + def generate( + self, + session_id: str, + max_tokens: int, + top_k: int, + top_p: float, + temperature: float, + use_cache: bool = True, + append_assistant_message: bool = True, + ) -> Dict[str, Any]: + with self._lock: + state = self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + + prompt = self._build_prompt(state.messages) + prompt_ids = self._tokenizer.encode(prompt) + + if not use_cache: + output_ids = self._model.generate( + prompt_ids, + max_new_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=False, + ) + completion_ids = 
output_ids[len(prompt_ids):] + self._pool.reset_session(session_id) + else: + slot = self._pool.prepare_session(session_id, prompt_ids, max_tokens) + + if slot.past_len == 0: + model_input_ids = prompt_ids + model_past_len = 0 + else: + model_input_ids = prompt_ids[slot.past_len:] + model_past_len = slot.past_len + if not model_input_ids: + model_input_ids = [prompt_ids[-1]] + model_past_len = max(0, slot.past_len - 1) + + output_ids = self._model.generate( + model_input_ids, + max_new_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=True, + kcache_array=slot.kcache_array, + vcache_array=slot.vcache_array, + past_len=model_past_len, + ) + + completion_ids = output_ids[len(model_input_ids):] + full_output_ids = list(prompt_ids) + list(completion_ids) + self._pool.commit_session(session_id, full_output_ids) + + if self._model.eos_token_id in completion_ids: + completion_ids = completion_ids[:completion_ids.index(self._model.eos_token_id)] + + completion_text = self._tokenizer.decode(completion_ids, skip_special_tokens=False) + if append_assistant_message: + stored_text = self._normalize_think_for_session(prompt, completion_text) + state.messages.append({"role": "assistant", "content": stored_text}) + state.updated_at = int(time.time()) + + return { + "session_id": session_id, + "prompt_ids": prompt_ids, + "completion_ids": completion_ids, + "completion_text": completion_text, + "created": int(time.time()), + } + + def stream_generate( + self, + session_id: str, + max_tokens: int, + top_k: int, + top_p: float, + temperature: float, + use_cache: bool = True, + append_assistant_message: bool = True, + ) -> Iterator[str]: + if max_tokens <= 0: + raise ValueError("max_tokens must be positive") + if temperature <= 0: + raise ValueError("temperature must be positive") + if top_k == 0: + raise ValueError("top_k cannot be 0") + if top_p < 0 or top_p > 1: + raise ValueError("top_p must be in [0, 1]") + + with self._lock: + state = 
self._sessions.get(session_id) + if state is None: + raise KeyError(f"session not found: {session_id}") + + prompt = self._build_prompt(state.messages) + prompt_ids = self._tokenizer.encode(prompt) + completion_ids: List[int] = [] + + if not use_cache: + generated_ids = list(prompt_ids) + for _ in range(max_tokens): + token_id = self._model._infer_tokens( + generated_ids, + None, + None, + 0, + temperature, + top_k, + top_p, + ) + if token_id == self._model.eos_token_id: + break + completion_ids.append(token_id) + generated_ids.append(token_id) + piece = self._tokenizer.decode([token_id], skip_special_tokens=False) + if piece: + yield piece + self._pool.reset_session(session_id) + else: + slot = self._pool.prepare_session(session_id, prompt_ids, max_tokens) + + if slot.past_len == 0: + prefill_input_ids = prompt_ids + prefill_past_len = 0 + else: + prefill_input_ids = prompt_ids[slot.past_len:] + prefill_past_len = slot.past_len + if not prefill_input_ids: + prefill_input_ids = [prompt_ids[-1]] + prefill_past_len = max(0, slot.past_len - 1) + + token_id = self._model._infer_tokens( + prefill_input_ids, + slot.kcache_array, + slot.vcache_array, + prefill_past_len, + temperature, + top_k, + top_p, + ) + if token_id != self._model.eos_token_id: + completion_ids.append(token_id) + piece = self._tokenizer.decode([token_id], skip_special_tokens=False) + if piece: + yield piece + else: + token_id = None + + cached_token_len = prefill_past_len + len(prefill_input_ids) + for _ in range(max_tokens - 1): + if token_id is None: + break + token_id = self._model._infer_tokens( + [token_id], + slot.kcache_array, + slot.vcache_array, + cached_token_len, + temperature, + top_k, + top_p, + ) + if token_id == self._model.eos_token_id: + break + completion_ids.append(token_id) + cached_token_len += 1 + piece = self._tokenizer.decode([token_id], skip_special_tokens=False) + if piece: + yield piece + + full_output_ids = list(prompt_ids) + completion_ids + 
self._pool.commit_session(session_id, full_output_ids) + + if append_assistant_message: + completion_text = self._tokenizer.decode(completion_ids, skip_special_tokens=False) + stored_text = self._normalize_think_for_session(prompt, completion_text) + state.messages.append({"role": "assistant", "content": stored_text}) + state.updated_at = int(time.time()) diff --git a/python/llaisys/libllaisys/__init__.py b/python/llaisys/libllaisys/__init__.py index f536fb52..0f952d00 100644 --- a/python/llaisys/libllaisys/__init__.py +++ b/python/llaisys/libllaisys/__init__.py @@ -12,7 +12,7 @@ from .tensor import llaisysTensor_t from .tensor import load_tensor from .ops import load_ops - +from .models import load_qwen2 def load_shared_library(): lib_dir = Path(__file__).parent @@ -38,7 +38,7 @@ def load_shared_library(): load_runtime(LIB_LLAISYS) load_tensor(LIB_LLAISYS) load_ops(LIB_LLAISYS) - +load_qwen2(LIB_LLAISYS) __all__ = [ "LIB_LLAISYS", @@ -52,4 +52,6 @@ def load_shared_library(): "llaisysMemcpyKind_t", "MemcpyKind", "llaisysStream_t", + "LlaisysQwen2Meta", + "LlaisysQwen2Weights" ] diff --git a/python/llaisys/libllaisys/models.py b/python/llaisys/libllaisys/models.py new file mode 100644 index 00000000..dfd480ae --- /dev/null +++ b/python/llaisys/libllaisys/models.py @@ -0,0 +1,75 @@ +from ctypes import c_int, c_int64, c_size_t +from .llaisys_types import llaisysDeviceType_t, llaisysDataType_t +from .tensor import llaisysTensor_t +import ctypes + +class LlaisysQwen2Meta(ctypes.Structure): + _fields_ = [ + ("dtype", llaisysDataType_t), + ("nlayer", ctypes.c_size_t), + ("hs", ctypes.c_size_t), + ("nh", ctypes.c_size_t), + ("nkvh", ctypes.c_size_t), + ("dh", ctypes.c_size_t), + ("di", ctypes.c_size_t), + ("maxseq", ctypes.c_size_t), + ("voc", ctypes.c_size_t), + ("epsilon", ctypes.c_float), + ("theta", ctypes.c_float), + ("end_token", ctypes.c_int64) + ] + +class LlaisysQwen2Weights(ctypes.Structure): + _fields_ = [ + ("in_embed", llaisysTensor_t), + ("out_embed", 
llaisysTensor_t), + ("out_norm_w", llaisysTensor_t), + + ("attn_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_q_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_k_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_w", ctypes.POINTER(llaisysTensor_t)), + ("attn_v_b", ctypes.POINTER(llaisysTensor_t)), + ("attn_o_w", ctypes.POINTER(llaisysTensor_t)), + + ("mlp_norm_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_gate_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_up_w", ctypes.POINTER(llaisysTensor_t)), + ("mlp_down_w", ctypes.POINTER(llaisysTensor_t)), + ] + +class LlaisysQwen2Model(ctypes.Structure): + # field order must match struct LlaisysQwen2Model in include/llaisys/models/qwen2.h + _fields_ = [ + ("meta", ctypes.POINTER(LlaisysQwen2Meta)), + ("weights", ctypes.POINTER(LlaisysQwen2Weights)), + ("device", ctypes.c_int), # llaisysDeviceType_t + ("device_ids", ctypes.POINTER(ctypes.c_int)), + ("ndevice", ctypes.c_int), + ] + +# Load shared library +def load_qwen2(lib): + lib.llaisysQwen2ModelCreate.argtypes = [ctypes.POINTER(LlaisysQwen2Meta), llaisysDeviceType_t, ctypes.POINTER(c_int), c_int] + lib.llaisysQwen2ModelCreate.restype = ctypes.POINTER(LlaisysQwen2Model) + + lib.llaisysQwen2ModelDestroy.argtypes = [ctypes.POINTER(LlaisysQwen2Model)] + lib.llaisysQwen2ModelDestroy.restype = None + + lib.llaisysQwen2ModelWeights.argtypes = [ctypes.POINTER(LlaisysQwen2Model)] + lib.llaisysQwen2ModelWeights.restype = ctypes.POINTER(LlaisysQwen2Weights) + + lib.llaisysQwen2ModelInfer.argtypes = [ + ctypes.POINTER(LlaisysQwen2Model), + ctypes.POINTER(c_int64), + c_size_t, + ctypes.POINTER(llaisysTensor_t), + ctypes.POINTER(llaisysTensor_t), + c_size_t, + ctypes.c_float, + c_int, + ctypes.c_float, + c_int64, + ] + lib.llaisysQwen2ModelInfer.restype = c_int64 \ No newline at end of file diff --git a/python/llaisys/libllaisys/ops.py b/python/llaisys/libllaisys/ops.py index 5be095ef..49b70620 100644 --- a/python/llaisys/libllaisys/ops.py +++ 
b/python/llaisys/libllaisys/ops.py @@ -1,5 +1,5 @@ from .tensor import llaisysTensor_t -from ctypes import c_float +from ctypes import c_float, c_int, c_int64 def load_ops(lib): lib.llaisysAdd.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] @@ -34,3 +34,14 @@ def load_ops(lib): lib.llaisysSwiGLU.argtypes = [llaisysTensor_t, llaisysTensor_t, llaisysTensor_t] lib.llaisysSwiGLU.restype = None + + lib.llaisysRandomSample.argtypes = [ + llaisysTensor_t, + llaisysTensor_t, + llaisysTensor_t, + c_float, + c_int, + c_float, + c_int64, + ] + lib.llaisysRandomSample.restype = None diff --git a/python/llaisys/libllaisys/tensor.py b/python/llaisys/libllaisys/tensor.py index b5805788..ce7e674b 100644 --- a/python/llaisys/libllaisys/tensor.py +++ b/python/llaisys/libllaisys/tensor.py @@ -76,3 +76,14 @@ def load_tensor(lib): c_size_t, # end : exclusive ] lib.tensorSlice.restype = llaisysTensor_t + # Function: tensorToDevice(llaisysTensor_t tensor, + # llaisysDeviceType_t device_type, + # int device_id); + lib.tensorTo.argtypes = [ + llaisysTensor_t, + llaisysDeviceType_t, + c_int, + ] + lib.tensorTo.restype = llaisysTensor_t + lib.tensorReshape.argtypes = [llaisysTensor_t, POINTER(c_size_t), c_size_t] + lib.tensorReshape.restype = llaisysTensor_t \ No newline at end of file diff --git a/python/llaisys/models/__init__.py b/python/llaisys/models/__init__.py index af9918b0..fea68919 100644 --- a/python/llaisys/models/__init__.py +++ b/python/llaisys/models/__init__.py @@ -1 +1,2 @@ from .qwen2 import Qwen2 +from .kvcachepool import KVCachePool, KVCacheSlot, PrefixMatch diff --git a/python/llaisys/models/kvcachepool.py b/python/llaisys/models/kvcachepool.py new file mode 100644 index 00000000..0737f2a1 --- /dev/null +++ b/python/llaisys/models/kvcachepool.py @@ -0,0 +1,487 @@ +from __future__ import annotations + +from dataclasses import dataclass +from collections import OrderedDict +from typing import Dict, Optional, Sequence +import ctypes + +from ..libllaisys 
import LIB_LLAISYS, llaisysTensor_t, DeviceType, DataType, MemcpyKind +from ..runtime import RuntimeAPI + + +@dataclass +class PrefixMatch: + """Prefix match result from pool-level lookup. + + Attributes: + session_id: Matched source session id. + matched_tokens: Number of matched prefix tokens. + """ + session_id: str + matched_tokens: int + + +@dataclass +class KVCacheSlot: + """Per-session KV cache slot. + + Attributes: + session_id: Session identifier. + capacity: Max tokens storable in this slot. + kcache_array: C-level K cache tensor array, one per layer. + vcache_array: C-level V cache tensor array, one per layer. + tokens: Full token history tracked by upper layer. + past_len: Valid reusable prefix length in current caches. + """ + session_id: str + capacity: int + kcache_array: object + vcache_array: object + tokens: tuple[int, ...] + past_len: int + + +class _TrieNode: + """Trie node for token-prefix indexing. + + Attributes: + token: Current node token value, root is None. + children: Mapping token -> child node. + session_indices: Sessions that contain this prefix. + """ + __slots__ = ("token", "children", "session_indices") + + def __init__(self, token: Optional[int] = None): + self.token = token + self.children: dict[int, _TrieNode] = {} + self.session_indices: list[int] = [] + + +class KVCachePool: + """External KV cache manager for multi-session inference. + + Notes: + - This pool owns tensor lifecycle for created caches. + - Prefix matching is based on token history per session. + - Backend should call `prepare_session` before generation and + `commit_session` after generation. + """ + + def __init__(self, model, max_sessions: int = 32): + """Create a KV cache pool. + + Args: + model: Qwen2 model instance that provides tensor/meta attributes. + max_sessions: Maximum active sessions retained in memory. 
+ """ + self._model = model + self._max_sessions = max_sessions + self._slots: "OrderedDict[str, KVCacheSlot]" = OrderedDict() + self._trie_root = _TrieNode() + self._next_session_index = 0 + self._session_to_index: dict[str, int] = {} + self._index_to_session: dict[int, str] = {} + self._active_indices: set[int] = set() + self._runtime = RuntimeAPI(self._model.device) + + @staticmethod + def _common_prefix_len(a: Sequence[int], b: Sequence[int]) -> int: + """Return common-prefix length for two token sequences.""" + limit = min(len(a), len(b)) + for i in range(limit): + if a[i] != b[i]: + return i + return limit + + def _destroy_slot(self, slot: KVCacheSlot) -> None: + """Release all tensor resources owned by one slot.""" + for i in range(self._model.num_hidden_layers): + if slot.kcache_array[i]: + LIB_LLAISYS.tensorDestroy(slot.kcache_array[i]) + if slot.vcache_array[i]: + LIB_LLAISYS.tensorDestroy(slot.vcache_array[i]) + + @staticmethod + def _dtype_nbytes(dtype: DataType) -> int: + if dtype in (DataType.BYTE, DataType.BOOL, DataType.I8, DataType.U8): + return 1 + if dtype in (DataType.I16, DataType.U16, DataType.F16, DataType.BF16): + return 2 + if dtype in (DataType.I32, DataType.U32, DataType.F32): + return 4 + if dtype in (DataType.I64, DataType.U64, DataType.F64): + return 8 + raise ValueError(f"unsupported dtype for KV cache copy: {dtype}") + + def _copy_prefix_cache(self, src_slot: KVCacheSlot, dst_slot: KVCacheSlot, prefix_len: int) -> None: + if prefix_len <= 0: + return + + nbytes = ( + prefix_len + * self._model.num_key_value_heads + * self._model.per_kvhead_dim + * self._dtype_nbytes(self._model.data_type) + ) + memcpy_kind = MemcpyKind.H2H if self._model.device == DeviceType.CPU else MemcpyKind.D2D + + for i in range(self._model.num_hidden_layers): + src_k = LIB_LLAISYS.tensorSlice(src_slot.kcache_array[i], 0, 0, prefix_len) + dst_k = LIB_LLAISYS.tensorSlice(dst_slot.kcache_array[i], 0, 0, prefix_len) + src_v = 
LIB_LLAISYS.tensorSlice(src_slot.vcache_array[i], 0, 0, prefix_len) + dst_v = LIB_LLAISYS.tensorSlice(dst_slot.vcache_array[i], 0, 0, prefix_len) + try: + self._runtime.memcpy_sync( + LIB_LLAISYS.tensorGetData(dst_k), + LIB_LLAISYS.tensorGetData(src_k), + nbytes, + memcpy_kind, + ) + self._runtime.memcpy_sync( + LIB_LLAISYS.tensorGetData(dst_v), + LIB_LLAISYS.tensorGetData(src_v), + nbytes, + memcpy_kind, + ) + finally: + LIB_LLAISYS.tensorDestroy(src_k) + LIB_LLAISYS.tensorDestroy(dst_k) + LIB_LLAISYS.tensorDestroy(src_v) + LIB_LLAISYS.tensorDestroy(dst_v) + + def _get_or_create_session_index(self, session_id: str) -> int: + """Get stable integer index for a session id.""" + idx = self._session_to_index.get(session_id) + if idx is not None: + return idx + idx = self._next_session_index + self._next_session_index += 1 + self._session_to_index[session_id] = idx + self._index_to_session[idx] = session_id + return idx + + @staticmethod + def _add_session_index(node: _TrieNode, session_index: int) -> None: + """Attach one session index to trie node if absent.""" + if session_index not in node.session_indices: + node.session_indices.append(session_index) + + @staticmethod + def _remove_session_index(node: _TrieNode, session_index: int) -> None: + """Detach one session index from trie node if present.""" + if session_index in node.session_indices: + node.session_indices.remove(session_index) + + def _trie_insert(self, tokens: Sequence[int], session_index: int) -> None: + """Insert a token prefix path into trie for one session.""" + node = self._trie_root + for token in tokens: + child = node.children.get(token) + if child is None: + child = _TrieNode(token) + node.children[token] = child + node = child + self._add_session_index(node, session_index) + + def _trie_remove(self, tokens: Sequence[int], session_index: int) -> None: + """Remove one session's token prefix path from trie.""" + if not tokens: + return + + stack: list[tuple[_TrieNode, int, _TrieNode]] = [] + node = 
self._trie_root + for token in tokens: + child = node.children.get(token) + if child is None: + return + stack.append((node, token, child)) + node = child + + for _, _, child in stack: + self._remove_session_index(child, session_index) + + for parent, token, child in reversed(stack): + if child.children or child.session_indices: + break + del parent.children[token] + + def _trie_remove_from_depth(self, tokens: Sequence[int], session_index: int, keep_depth: int) -> None: + """Remove session index from trie nodes after `keep_depth` on one token path. + + Args: + tokens: Existing cached token path. + session_index: Target session index. + keep_depth: Number of leading nodes to keep unchanged. + """ + if not tokens: + return + + stack: list[tuple[_TrieNode, int, _TrieNode]] = [] + node = self._trie_root + for token in tokens: + child = node.children.get(token) + if child is None: + return + stack.append((node, token, child)) + node = child + + for depth, (_, _, child) in enumerate(stack, start=1): + if depth > keep_depth: + self._remove_session_index(child, session_index) + + for parent, token, child in reversed(stack): + if child.children or child.session_indices: + break + del parent.children[token] + + def _trie_best_prefix_len(self, tokens: Sequence[int], exclude_session_id: Optional[str]) -> tuple[int, Optional[str]]: + """Find longest matched prefix and source session by trie traversal.""" + exclude_index = None + if exclude_session_id is not None: + exclude_index = self._session_to_index.get(exclude_session_id) + + best_len = 0 + best_session_id: Optional[str] = None + node = self._trie_root + + for depth, token in enumerate(tokens, start=1): + node = node.children.get(token) + if node is None: + break + + matched_session_id = None + for session_index in node.session_indices: + if session_index == exclude_index: + continue + if session_index not in self._active_indices: + continue + sid = self._index_to_session.get(session_index) + if sid is None or sid not in 
self._slots: + continue + matched_session_id = sid + break + + if matched_session_id is not None: + best_len = depth + best_session_id = matched_session_id + + return best_len, best_session_id + + def _update_slot_tokens(self, session_id: str, new_tokens: Sequence[int]) -> None: + """Update a session slot token history and synchronize trie index.""" + slot = self._slots.get(session_id) + if slot is None: + return + + session_index = self._get_or_create_session_index(session_id) + self._active_indices.add(session_index) + + old_tokens = slot.tokens[:slot.past_len] + if old_tokens: + self._trie_remove(old_tokens, session_index) + + slot.tokens = tuple(new_tokens) + slot.past_len = len(slot.tokens) + + if slot.past_len > 0: + self._trie_insert(slot.tokens[:slot.past_len], session_index) + + def _set_slot_reuse_state(self, session_id: str, new_tokens: Sequence[int], reusable_len: int) -> KVCacheSlot: + slot = self._slots.get(session_id) + if slot is None: + raise KeyError(f"session not found: {session_id}") + + new_tokens = tuple(new_tokens) + reusable_len = max(0, min(reusable_len, len(new_tokens))) + + session_index = self._get_or_create_session_index(session_id) + self._active_indices.add(session_index) + + old_cached_tokens = slot.tokens[:slot.past_len] + if old_cached_tokens: + self._trie_remove(old_cached_tokens, session_index) + + slot.tokens = new_tokens + slot.past_len = reusable_len + + if reusable_len > 0: + self._trie_insert(new_tokens[:reusable_len], session_index) + + return slot + + def _evict_if_needed(self) -> None: + """Evict least-recently-used sessions when exceeding pool capacity.""" + while len(self._slots) > self._max_sessions: + session_id, slot = self._slots.popitem(last=False) + session_index = self._session_to_index.get(session_id) + if session_index is not None: + if slot.past_len > 0: + self._trie_remove(slot.tokens[:slot.past_len], session_index) + self._active_indices.discard(session_index) + self._destroy_slot(slot) + + def 
_allocate_slot(self, session_id: str, capacity: int) -> KVCacheSlot: + """Allocate one session slot and its per-layer KV tensors.""" + array_type = llaisysTensor_t * self._model.num_hidden_layers + kcache_array = array_type() + vcache_array = array_type() + + for i in range(self._model.num_hidden_layers): + shape_arr = (ctypes.c_size_t * 3)( + capacity, + self._model.num_key_value_heads, + self._model.per_kvhead_dim, + ) + kcache_array[i] = LIB_LLAISYS.tensorCreate( + shape_arr, 3, self._model.data_type, self._model.device, self._model.device_id + ) + vcache_array[i] = LIB_LLAISYS.tensorCreate( + shape_arr, 3, self._model.data_type, self._model.device, self._model.device_id + ) + + return KVCacheSlot( + session_id=session_id, + capacity=capacity, + kcache_array=kcache_array, + vcache_array=vcache_array, + tokens=(), + past_len=0, + ) + + def find_best_prefix(self, tokens: Sequence[int], exclude_session_id: Optional[str] = None) -> Optional[PrefixMatch]: + """Find best reusable prefix from pool. + + Args: + tokens: Target token sequence. + exclude_session_id: Optional session id to skip. + + Returns: + PrefixMatch for longest hit, or None if no positive-length match. + """ + matched_len, matched_session = self._trie_best_prefix_len(tokens, exclude_session_id) + if matched_session is None or matched_len == 0: + return None + return PrefixMatch(session_id=matched_session, matched_tokens=matched_len) + + def prepare_session(self, session_id: str, prompt_tokens: Sequence[int], max_new_tokens: int) -> KVCacheSlot: + """Prepare slot for a new inference call. + + This method ensures slot existence/capacity, computes best reusable + prefix from current session and pool-level matches, and returns slot + handles for model inference. + + Args: + session_id: Target session id. + prompt_tokens: Full prompt tokens for this call. + max_new_tokens: Planned generation upper bound. + + Returns: + Prepared KVCacheSlot with updated `past_len`. 
+ """ + prompt_tokens = tuple(prompt_tokens) + if not prompt_tokens: + raise ValueError("prompt_tokens cannot be empty") + if max_new_tokens <= 0: + raise ValueError("max_new_tokens must be positive") + + required_capacity = len(prompt_tokens) + max_new_tokens + slot = self._slots.get(session_id) + existed_before = slot is not None + + if slot is None: + slot = self._allocate_slot(session_id, max(required_capacity, self._model.max_position_embeddings)) + self._slots[session_id] = slot + elif slot.capacity < required_capacity: + self._destroy_slot(slot) + slot = self._allocate_slot(session_id, max(required_capacity, self._model.max_position_embeddings)) + self._slots[session_id] = slot + + self_shared = self._common_prefix_len(slot.tokens[:slot.past_len], prompt_tokens) if existed_before else 0 + + pool_match = self.find_best_prefix(prompt_tokens, exclude_session_id=session_id) + pool_shared = pool_match.matched_tokens if pool_match is not None else 0 + + chosen_shared = self_shared + source_slot: Optional[KVCacheSlot] = None + if pool_shared > chosen_shared and pool_match is not None: + source_slot = self._slots.get(pool_match.session_id) + if source_slot is not None: + chosen_shared = pool_shared + + slot = self._set_slot_reuse_state(session_id, prompt_tokens, chosen_shared) + if source_slot is not None and chosen_shared > 0: + self._copy_prefix_cache(source_slot, slot, chosen_shared) + + self._slots.move_to_end(session_id) + self._evict_if_needed() + return slot + + def modify_session_tokens(self, session_id: str, edited_tokens: Sequence[int]) -> KVCacheSlot: + """Apply history edit for one session and invalidate cache suffix. + + The pool keeps trie/session state for common prefix unchanged and drops + session index on trie nodes after the first edited position. + + Args: + session_id: Target session id. + edited_tokens: New full token history after edit. + + Returns: + Updated KVCacheSlot whose `past_len` is the reusable cache prefix. 
+ + Raises: + KeyError: If target session does not exist. + """ + slot = self._slots.get(session_id) + if slot is None: + raise KeyError(f"session not found: {session_id}") + + new_tokens = tuple(edited_tokens) + old_cached_tokens = slot.tokens[:slot.past_len] + keep_len = self._common_prefix_len(old_cached_tokens, new_tokens) + + session_index = self._get_or_create_session_index(session_id) + self._active_indices.add(session_index) + + if old_cached_tokens: + self._trie_remove_from_depth(old_cached_tokens, session_index, keep_len) + if keep_len > 0: + self._trie_insert(new_tokens[:keep_len], session_index) + + slot.tokens = new_tokens + slot.past_len = keep_len + self._slots.move_to_end(session_id) + return slot + + def commit_session(self, session_id: str, full_tokens: Sequence[int]) -> None: + """Commit generation result and refresh trie index. + + Args: + session_id: Target session id. + full_tokens: Full tokens after generation (prompt + output). + """ + slot = self._slots.get(session_id) + if slot is None: + return + self._update_slot_tokens(session_id, tuple(full_tokens)) + self._slots.move_to_end(session_id) + + def reset_session(self, session_id: str) -> None: + """Remove one session and release its cache resources.""" + slot = self._slots.pop(session_id, None) + if slot is None: + return + session_index = self._session_to_index.get(session_id) + if session_index is not None: + if slot.past_len > 0: + self._trie_remove(slot.tokens[:slot.past_len], session_index) + self._active_indices.discard(session_index) + self._destroy_slot(slot) + + def clear(self) -> None: + """Clear all sessions and release all cache resources.""" + for slot in self._slots.values(): + self._destroy_slot(slot) + self._slots.clear() + self._trie_root = _TrieNode() + self._active_indices.clear() diff --git a/python/llaisys/models/qwen2.py b/python/llaisys/models/qwen2.py index 0d07b0b2..fc436818 100644 --- a/python/llaisys/models/qwen2.py +++ b/python/llaisys/models/qwen2.py @@ -1,33 
+1,331 @@ -from typing import Sequence -from ..libllaisys import LIB_LLAISYS -from ..libllaisys import DeviceType - +from typing import Sequence, Optional, Union from pathlib import Path +import json +import ctypes + +from huggingface_hub import snapshot_download import safetensors +import torch + +from ..libllaisys import LIB_LLAISYS, DeviceType, DataType, llaisysTensor_t +from ..libllaisys.models import load_qwen2, LlaisysQwen2Meta +load_qwen2(LIB_LLAISYS) class Qwen2: + """Qwen2 language model implementation.""" + + DEFAULT_MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" + + def __init__( + self, + model_path: Optional[Union[str, Path]] = None, + device: DeviceType = DeviceType.CPU, + max_seq_len: Optional[int] = None + ): + """Initialize Qwen2 model. + + Args: + model_path: Path to model directory. If None, downloads default model. + device: Device type for inference. + max_seq_len: Maximum sequence length. If None, uses model's default. + Set this to a larger value (e.g., 8192, 16384) for longer contexts. + + Raises: + ValueError: If unsupported device is specified. + FileNotFoundError: If required model files are missing. + """ + if device not in (DeviceType.CPU, DeviceType.NVIDIA): + raise ValueError(f"Unsupported device: {device}. 
Only CPU and NVIDIA are supported.") + + self.model_path = self._resolve_model_path(model_path) + self._validate_model_files() + + self.device = device + self.device_id = 0 + + config = self._load_config() + self._init_model_params(config) + + # Override max sequence length if specified + if max_seq_len is not None: + if max_seq_len <= 0: + raise ValueError(f"max_seq_len must be positive, got {max_seq_len}") + print(f"[Qwen2] Overriding max_position_embeddings: {self.max_position_embeddings} → {max_seq_len}") + self.max_position_embeddings = max_seq_len + + self.data_type = DataType.F32 if device == DeviceType.CPU else DataType.BF16 + + self._create_model() + + self._load_weights() + + + def _resolve_model_path(self, model_path: Optional[Union[str, Path]]) -> Path: + """Resolve model path, downloading if necessary.""" + if model_path is not None and Path(model_path).exists(): + return Path(model_path) + return Path(snapshot_download(self.DEFAULT_MODEL_ID)) + + def _validate_model_files(self) -> None: + """Validate that required model files exist.""" + required_files = ["config.json", "model.safetensors"] + for file_name in required_files: + file_path = self.model_path / file_name + if not file_path.exists(): + raise FileNotFoundError(f"Required file {file_path} not found!") + + def _load_config(self) -> dict: + """Load model configuration from config.json.""" + config_path = self.model_path / "config.json" + try: + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) + except (json.JSONDecodeError, IOError) as e: + raise ValueError(f"Failed to load config from {config_path}: {e}") + + def _init_model_params(self, config: dict) -> None: + """Initialize model parameters from configuration.""" + # Required parameters + required_params = [ + "hidden_size", "intermediate_size", "max_position_embeddings", + "num_attention_heads", "num_hidden_layers", "num_key_value_heads", + "rms_norm_eps", "rope_theta", "vocab_size", "eos_token_id" + ] + + for param 
in required_params: + if param not in config: + raise ValueError(f"Missing required parameter: {param}") + + self.eos_token_id = config["eos_token_id"] + self.hidden_size = config["hidden_size"] + self.intermediate_size = config["intermediate_size"] + self.max_position_embeddings = config["max_position_embeddings"] + self.num_attention_heads = config["num_attention_heads"] + self.num_hidden_layers = config["num_hidden_layers"] + self.num_key_value_heads = config["num_key_value_heads"] + self.rms_norm_eps = config["rms_norm_eps"] + self.rope_theta = config["rope_theta"] + self.torch_dtype = config.get("torch_dtype", "bfloat16") + self.vocab_size = config["vocab_size"] + + # Derived parameters + self.per_head_dim = self.hidden_size // self.num_attention_heads + self.per_kvhead_dim = self.per_head_dim # For Qwen2, dv = d + + def _create_model(self) -> None: + """Create the model instance.""" + meta = LlaisysQwen2Meta( + dtype=self.data_type, + nlayer=self.num_hidden_layers, + hs=self.hidden_size, + nh=self.num_attention_heads, + nkvh=self.num_key_value_heads, + dh=self.per_head_dim, + di=self.intermediate_size, + maxseq=self.max_position_embeddings, + voc=self.vocab_size, + epsilon=self.rms_norm_eps, + theta=self.rope_theta, + end_token=self.eos_token_id + ) - def __init__(self, model_path, device: DeviceType = DeviceType.CPU): - # TODO: Implement model constructor + device_ids = (ctypes.c_int * 1)(0) + self.model = LIB_LLAISYS.llaisysQwen2ModelCreate( + ctypes.byref(meta), + ctypes.c_int(self.device), + device_ids, + ctypes.c_int(1) + ) - model_path = Path(model_path) + if not self.model: + raise RuntimeError("Failed to create Qwen2 model.") - for file in sorted(model_path.glob("*.safetensors")): - data_ = safetensors.safe_open(file, framework="numpy", device="cpu") - for name_ in data_.keys(): - ## TODO: load the model weights - pass + def _load_weights(self) -> None: + """Load model weights from safetensors files.""" + weights = 
LIB_LLAISYS.llaisysQwen2ModelWeights(self.model) + if not weights: + raise RuntimeError("Failed to get Qwen2 weights.") + def maybe_cast_tensor(tensor): + """Cast tensor to appropriate dtype if needed.""" + if self.device == DeviceType.CPU: + return tensor.to(torch.float32).contiguous() + elif self.device == DeviceType.NVIDIA: + return tensor.to(torch.bfloat16).contiguous() + return tensor + + + for file in sorted(self.model_path.glob("*.safetensors")): + data = safetensors.safe_open(file, framework="torch", device="cpu") + self._load_embedding_layers(data, weights, maybe_cast_tensor) + self._load_attention_layers(data, weights, maybe_cast_tensor) + self._load_mlp_layers(data, weights, maybe_cast_tensor) + + + def _load_embedding_layers(self, data, weights, cast_fn): + """Load embedding and output layers.""" + embedding_mappings = [ + ("model.embed_tokens.weight", "in_embed"), + ("lm_head.weight", "out_embed"), + ("model.norm.weight", "out_norm_w") + ] + + for tensor_name, field_name in embedding_mappings: + tensor = cast_fn(data.get_tensor(tensor_name)) + LIB_LLAISYS.tensorLoad(getattr(weights.contents, field_name), tensor.data_ptr()) + + def _load_attention_layers(self, data, weights, cast_fn): + """Load self-attention layer weights.""" + attention_mappings = [ + ("input_layernorm.weight", "attn_norm_w"), + ("self_attn.q_proj.weight", "attn_q_w"), + ("self_attn.q_proj.bias", "attn_q_b"), + ("self_attn.k_proj.weight", "attn_k_w"), + ("self_attn.k_proj.bias", "attn_k_b"), + ("self_attn.v_proj.weight", "attn_v_w"), + ("self_attn.v_proj.bias", "attn_v_b"), + ("self_attn.o_proj.weight", "attn_o_w"), + ] + + for base_name, field_name in attention_mappings: + self._load_layer_array(data, weights, field_name, base_name, cast_fn) + + def _load_mlp_layers(self, data, weights, cast_fn): + """Load MLP layer weights.""" + mlp_mappings = [ + ("post_attention_layernorm.weight", "mlp_norm_w"), + ("mlp.gate_proj.weight", "mlp_gate_w"), + ("mlp.up_proj.weight", "mlp_up_w"), + 
("mlp.down_proj.weight", "mlp_down_w"), + ] + + for base_name, field_name in mlp_mappings: + self._load_layer_array(data, weights, field_name, base_name, cast_fn) + + def _load_layer_array(self, data, weights, field_name, base_name, cast_fn): + """Load weights for a layer array.""" + arr_ptr = getattr(weights.contents, field_name) + arr_type = llaisysTensor_t * self.num_hidden_layers + arr = ctypes.cast(arr_ptr, ctypes.POINTER(arr_type)).contents + + for i in range(self.num_hidden_layers): + tensor_name = f"model.layers.{i}.{base_name}" + tensor = cast_fn(data.get_tensor(tensor_name)) + LIB_LLAISYS.tensorLoad(arr[i], tensor.data_ptr()) def generate( self, inputs: Sequence[int], - max_new_tokens: int = None, + max_new_tokens: int = 128, top_k: int = 1, top_p: float = 0.8, temperature: float = 0.8, - ): + use_cache: bool = True, + kcache_array=None, + vcache_array=None, + past_len: int = 0, + seed: Optional[int] = None, + ) -> Sequence[int]: + """Generate tokens using the model. + + Args: + inputs: Input token IDs. + - use_cache=True with external cache: incremental input (current request tail). + - use_cache=False: full input sequence. + max_new_tokens: Maximum number of new tokens to generate + top_k: Top-k sampling parameter + top_p: Top-p (nucleus) sampling parameter + temperature: Sampling temperature + use_cache: Whether to use KV cache for efficiency. + kcache_array: External KV cache k tensor array managed outside model. + vcache_array: External KV cache v tensor array managed outside model. + past_len: Number of valid tokens already written in external KV cache. + seed: Optional random seed for reproducible sampling. If set, + each decode step uses `seed + step_index`. + + Returns: + Token IDs for current request + current output. + - use_cache=True: returns `inputs + new_tokens`. + - use_cache=False: returns `inputs[past_len:] + new_tokens`. 
+ """ + if not inputs: + raise ValueError("Input tokens cannot be empty") + if max_new_tokens <= 0: + raise ValueError("max_new_tokens must be positive") + if temperature <= 0: + raise ValueError("temperature must be positive") + if top_k == 0: + raise ValueError("top_k cannot be 0") + if top_p < 0 or top_p > 1: + raise ValueError("top_p must be in [0, 1]") + if past_len < 0: + raise ValueError("past_len must be non-negative") + if seed is not None and seed < 0: + raise ValueError("seed must be non-negative") + + generated = list(inputs) + use_external_cache = bool(use_cache and kcache_array is not None and vcache_array is not None) + null_cache = ctypes.POINTER(llaisysTensor_t)() + active_kcache = kcache_array if use_external_cache else null_cache + active_vcache = vcache_array if use_external_cache else null_cache + + if not use_external_cache and past_len > len(generated): + raise ValueError( + f"past_len={past_len} cannot exceed input length={len(generated)} for the current call" + ) + + output = list(generated) if use_external_cache else list(generated[past_len:]) + cached_token_len = past_len + sample_step = 0 + + # Prefill phase + prefill_seed = (seed + sample_step) if seed is not None else -1 + next_token = self._infer_tokens(generated, active_kcache, active_vcache, + past_len if use_external_cache else 0, temperature, top_k, top_p, + prefill_seed) + output.append(next_token) + generated.append(next_token) + cached_token_len +=len(output)-1 + sample_step += 1 + + + # Decode phase + for _ in range(max_new_tokens - 1): + if next_token == self.eos_token_id: + break + + if use_external_cache: + step_seed = (seed + sample_step) if seed is not None else -1 + next_token = self._infer_tokens([next_token], active_kcache, active_vcache, cached_token_len, + temperature, top_k, top_p, step_seed) + cached_token_len += 1 + else: + step_seed = (seed + sample_step) if seed is not None else -1 + next_token = self._infer_tokens(generated, active_kcache, active_vcache, 0, + 
temperature, top_k, top_p, step_seed) + + generated.append(next_token) + output.append(next_token) + sample_step += 1 + + return output - # TODO: Implement generate function + def _infer_tokens(self, tokens: Sequence[int], kcache_array, vcache_array, + past_len: int, temperature: float, top_k: int, top_p: float, seed: int = 42) -> int: + """Perform inference on token sequence.""" + ntokens = len(tokens) + TokenArrayType = ctypes.c_int64 * ntokens + input_token_array = TokenArrayType(*tokens) - return [] + return LIB_LLAISYS.llaisysQwen2ModelInfer( + self.model, + input_token_array, + ctypes.c_size_t(ntokens), + kcache_array, + vcache_array, + ctypes.c_size_t(past_len), + ctypes.c_float(temperature), + ctypes.c_int(top_k), + ctypes.c_float(top_p), + ctypes.c_int64(seed), + ) \ No newline at end of file diff --git a/python/llaisys/ops.py b/python/llaisys/ops.py index ed0180bc..c2084eab 100644 --- a/python/llaisys/ops.py +++ b/python/llaisys/ops.py @@ -1,6 +1,6 @@ from .libllaisys import LIB_LLAISYS from .tensor import Tensor -from ctypes import c_float, c_int +from ctypes import c_float, c_int, c_int64 class Ops: @@ -53,3 +53,17 @@ def self_attention(attn_val: Tensor, q: Tensor, k: Tensor, v: Tensor, scale: flo @staticmethod def swiglu(out: Tensor, gate: Tensor, up: Tensor): LIB_LLAISYS.llaisysSwiGLU(out.lib_tensor(), gate.lib_tensor(), up.lib_tensor()) + + @staticmethod + def random_sample(sample_idx: Tensor, sample_val: Tensor, logits: Tensor, temperature: float, top_k: int, + top_p: float, seed: int | None = None): + seed_arg = -1 if seed is None else int(seed) + LIB_LLAISYS.llaisysRandomSample( + sample_idx.lib_tensor(), + sample_val.lib_tensor(), + logits.lib_tensor(), + c_float(temperature), + c_int(top_k), + c_float(top_p), + c_int64(seed_arg), + ) diff --git a/python/llaisys/tensor.py b/python/llaisys/tensor.py index 1466d851..5a208ffd 100644 --- a/python/llaisys/tensor.py +++ b/python/llaisys/tensor.py @@ -95,3 +95,14 @@ def slice(self, dim: int, start: int, 
end: int): self._tensor, c_size_t(dim), c_size_t(start), c_size_t(end) ) ) + def reshape(self, *shape: int): + _shape = (c_size_t * len(shape))(*shape) + return Tensor( + tensor=LIB_LLAISYS.tensorReshape(self._tensor, _shape, c_size_t(len(shape))) + ) + def to(self, device: DeviceType, device_id: int = 0): + return Tensor( + tensor=LIB_LLAISYS.tensorTo( + self._tensor, llaisysDeviceType_t(device), c_int(device_id) + ) + ) \ No newline at end of file diff --git a/python/pyproject.toml b/python/llaisyscore/pyproject.toml similarity index 100% rename from python/pyproject.toml rename to python/llaisyscore/pyproject.toml diff --git a/python/setup.cfg b/python/llaisyscore/setup.cfg similarity index 77% rename from python/setup.cfg rename to python/llaisyscore/setup.cfg index b35fc65f..0d8e2175 100644 --- a/python/setup.cfg +++ b/python/llaisyscore/setup.cfg @@ -7,12 +7,20 @@ license = MIT [options] packages = find: +package_dir = + = .. include_package_data = True zip_safe = False install_requires = torch>=2.4.0 transformers accelerate + huggingface-hub + +[options.packages.find] +where = .. +include = + llaisys* [options.package_data] llaisys = diff --git a/python/server-project/pyproject.toml b/python/server-project/pyproject.toml new file mode 100644 index 00000000..8fe2f47a --- /dev/null +++ b/python/server-project/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools>=42", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/python/server-project/setup.cfg b/python/server-project/setup.cfg new file mode 100644 index 00000000..97fe2dca --- /dev/null +++ b/python/server-project/setup.cfg @@ -0,0 +1,22 @@ +[metadata] +name = llaisys-server +version = 0.1.0 +description = Chat server project for LLAISYS +author = Pan Zezhong +license = MIT + +[options] +packages = find: +package_dir = + = .. +include_package_data = True +zip_safe = False +install_requires = + llaisys>=0.1.0 + fastapi + uvicorn + +[options.packages.find] +where = .. 
+include = + server* diff --git a/python/server/README.md b/python/server/README.md new file mode 100644 index 00000000..80a8eba3 --- /dev/null +++ b/python/server/README.md @@ -0,0 +1,68 @@ +# LLAISYS Chat Server (Project #3) + +A minimal Python chat server built on top of `llaisys.models.Qwen2`. + +## Install deps + +```bash +python3 -m pip install fastapi uvicorn +``` + +## Start server + +```bash +cd python +uvicorn server.app:app --host 0.0.0.0 --port 8000 +``` + +Open chat UI: + +```bash +http://127.0.0.1:8000/ +``` + +Optional environment variables: + +- `LLAISYS_MODEL_PATH`: local model path (default: `/home/vankari/code/DeepSeek-R1-Distill-Qwen-1.5B/`; if path does not exist, falls back to HF model) +- `LLAISYS_DEVICE`: `cpu` or `nvidia` (default: `cpu`) +- `LLAISYS_MAX_NEW_TOKENS` (default: `128`) +- `LLAISYS_TOP_K` (default: `50`) +- `LLAISYS_TOP_P` (default: `0.8`) +- `LLAISYS_TEMPERATURE` (default: `0.8`) +- `LLAISYS_USE_CACHE` (default: `true`) +- `LLAISYS_SERVE_MODEL_NAME` (default: `llaisys-qwen2`) + +## API + +### Health + +```bash +curl http://127.0.0.1:8000/healthz +``` + +### Chat completion + +```bash +curl -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "llaisys-qwen2", + "messages": [{"role": "user", "content": "你好,介绍一下你自己"}], + "max_tokens": 64, + "top_k": 50, + "top_p": 0.8, + "temperature": 0.8, + "stream": false + }' +``` + +### Streaming completion (SSE) + +```bash +curl -N -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "讲个笑话"}], + "stream": true + }' +``` diff --git a/python/server/__init__.py b/python/server/__init__.py new file mode 100644 index 00000000..1ef54839 --- /dev/null +++ b/python/server/__init__.py @@ -0,0 +1 @@ +"""LLAISYS chat server package.""" diff --git a/python/server/app.py b/python/server/app.py new file mode 100644 index 00000000..673902b6 --- /dev/null +++ 
b/python/server/app.py @@ -0,0 +1,255 @@ +import json +import re +import time +import uuid +from pathlib import Path +from typing import Dict, Iterator, List, Optional + +from fastapi import FastAPI, HTTPException +from fastapi.responses import FileResponse, StreamingResponse +from pydantic import BaseModel + +from .config import CONFIG +from .engine import ENGINE +from .schemas import ( + ChatCompletionRequest, + ChatCompletionResponse, + ChatChoice, + ChatResponseMessage, + Usage, +) + +app = FastAPI(title="LLAISYS Chat Server", version="0.1.0") +UI_INDEX = Path(__file__).resolve().parent / "ui" / "index.html" + + +class SessionCreateRequest(BaseModel): + session_id: Optional[str] = None + + +@app.get("/") +def chat_ui(): + if not UI_INDEX.exists(): + raise HTTPException(status_code=404, detail="UI page not found") + return FileResponse(UI_INDEX) + + +@app.get("/healthz") +def healthz(): + return {"status": "ok", "device": CONFIG.device, "model": CONFIG.default_model_name} + + +def _chunk_payload(chunk_id: str, model_name: str, content: str, finish_reason=None): + return { + "id": chunk_id, + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": model_name, + "choices": [ + { + "index": 0, + "delta": {"content": content} if content else {}, + "finish_reason": finish_reason, + } + ], + } + +def _normalize_think_text(text: str, prompt_has_think_tag: bool) -> str: + if not text: + return text + has_open = re.search(r"", text) is not None + has_close = re.search(r"", text) is not None + if prompt_has_think_tag and has_close and not has_open: + text = f"{text}" + has_open = True + if prompt_has_think_tag and has_open and not has_close: + text = f"{text}" + return text + + +def _prompt_has_think_tag(messages: List[Dict[str, str]]) -> bool: + prompt = ENGINE._tokenizer.apply_chat_template( + conversation=messages, + add_generation_prompt=True, + tokenize=False, + ) + return "" in prompt or "" in prompt + + +def _filter_newlines(text: str) -> str: 
+ if not text: + return text + return text.replace("\n", "") + + +def _longest_suffix_prefix(value: str, token: str) -> int: + upper = min(len(value), len(token) - 1) + for n in range(upper, 0, -1): + if value.endswith(token[:n]): + return n + return 0 + + +def _normalize_stream_piece(piece: str, prompt_has_think_tag: bool, seen_open_tag: bool): + start_tag = "" + end_tag = "" + source = piece + out_parts = [] + + while source: + start_idx = source.find(start_tag) + end_idx = source.find(end_tag) + + next_idx = -1 + tag_type = "" + if start_idx != -1 and (end_idx == -1 or start_idx < end_idx): + next_idx = start_idx + tag_type = "start" + elif end_idx != -1: + next_idx = end_idx + tag_type = "end" + + if next_idx == -1: + keep = max( + _longest_suffix_prefix(source, start_tag), + _longest_suffix_prefix(source, end_tag), + ) + if keep > 0: + out_parts.append(source[:-keep]) + return "".join(out_parts), source[-keep:], seen_open_tag + out_parts.append(source) + return "".join(out_parts), "", seen_open_tag + + out_parts.append(source[:next_idx]) + source = source[next_idx + (len(start_tag) if tag_type == "start" else len(end_tag)) :] + + if tag_type == "start": + if not seen_open_tag: + out_parts.append(start_tag) + seen_open_tag = True + else: + if prompt_has_think_tag and not seen_open_tag: + out_parts.append(start_tag) + seen_open_tag = True + out_parts.append(end_tag) + + return "".join(out_parts), "", seen_open_tag + + +def _stream_completion(req: ChatCompletionRequest, model_name: str) -> Iterator[str]: + chunk_id = f"chatcmpl-{uuid.uuid4().hex}" + messages = [m.model_dump() for m in req.messages] + prompt_has_think_tag = _prompt_has_think_tag(messages) + seen_open_tag = False + carry = "" + + def flush_payload(content: str) -> Iterator[str]: + content = _filter_newlines(content) + if not content: + return + payload = _chunk_payload(chunk_id, model_name, content) + yield f"data: {json.dumps(payload, ensure_ascii=False)}\n\n" + + if prompt_has_think_tag: + 
seen_open_tag = True + yield from flush_payload("") + + for piece in ENGINE.stream_generate( + messages=messages, + max_tokens=req.max_tokens or CONFIG.default_max_new_tokens, + top_k=req.top_k if req.top_k is not None else CONFIG.default_top_k, + top_p=req.top_p if req.top_p is not None else CONFIG.default_top_p, + temperature=req.temperature if req.temperature is not None else CONFIG.default_temperature, + use_cache=req.use_cache if req.use_cache is not None else CONFIG.default_use_cache, + session_id=req.session_id, + ): + if not piece: + continue + merged = carry + piece + normalized_piece, carry, seen_open_tag = _normalize_stream_piece( + merged, + prompt_has_think_tag, + seen_open_tag, + ) + if normalized_piece: + yield from flush_payload(normalized_piece) + + if carry: + final_piece, _, _ = _normalize_stream_piece(carry, prompt_has_think_tag, seen_open_tag) + if final_piece: + yield from flush_payload(final_piece) + + done_payload = _chunk_payload(chunk_id, model_name, "", finish_reason="stop") + yield f"data: {json.dumps(done_payload, ensure_ascii=False)}\n\n" + yield "data: [DONE]\n\n" + + +@app.post("/v1/chat/completions") +def chat_completions(req: ChatCompletionRequest): + if not req.messages: + raise HTTPException(status_code=400, detail="messages cannot be empty") + + messages = [m.model_dump() for m in req.messages] + model_name = req.model or CONFIG.default_model_name + if req.stream: + return StreamingResponse(_stream_completion(req, model_name), media_type="text/event-stream") + + result = ENGINE.generate( + messages=messages, + max_tokens=req.max_tokens or CONFIG.default_max_new_tokens, + top_k=req.top_k if req.top_k is not None else CONFIG.default_top_k, + top_p=req.top_p if req.top_p is not None else CONFIG.default_top_p, + temperature=req.temperature if req.temperature is not None else CONFIG.default_temperature, + use_cache=req.use_cache if req.use_cache is not None else CONFIG.default_use_cache, + session_id=req.session_id, + ) + + 
completion_text = _normalize_think_text(result["completion_text"], _prompt_has_think_tag(messages)).strip() + completion_text = _filter_newlines(completion_text) + + completion_id = f"chatcmpl-{uuid.uuid4().hex}" + response = ChatCompletionResponse( + id=completion_id, + object="chat.completion", + created=result["created"], + model=model_name, + choices=[ + ChatChoice( + index=0, + message=ChatResponseMessage(role="assistant", content=completion_text), + finish_reason="stop", + ) + ], + usage=Usage( + prompt_tokens=len(result["prompt_ids"]), + completion_tokens=len(result["completion_ids"]), + total_tokens=len(result["prompt_ids"]) + len(result["completion_ids"]), + ), + ) + return response + + +@app.post("/v1/sessions") +def create_session(req: SessionCreateRequest): + session_id = ENGINE.create_session(req.session_id) + return {"session_id": session_id} + + +@app.get("/v1/sessions") +def list_sessions(): + return {"sessions": ENGINE.list_sessions()} + + +@app.delete("/v1/sessions/{session_id}") +def delete_session(session_id: str): + ENGINE.delete_session(session_id) + return {"deleted": session_id} + + +@app.get("/v1/sessions/{session_id}") +def get_session(session_id: str): + try: + messages = ENGINE.get_session_messages(session_id) + except KeyError: + raise HTTPException(status_code=404, detail="session not found") + return {"session_id": session_id, "messages": messages} diff --git a/python/server/config.py b/python/server/config.py new file mode 100644 index 00000000..4a1e316f --- /dev/null +++ b/python/server/config.py @@ -0,0 +1,38 @@ +import os +from pathlib import Path + + +def _env_bool(name: str, default: bool) -> bool: + value = os.getenv(name) + if value is None: + return default + return value.strip().lower() in {"1", "true", "yes", "on"} + + +class ServerConfig: + def __init__(self) -> None: + self.default_local_model_path = "/home/vankari/code/DeepSeek-R1-Distill-Qwen-1.5B/" + self.configured_model_path = os.getenv("LLAISYS_MODEL_PATH", 
self.default_local_model_path) + self.model_path = self._resolve_existing_model_path(self.configured_model_path) + self.device = os.getenv("LLAISYS_DEVICE", "cpu").strip().lower() + self.default_max_new_tokens = int(os.getenv("LLAISYS_MAX_NEW_TOKENS", "128")) + self.default_top_k = int(os.getenv("LLAISYS_TOP_K", "10")) + self.default_top_p = float(os.getenv("LLAISYS_TOP_P", "0.8")) + self.default_temperature = float(os.getenv("LLAISYS_TEMPERATURE", "0.8")) + self.default_use_cache = _env_bool("LLAISYS_USE_CACHE", True) + self.default_model_name = os.getenv("LLAISYS_SERVE_MODEL_NAME", "llaisys-qwen2") + + if self.device not in {"cpu", "nvidia"}: + raise ValueError("LLAISYS_DEVICE must be 'cpu' or 'nvidia'") + + @staticmethod + def _resolve_existing_model_path(path_value: str | None) -> str | None: + if path_value is None: + return None + candidate = Path(path_value).expanduser() + if candidate.exists() and candidate.is_dir(): + return str(candidate) + return None + + +CONFIG = ServerConfig() diff --git a/python/server/engine.py b/python/server/engine.py new file mode 100644 index 00000000..7d18bb28 --- /dev/null +++ b/python/server/engine.py @@ -0,0 +1,100 @@ +import threading +import time +from typing import List, Dict, Any, Iterator, Optional + +import llaisys + +from .config import CONFIG + + +class ChatEngine: + def __init__(self) -> None: + self._lock = threading.Lock() + self._backend = llaisys.backend.InferenceBackend( + model_path=CONFIG.model_path, + device=llaisys.DeviceType.CPU if CONFIG.device == "cpu" else llaisys.DeviceType.NVIDIA, + ) + self._default_session_id = "engine-default" + self._backend.create_session(self._default_session_id) + self._tokenizer = self._backend._tokenizer + + def _reset_default_session(self, messages: List[Dict[str, str]]) -> None: + self._backend.delete_session(self._default_session_id) + self._backend.create_session(self._default_session_id, messages=list(messages)) + + def _ensure_named_session(self, session_id: str, 
messages: List[Dict[str, str]]) -> None: + try: + self._backend.get_session(session_id) + self._backend.replace_messages(session_id, messages) + except KeyError: + self._backend.create_session(session_id, messages=list(messages)) + + def _build_prompt(self, messages: List[Dict[str, str]]) -> str: + return self._tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False) + + def generate(self, messages: List[Dict[str, str]], max_tokens: int, top_k: int, top_p: float, temperature: float, use_cache: bool, + session_id: Optional[str] = None) -> Dict[str, Any]: + with self._lock: + active_session_id = session_id or self._default_session_id + if session_id: + self._ensure_named_session(active_session_id, messages) + else: + self._reset_default_session(messages) + result = self._backend.generate( + session_id=active_session_id, + max_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=use_cache, + append_assistant_message=True, + ) + + return { + "prompt_ids": result["prompt_ids"], + "completion_ids": result["completion_ids"], + "completion_text": result["completion_text"], + "created": int(time.time()), + } + + def stream_generate(self, messages: List[Dict[str, str]], max_tokens: int, top_k: int, top_p: float, temperature: float, use_cache: bool, + session_id: Optional[str] = None) -> Iterator[str]: + with self._lock: + active_session_id = session_id or self._default_session_id + if session_id: + self._ensure_named_session(active_session_id, messages) + else: + self._reset_default_session(messages) + for token_id in self._backend.stream_generate( + session_id=active_session_id, + max_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + temperature=temperature, + use_cache=use_cache, + append_assistant_message=True, + ): + yield token_id + + def get_session_messages(self, session_id: str) -> List[Dict[str, str]]: + with self._lock: + state = self._backend.get_session(session_id) + return 
list(state.messages) + + def create_session(self, session_id: Optional[str] = None) -> str: + with self._lock: + state = self._backend.create_session(session_id=session_id) + return state.session_id + + def list_sessions(self) -> List[str]: + with self._lock: + return [item.session_id for item in self._backend.list_sessions()] + + def delete_session(self, session_id: str) -> None: + if session_id == self._default_session_id: + return + with self._lock: + self._backend.delete_session(session_id) + + +ENGINE = ChatEngine() diff --git a/python/server/schemas.py b/python/server/schemas.py new file mode 100644 index 00000000..682ad8d5 --- /dev/null +++ b/python/server/schemas.py @@ -0,0 +1,45 @@ +from typing import List, Optional +from pydantic import BaseModel, Field + + +class ChatMessage(BaseModel): + role: str + content: str + + +class ChatCompletionRequest(BaseModel): + model: Optional[str] = None + session_id: Optional[str] = None + messages: List[ChatMessage] + max_tokens: Optional[int] = Field(default=None, ge=1) + top_k: Optional[int] = None + top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0) + temperature: Optional[float] = Field(default=None, gt=0.0) + stream: bool = False + use_cache: Optional[bool] = None + + +class ChatResponseMessage(BaseModel): + role: str + content: str + + +class ChatChoice(BaseModel): + index: int + message: ChatResponseMessage + finish_reason: str + + +class Usage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class ChatCompletionResponse(BaseModel): + id: str + object: str + created: int + model: str + choices: List[ChatChoice] + usage: Usage diff --git a/python/server/ui/index.html b/python/server/ui/index.html new file mode 100644 index 00000000..79a2a5a7 --- /dev/null +++ b/python/server/ui/index.html @@ -0,0 +1,783 @@ + + + + + + LLAISYS Chat UI + + + + + + + LLAISYS Chat + + + 新会话 + 删除会话 + + 就绪 + + + + + + 发送 + 清空对话 + + + 流式显示 + + + + + + + + + diff --git a/report/image.png 
b/report/image.png new file mode 100644 index 00000000..c7d534df Binary files /dev/null and b/report/image.png differ diff --git a/report/report_chatbot.md b/report/report_chatbot.md new file mode 100644 index 00000000..77b19092 --- /dev/null +++ b/report/report_chatbot.md @@ -0,0 +1,146 @@ +# 本项目基于llaisys实现了一个聊天AI(方向3) +## 项目依赖(与llaisys保持一致) +- 编译工具:[Xmake](https://xmake.io/) +- C++编译器:MSVC(Windows)或Clang或GCC +- Python >= 3.9 +## 项目运行 +- fork 本仓库 +- 编译运行 + ```bash + # 编译c++代码 + xmake + # 安装llaisys共享库 + xmake install + # 安装core包 + python3 -m pip install -e ./python/llaisyscore --user --break-system-packages + # 安装server包 + python3 -m pip install -e ./python/server-project --user --break-system-packages + ``` +- 从[huggingface](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)手动下载权重并指定权重加载目录(或使用snapshot download下载) +## 实现功能 +1. 实现随机采样算子randomsample(sample_val, sample_idx ,logits, temperature, top_k, top_p) + - temperature:按如下公式 $$ y = t-softmax(x,temperature),\ y_i = \frac{\frac{x_i}{temperature}}{\sum_j \frac{x_j}{temperature}} $$ scale logits的系数 + - top_k:按概率排序(t-softmax归一后的输出)后选择最高k个token作为candidate + - top_p:经过top_k选取后,再对logits进行一次普通softmax,指定一个累积概率的threshold,累积概率低于这个threshold的token选入candidate + - output:对经过top_p选取后的logits再使用一次普通softmax,得到最后的token分布,按照分布采样的结果的val和idx写入相应张量 + +2. 基于llaisys实现 Project #3 聊天服务(FastAPI + Web UI) + - 后端入口:`python/server/app.py` + - 推理引擎:`python/server/engine.py` + - 会话与缓存后端:`python/llaisys/backend/inference_backend.py` + - KV-Cache池:`python/llaisys/models/kvcachepool.py` + +## project3 核心实现说明 +1. OpenAI风格 chat-completion API + - `POST /v1/chat/completions`:支持非流式与流式(SSE)返回 + - `GET /healthz`:健康检查 + - `GET /`:返回前端聊天页面 + +2. 会话管理能力 + - `POST /v1/sessions`:创建会话 + - `GET /v1/sessions`:列出会话 + - `GET /v1/sessions/{session_id}`:读取会话消息 + - `DELETE /v1/sessions/{session_id}`:删除会话 + - `InferenceBackend`中维护`SessionState`,支持`append_message`和`replace_messages`,满足“编辑历史并重生成”需求 + +3. 
KVCache复用与前缀匹配 + - 基于trie + session slots实现的Cache pool,前者维护session的tokens prefix,后者维护session的KVCache(并非ref count based的trie,所以prepare的时候会额外多一个复制prefix的开销) + - 在`KVCachePool.prepare_session`阶段计算当前prompt与历史缓存的最长公共前缀 + - 复用可复用前缀,减少重复prefill + - 生成完成后通过`commit_session`提交最新token历史 + - 删除会话时同步释放对应cache资源 + +4. 非流式与流式推理路径 + - 非流式:调用`model.generate(...)`,按输入长度切分`prompt_ids`和`completion_ids` + - 流式:使用`model._infer_tokens(...)`逐token迭代 + - 首次执行prefill + - 后续每步单token decode + - 过滤EOS,'\n'等特殊token,避免前端显示结束标记 + +5. 前端UI能力(`python/server/ui/index.html`) + - 多会话切换 + - 用户消息“编辑重生成” + - 流式开关 + - 对`...`进行分段展示(思考区/回复区) + +6. 输出规范化 + - 对流式与非流式输出进行一致性处理 + - 处理``标签开闭不对称问题 + - 过滤换行符(`\n`)以匹配当前前端展示要求 + +## 测试与验证 + - 随机采样算子测试文件位于`test/ops/random_sample.py`,主要对random_sample的极端p值(p<=0, p>=1),极端k值(k=1),以及(固定temperature,top_k,top_p组合采样,与pytorch实现进行容差校验) + - 后端会话与流式行为:`test/test_inference_backend.py` + - 覆盖会话创建/删除/替换历史 + - 覆盖流式路径“首步prefill + 后续单token解码” + - 服务端接口行为:`test/test_server.py` + - 覆盖`healthz`、`/`、`/v1/chat/completions`(流式/非流式) + - 覆盖`/v1/sessions`增删查 + - 配置行为:`test/test_server_config.py` + - 覆盖模型路径环境变量存在/不存在时的分支 + +## 演示效果 +1. 启动服务 + ```bash + cd ./python + python3 -m uvicorn server.app:app --host 0.0.0.0 --port {port} + ``` +2. 
按下面的调用示例或者直接在webgui进行交互 + - 非流式调用示例 + ```bash + curl -X POST http://127.0.0.1:{port}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "你好,介绍一下你自己"}], + "stream": false, + "max_tokens": 64 + }' + ``` + + - 流式调用示例(SSE) + ```bash + curl -N -X POST http://127.0.0.1:{port}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [{"role": "user", "content": "讲个笑话"}], + "stream": true + }' + ``` + + - 会话接口示例 + ```bash + # 创建会话 + curl -X POST http://127.0.0.1:{port}/v1/sessions -H "Content-Type: application/json" -d '{"session_id":"demo-session"}' + # 查询会话列表 + curl http://127.0.0.1:{port}/v1/sessions + # 删除会话 + curl -X DELETE http://127.0.0.1:{port}/v1/sessions/demo-session + # 非流式 chat-completions(指定 session_id) + curl -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "session_id":"{session_id }", + "messages":[{"role":"user","content":"你好,介绍一下你自己"}], + "stream":false, + "max_tokens":64 + }' + # 流式 chat-completions(指定 session_id) + curl -N -X POST http://127.0.0.1:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "session_id":"demo-session-001", + "messages":[{"role":"user","content":"讲个笑话"}], + "stream":true, + "max_tokens":64 + }' + ``` +3. webui展示 +  + +## 说明 +- 当前项目3实现聚焦“单实例可用聊天服务 + 会话管理 + KV缓存复用 + 流式输出”。未实现多用户管理及连续批处理等功能。 +## todo +- 项目4跟进 +- 项目1跟进 +- CUDA linear算子优化 + diff --git a/report/report_cuda.md b/report/report_cuda.md new file mode 100644 index 00000000..4427368a --- /dev/null +++ b/report/report_cuda.md @@ -0,0 +1,457 @@ +# LLAISYS CUDA 适配报告 + +## 1. 项目目标与范围 + +本报告记录了 LLAISYS 在 NVIDIA CUDA 平台上的适配工作,覆盖内容包括: + +- 前置依赖安装 +- CUDA 构建与安装流程 +- CUDA Runtime API 与算子实现 +- Qwen2 GPU 推理接入与 CPU 兼容性说明 +- 复现与验证方法 + +本次适配遵循原则: + +1. 不改变现有 CPU 推理接口语义。 +2. CUDA 路径以算子原生 kernel 为主,不采用“CPU 计算后拷回 GPU”的默认策略。 +3. Python 层 API 与现有调用方式保持兼容。 + +--- + +## 2. 
前置依赖安装 + +### 2.1 系统与工具链 + +建议环境(本次验证环境): + +- Linux x86_64 +- NVIDIA GPU(A100,SM80) +- CUDA Toolkit 12.x(本仓库验证为 12.8) +- GCC / G++(支持 C++17) +- Xmake +- Python 3.9+ + +### 2.2 Python 依赖 + +在仓库根目录执行: + +```bash +python3 -m pip install -e ./python/llaisyscore --user --break-system-packages +python3 -m pip install -e ./python/server-project --user --break-system-packages +``` + +如需推理测试(含 HF 对照),还需确保环境中可用: + +- torch +- transformers +- accelerate +- safetensors +- huggingface-hub + +--- + +## 3. 编译与安装 + +### 3.1 CUDA 开关编译 + +```bash +xmake f --nv-gpu=y -cv +xmake -v +xmake install +``` + +其中 `xmake install` 会将 `libllaisys.so` 同步到 `python/llaisys/libllaisys/`。 + +### 3.2 构建系统关键配置 + +文件:`xmake.lua` + +```lua +option("nv-gpu") + set_default(false) + set_showmenu(true) + set_description("Whether to compile implementations for Nvidia GPU") +option_end() + +if has_config("nv-gpu") then + add_defines("ENABLE_NVIDIA_API") + includes("xmake/nvidia.lua") +end +``` + +文件:`xmake/nvidia.lua` + +```lua +target("llaisys-device-nvidia") + set_kind("static") + set_languages("cxx17") + set_warnings("all", "error") + set_policy("build.cuda.devlink", true) + + add_files("../src/device/nvidia/*.cu") + + add_cugencodes("compute_80") + add_cuflags("--use_fast_math") + add_cuflags("-Xcompiler=-fPIC") + add_culdflags("-Xcompiler=-fPIC") + + add_links("cudart") + add_linkdirs("/usr/local/cuda/lib64") + +target_end() +``` + +说明: + +- `build.cuda.devlink` + `-Xcompiler=-fPIC` 解决了 CUDA device link 与共享库链接问题。 +- `compute_80` 与当前测试硬件匹配。 + +--- + +## 4. 
CUDA Runtime API 实现 + +文件:`src/device/nvidia/nvidia_runtime_api.cu` + +主要实现了: + +- `getDeviceCount` / `setDevice` +- stream 创建、销毁、同步 +- `cudaMalloc/cudaFree`、`cudaMallocHost/cudaFreeHost` +- `memcpySync/memcpyAsync` 的 H2H/H2D/D2H/D2D 映射 + +代码片段(仓库实现): + +```cpp +int getDeviceCount() { + int count = 0; + CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; +} + +void *mallocDevice(size_t size) { + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; +} + +void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { + cudaMemcpyKind cuda_kind = cudaMemcpyDefault; + switch (kind) { + case LLAISYS_MEMCPY_H2H: + cuda_kind = cudaMemcpyHostToHost; + break; + case LLAISYS_MEMCPY_H2D: + cuda_kind = cudaMemcpyHostToDevice; + break; + case LLAISYS_MEMCPY_D2H: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case LLAISYS_MEMCPY_D2D: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + ASSERT(false, "Unsupported memcpy kind in CUDA memcpySync"); + } + CUDA_CHECK(cudaMemcpy(dst, src, size, cuda_kind)); +} +``` + +--- + +## 5. 
CUDA 算子实现 + +### 5.1 通用 dtype 读写工具 + +文件:`src/utils/cuda_type_utils.cuh` + +该工具统一支持 F32/F16/BF16 的 device 端读取与写回,避免各算子重复实现类型分发。 + +```cpp +__device__ __forceinline__ float load_as_float(const void *ptr, size_t idx, llaisysDataType_t dtype) { + if (dtype == LLAISYS_DTYPE_F32) return load_as_float_f32(ptr, idx); + if (dtype == LLAISYS_DTYPE_F16) return load_as_float_f16(ptr, idx); + return load_as_float_bf16(ptr, idx); +} +``` + +### 5.2 add(F32/F16/BF16) + +文件:`src/ops/add/cuda/add_cuda.cu` + +```cpp +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) return; + c[idx] = a[idx] + b[idx]; +} +``` + +对于 FP16/BF16 使用对应半精度运算指令。 + +### 5.3 embedding + +文件:`src/ops/embedding/cuda/embedding_cuda.cu` + +```cpp +__global__ void embedding_kernel(void *out, const int64_t *index, const void *weight, + llaisysDataType_t dtype, size_t num, size_t dim) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = num * dim; + if (idx >= total) return; + + size_t row = idx / dim; + size_t col = idx % dim; + int64_t token = index[row]; + size_t src_idx = static_cast(token) * dim + col; + float val = load_as_float(weight, src_idx, dtype); + store_from_float(out, idx, val, dtype); +} +``` + +同时在 `src/ops/embedding/op.cpp` 中补充了 GPU index 越界校验时的 D2H 安全路径(避免直接把 device 指针当 host 指针解引用)。 + +### 5.4 linear + +文件:`src/ops/linear/cuda/linear_cuda.cu` + +```cpp +__global__ void linear_kernel(void *out, const void *in, const void *weight, const void *bias, + llaisysDataType_t type, size_t batch_size, size_t in_features, + size_t out_features, bool has_bias) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch_size * out_features; + if (idx >= total) return; + + size_t b = idx / out_features; + size_t o = idx % out_features; + + float sum = 0.0f; + for (size_t i = 0; i < in_features; ++i) { + float x = load_as_float(in, b * in_features + i, type); + float w 
= load_as_float(weight, o * in_features + i, type); + sum += x * w; + } + if (has_bias) sum += load_as_float(bias, o, type); + store_from_float(out, idx, sum, type); +} +``` + +### 5.5 argmax + +文件:`src/ops/argmax/cuda/argmax_cuda.cu` + +```cpp +__global__ void argmax_kernel(int64_t *mi, void *mv, const void *v, llaisysDataType_t type, size_t numel) { + if (blockIdx.x != 0 || threadIdx.x != 0) return; + + size_t best_idx = 0; + float best_val = load_as_float(v, 0, type); + for (size_t i = 1; i < numel; ++i) { + float cur = load_as_float(v, i, type); + if (cur > best_val) { + best_val = cur; + best_idx = i; + } + } + mi[0] = static_cast(best_idx); + store_from_float(mv, 0, best_val, type); +} +``` + +### 5.6 rms_norm + +文件:`src/ops/rms_norm/cuda/rms_norm_cuda.cu` + +```cpp +__global__ void rms_norm_kernel(void *out, const void *in, const void *weight, + llaisysDataType_t type, size_t feature_dim, float eps) { + size_t row = blockIdx.x; + extern __shared__ float sdata[]; + + float local_sum = 0.0f; + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float val = load_as_float(in, row * feature_dim + col, type); + local_sum += val * val; + } + sdata[threadIdx.x] = local_sum; + __syncthreads(); + + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + __syncthreads(); + } + + float inv_rms = rsqrtf(sdata[0] / static_cast(feature_dim) + eps); + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float x = load_as_float(in, row * feature_dim + col, type); + float w = load_as_float(weight, col, type); + store_from_float(out, row * feature_dim + col, x * w * inv_rms, type); + } +} +``` + +### 5.7 rope + +文件:`src/ops/rope/cuda/rope_cuda.cu` + +```cpp +__global__ void rope_kernel(void *out, const void *in, const int64_t *pos_ids, llaisysDataType_t type, + size_t seq_len, size_t n_heads, size_t head_dim, float theta) { + // ... 
计算 phi/cos/sin 后旋转写回 +} +``` + +### 5.8 self_attention + +文件:`src/ops/self_attention/cuda/self_attention_cuda.cu` + +```cpp +__global__ void self_attention_kernel(void *attn_val, const void *q, const void *k, const void *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, + size_t nh, size_t nkvh, size_t d, size_t dv, float scale) { + // 单 kernel 内完成 causal masked attention 的分数、归一化与加权求和 +} +``` + +### 5.9 swiglu + +文件:`src/ops/swiglu/cuda/swiglu_cuda.cu` + +```cpp +__global__ void swiglu_kernel(void *out, const void *gate, const void *up, + llaisysDataType_t type, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) return; + float g = load_as_float(gate, idx, type); + float u = load_as_float(up, idx, type); + float sig = g / (1.0f + expf(-g)); + store_from_float(out, idx, u * sig, type); +} +``` + +### 5.10 random_sample(temperature/top-k/top-p) + +文件:`src/ops/random_sample/cuda/random_sample_cuda.cu` + +实现流程: + +1. logits 温度缩放 +2. device 侧排序(thrust) +3. 按 top-k/top-p 截断 +4. 使用 `curand` 采样 + +```cpp +curandStatePhilox4_32_10_t rng_state; +curand_init(seed, 0ULL, 0ULL, &rng_state); +float r = curand_uniform(&rng_state); +``` + +并通过 host 侧原子计数 + 时间戳组合 seed,避免固定采样序列。 + +--- + +## 6. Qwen2 CUDA 推理支持 + +### 6.1 Python 侧模型设备接入 + +文件:`python/llaisys/models/qwen2.py` + +- 支持 `DeviceType.NVIDIA` 初始化 +- CUDA 路径权重 dtype 使用 BF16 +- 权重加载时保持 host tensor,使用 `tensorLoad` 进行 H2D + +关键代码: + +```python +def maybe_cast_tensor(tensor): + if self.device == DeviceType.CPU: + return tensor.to(torch.float32).contiguous() + elif self.device == DeviceType.NVIDIA: + return tensor.to(torch.bfloat16).contiguous() + return tensor +``` + +此处避免 `.cuda()`,防止把 device 指针走到 `tensorLoad(H2D)` 的 host 输入路径造成不兼容。 + +### 6.2 端到端验证结果 + +已通过: + +- `python test/test_infer.py --device nvidia --max_steps 128` +- `python test/test_infer.py --device nvidia --max_steps 8 --test` + +其中 `--test` 模式已与 HF 在确定性配置下对齐并 `Test passed`。 + +--- + +## 7. 
复现方法(可直接执行) + +### 7.1 编译与安装 + +```bash +xmake f --nv-gpu=y -cv +xmake -v +xmake install +``` + +### 7.2 运行时测试 + +```bash +python test/test_runtime.py --device nvidia +``` + +### 7.3 算子测试(NVIDIA) + +```bash +for f in test/ops/*.py; do + echo "===== RUN $f --device nvidia =====" + python "$f" --device nvidia + echo + done +``` + +### 7.4 Tensor 双设备迁移补充测试 + +```bash +python test/test_tensor.py --device nvidia +python test/test_tensor_dual_device.py --device_id 0 +``` + +### 7.5 Qwen2 推理测试 + +```bash +python test/test_infer.py --device nvidia --max_steps 128 +python test/test_infer.py --device nvidia --max_steps 8 --test +``` + +--- + +## 8. 兼容性说明 + +1. CPU 路径保持可用,CUDA 仅在 `--nv-gpu=y` + `ENABLE_NVIDIA_API` 条件下启用。 +2. Python API 兼容既有调用方式,不要求业务侧改接口。 +3. 采样接口未新增外部 seed 参数;CUDA 侧 seed 为实现细节,不影响现有测试接口。 + +--- + +## 9. 已知限制与后续优化方向 + +1. 当前部分 CUDA kernel 仍是基础实现(可进一步做块内优化、向量化、融合)。 +2. 与 HF 相比 decode 吞吐仍有差距,后续可优先优化: + - 减少每 token 动态分配 + - 优化采样(减少全排序开销) + - 注意力 kernel 融合/flash 化 + - 进一步减少同步与跨设备拷贝 + +--- + +## 10. 本次交付结论 + +本次已经完成从 Runtime 到算子到模型推理的 CUDA 全链路接入: + +- 构建可编译、可安装 +- `test_runtime` 与 `test/ops` 在 NVIDIA 设备通过 +- `Qwen2` 在 NVIDIA 设备可完成推理,并通过 `test_infer --test` 的一致性验证 + +即:LLAISYS 已具备可复现的 CUDA 推理能力,且与 CPU 实现保持接口兼容。 diff --git a/src/core/context/context.cpp b/src/core/context/context.cpp index 44894b9e..63756fab 100644 --- a/src/core/context/context.cpp +++ b/src/core/context/context.cpp @@ -52,7 +52,7 @@ Context::~Context() { void Context::setDevice(llaisysDeviceType_t device_type, int device_id) { // If doest not match the current runtime. 
if (_current_runtime == nullptr || _current_runtime->deviceType() != device_type || _current_runtime->deviceId() != device_id) { - auto runtimes = _runtime_map[device_type]; + auto &runtimes = _runtime_map[device_type]; CHECK_ARGUMENT((size_t)device_id < runtimes.size() && device_id >= 0, "invalid device id"); if (_current_runtime != nullptr) { _current_runtime->_deactivate(); diff --git a/src/device/nvidia/nvidia_runtime_api.cu b/src/device/nvidia/nvidia_runtime_api.cu index cab92826..c99b886f 100644 --- a/src/device/nvidia/nvidia_runtime_api.cu +++ b/src/device/nvidia/nvidia_runtime_api.cu @@ -1,56 +1,119 @@ #include "../runtime_api.hpp" +#include + #include #include +#define CUDA_CHECK(call) \ + do { \ + cudaError_t err__ = (call); \ + ASSERT(err__ == cudaSuccess, "CUDA Runtime API call failed"); \ + } while (0) + namespace llaisys::device::nvidia { namespace runtime_api { int getDeviceCount() { - TO_BE_IMPLEMENTED(); + int count = 0; + CUDA_CHECK(cudaGetDeviceCount(&count)); + return count; } -void setDevice(int) { - TO_BE_IMPLEMENTED(); +void setDevice(int device) { + CUDA_CHECK(cudaSetDevice(device)); } void deviceSynchronize() { - TO_BE_IMPLEMENTED(); + CUDA_CHECK(cudaDeviceSynchronize()); } llaisysStream_t createStream() { - TO_BE_IMPLEMENTED(); + cudaStream_t stream = nullptr; + CUDA_CHECK(cudaStreamCreate(&stream)); + return reinterpret_cast(stream); } void destroyStream(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (!stream) { + return; + } + CUDA_CHECK(cudaStreamDestroy(reinterpret_cast(stream))); } void streamSynchronize(llaisysStream_t stream) { - TO_BE_IMPLEMENTED(); + if (!stream) { + CUDA_CHECK(cudaDeviceSynchronize()); + return; + } + CUDA_CHECK(cudaStreamSynchronize(reinterpret_cast(stream))); } void *mallocDevice(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + CUDA_CHECK(cudaMalloc(&ptr, size)); + return ptr; } void freeDevice(void *ptr) { - TO_BE_IMPLEMENTED(); + if (!ptr) { + return; + } + CUDA_CHECK(cudaFree(ptr)); } 
void *mallocHost(size_t size) { - TO_BE_IMPLEMENTED(); + void *ptr = nullptr; + CUDA_CHECK(cudaMallocHost(&ptr, size)); + return ptr; } void freeHost(void *ptr) { - TO_BE_IMPLEMENTED(); + if (!ptr) { + return; + } + CUDA_CHECK(cudaFreeHost(ptr)); } void memcpySync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); + cudaMemcpyKind cuda_kind = cudaMemcpyDefault; + switch (kind) { + case LLAISYS_MEMCPY_H2H: + cuda_kind = cudaMemcpyHostToHost; + break; + case LLAISYS_MEMCPY_H2D: + cuda_kind = cudaMemcpyHostToDevice; + break; + case LLAISYS_MEMCPY_D2H: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case LLAISYS_MEMCPY_D2D: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + ASSERT(false, "Unsupported memcpy kind in CUDA memcpySync"); + } + CUDA_CHECK(cudaMemcpy(dst, src, size, cuda_kind)); } -void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind) { - TO_BE_IMPLEMENTED(); +void memcpyAsync(void *dst, const void *src, size_t size, llaisysMemcpyKind_t kind, llaisysStream_t stream) { + cudaMemcpyKind cuda_kind = cudaMemcpyDefault; + switch (kind) { + case LLAISYS_MEMCPY_H2H: + cuda_kind = cudaMemcpyHostToHost; + break; + case LLAISYS_MEMCPY_H2D: + cuda_kind = cudaMemcpyHostToDevice; + break; + case LLAISYS_MEMCPY_D2H: + cuda_kind = cudaMemcpyDeviceToHost; + break; + case LLAISYS_MEMCPY_D2D: + cuda_kind = cudaMemcpyDeviceToDevice; + break; + default: + ASSERT(false, "Unsupported memcpy kind in CUDA memcpyAsync"); + } + CUDA_CHECK(cudaMemcpyAsync(dst, src, size, cuda_kind, reinterpret_cast(stream))); } static const LlaisysRuntimeAPI RUNTIME_API = { diff --git a/src/llaisys/ops.cc b/src/llaisys/ops.cc index c99fbc32..ddcc9772 100644 --- a/src/llaisys/ops.cc +++ b/src/llaisys/ops.cc @@ -7,6 +7,7 @@ #include "../ops/embedding/op.hpp" #include "../ops/linear/op.hpp" #include "../ops/rearrange/op.hpp" +#include "../ops/random_sample/op.hpp" #include "../ops/rms_norm/op.hpp" #include 
"../ops/rope/op.hpp" #include "../ops/self_attention/op.hpp" @@ -23,7 +24,10 @@ __C { llaisys::ops::embedding(out->tensor, index->tensor, weight->tensor); } void llaisysLinear(llaisysTensor_t out, llaisysTensor_t in, llaisysTensor_t weight, llaisysTensor_t bias) { - llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); + if(bias == nullptr) { + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, nullptr); + }else + llaisys::ops::linear(out->tensor, in->tensor, weight->tensor, bias->tensor); } void llaisysRearrange(llaisysTensor_t out, llaisysTensor_t in) { llaisys::ops::rearrange(out->tensor, in->tensor); @@ -40,4 +44,9 @@ __C { void llaisysSwiGLU(llaisysTensor_t out, llaisysTensor_t gate, llaisysTensor_t up) { llaisys::ops::swiglu(out->tensor, gate->tensor, up->tensor); } + void llaisysRandomSample(llaisysTensor_t sample_idx, llaisysTensor_t sample_val, llaisysTensor_t logits, + float temperature, int top_k, float top_p, int64_t seed) { + llaisys::ops::random_sample(sample_idx->tensor, sample_val->tensor, logits->tensor, temperature, top_k, top_p, + seed); + } } diff --git a/src/llaisys/qwen2.cc b/src/llaisys/qwen2.cc new file mode 100644 index 00000000..441c8d69 --- /dev/null +++ b/src/llaisys/qwen2.cc @@ -0,0 +1,510 @@ +#include "llaisys/models/qwen2.h" +#include "llaisys/ops.h" +#include "llaisys_tensor.hpp" + +#include +#include +#include +#include +#include + + +__C { + struct LlaisysQwen2Model *llaisysQwen2ModelCreate(const LlaisysQwen2Meta *meta, llaisysDeviceType_t device, int *device_ids, int ndevice) { + // Validate input parameters + if (!meta || !device_ids || ndevice <= 0) { + std::cerr << "Invalid parameters for Qwen2 model creation" << std::endl; + return nullptr; + } + + // Allocate main model structure + auto model = static_cast(std::calloc(1, sizeof(LlaisysQwen2Model))); + if (!model) { + std::cerr << "Failed to allocate LlaisysQwen2Model" << std::endl; + return nullptr; + } + + // Initialize metadata + 
model->meta = static_cast(std::malloc(sizeof(LlaisysQwen2Meta))); + if (!model->meta) { + std::cerr << "Failed to allocate model metadata" << std::endl; + free(model); + return nullptr; + } + std::memcpy(model->meta, meta, sizeof(LlaisysQwen2Meta)); + + // Initialize weights structure + model->weights = static_cast(std::calloc(1, sizeof(LlaisysQwen2Weights))); + if (!model->weights) { + std::cerr << "Failed to allocate model weights" << std::endl; + free(model->meta); + free(model); + return nullptr; + } + + // Initialize device information + model->device = device; + model->ndevice = ndevice; + model->device_ids = static_cast(std::malloc(sizeof(int) * ndevice)); + if (!model->device_ids) { + std::cerr << "Failed to allocate device IDs" << std::endl; + free(model->weights); + free(model->meta); + free(model); + return nullptr; + } + std::memcpy(model->device_ids, device_ids, sizeof(int) * ndevice); + + + + // Input Embedding + size_t shape_in_embed[2] = { meta->voc, meta->hs }; + model->weights->in_embed = tensorCreate(shape_in_embed, 2, meta->dtype, device, device_ids[0]); + + // Helper functions for tensor array allocation + auto alloc_layer_array_2d = [&](llaisysTensor_t *&ptr, size_t dim0, size_t dim1) -> bool { + ptr = static_cast(std::malloc(sizeof(llaisysTensor_t) * meta->nlayer)); + if (!ptr) return false; + + for (size_t i = 0; i < meta->nlayer; ++i) { + size_t shape[2] = { dim0, dim1 }; + ptr[i] = tensorCreate(shape, 2, meta->dtype, model->device, model->device_ids[0]); + if (!ptr[i]) { + // Clean up previously allocated tensors on failure + for (size_t j = 0; j < i; ++j) { + tensorDestroy(ptr[j]); + } + free(ptr); + ptr = nullptr; + return false; + } + } + return true; + }; + + auto alloc_layer_array_1d = [&](llaisysTensor_t *&ptr, size_t dim0) -> bool { + ptr = static_cast(std::malloc(sizeof(llaisysTensor_t) * meta->nlayer)); + if (!ptr) return false; + + for (size_t i = 0; i < meta->nlayer; ++i) { + size_t shape[1] = { dim0 }; + ptr[i] = 
tensorCreate(shape, 1, meta->dtype, model->device, model->device_ids[0]); + if (!ptr[i]) { + // Clean up previously allocated tensors on failure + for (size_t j = 0; j < i; ++j) { + tensorDestroy(ptr[j]); + } + free(ptr); + ptr = nullptr; + return false; + } + } + return true; + }; + + // Self-Attention + alloc_layer_array_1d(model->weights->attn_norm_w, meta->hs); // [1536] + alloc_layer_array_2d(model->weights->attn_q_w, meta->hs, meta->nh * meta->dh); // [1536, 1536] + alloc_layer_array_1d(model->weights->attn_q_b, meta->nh * meta->dh); // [1536] + alloc_layer_array_2d(model->weights->attn_k_w, meta->nkvh * meta->dh, meta->hs); // [256, 1536] + alloc_layer_array_1d(model->weights->attn_k_b, meta->nkvh * meta->dh); // [256] + alloc_layer_array_2d(model->weights->attn_v_w, meta->nkvh * meta->dh, meta->hs); // [256, 1536] + alloc_layer_array_1d(model->weights->attn_v_b, meta->nkvh * meta->dh); // [256] + alloc_layer_array_2d(model->weights->attn_o_w, meta->nh * meta->dh, meta->hs); // [1536, 1536] + + // MLP + alloc_layer_array_1d(model->weights->mlp_norm_w, meta->hs); // [1536] + alloc_layer_array_2d(model->weights->mlp_gate_w, meta->di, meta->hs); // [8960, 1536] + alloc_layer_array_2d(model->weights->mlp_up_w, meta->di, meta->hs); // [8960, 1536] + alloc_layer_array_2d(model->weights->mlp_down_w, meta->hs, meta->di); // [1536, 8960] + + // Output Layer Norm + size_t shape_out_norm[1] = { meta->hs }; + model->weights->out_norm_w = tensorCreate(shape_out_norm, 1, meta->dtype, model->device, model->device_ids[0]); + + // Output Embedding + size_t shape_out_embed[2] = { meta->voc, meta->hs }; + model->weights->out_embed = tensorCreate(shape_out_embed, 2, meta->dtype, device, device_ids[0]); + + return model; + } + + void llaisysQwen2ModelDestroy(struct LlaisysQwen2Model * model) { + if (!model) return; + + tensorDestroy(model->weights->in_embed); + tensorDestroy(model->weights->out_embed); + tensorDestroy(model->weights->out_norm_w); + + // attn_norm_w (1d tensor 
array) + if (model->weights->attn_norm_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_norm_w[i]); + } + free(model->weights->attn_norm_w); + } + + // attn_q_w (2d tensor array) + if (model->weights->attn_q_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_q_w[i]); + } + free(model->weights->attn_q_w); + } + + // attn_q_b (1d tensor array) + if (model->weights->attn_q_b) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_q_b[i]); + } + free(model->weights->attn_q_b); + } + + // attn_k_w (2d tensor array) + if (model->weights->attn_k_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_k_w[i]); + } + free(model->weights->attn_k_w); + } + + // attn_k_b (1d tensor array) + if (model->weights->attn_k_b) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_k_b[i]); + } + free(model->weights->attn_k_b); + } + + // attn_v_w (2d tensor array) + if (model->weights->attn_v_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_v_w[i]); + } + free(model->weights->attn_v_w); + } + + // attn_v_b (1d tensor array) + if (model->weights->attn_v_b) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_v_b[i]); + } + free(model->weights->attn_v_b); + } + + // attn_o_w (2d tensor array) + if (model->weights->attn_o_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->attn_o_w[i]); + } + free(model->weights->attn_o_w); + } + + // mlp_norm_w (1d tensor array) + if (model->weights->mlp_norm_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->mlp_norm_w[i]); + } + free(model->weights->mlp_norm_w); + } + + // mlp_gate_w (2d tensor array) + if (model->weights->mlp_gate_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + 
tensorDestroy(model->weights->mlp_gate_w[i]); + } + free(model->weights->mlp_gate_w); + } + + // mlp_up_w (2d tensor array) + if (model->weights->mlp_up_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->mlp_up_w[i]); + } + free(model->weights->mlp_up_w); + } + + // mlp_down_w (2d tensor array) + if (model->weights->mlp_down_w) { + for (size_t i = 0; i < model->meta->nlayer; ++i) { + tensorDestroy(model->weights->mlp_down_w[i]); + } + free(model->weights->mlp_down_w); + } + + if (model->device_ids) { + free(model->device_ids); + } + + if (model->meta) { + free(model->meta); + } + + free(model); + } + + struct LlaisysQwen2Weights *llaisysQwen2ModelWeights(struct LlaisysQwen2Model * model) { + return model->weights; + } + + int64_t llaisysQwen2ModelInfer(struct LlaisysQwen2Model * model, int64_t * token_ids, size_t ntoken, + llaisysTensor_t *kcache, llaisysTensor_t *vcache, size_t past_len, + float temperature, int top_k, float top_p, int64_t seed) { + if (!model || !token_ids || ntoken == 0) return -1; + + // If kv_cache != nullptr, it means KV Cache is used for performance. + bool kv_cache_used = (kcache != nullptr && vcache != nullptr); + // kcache [max_seq, nkvh, d], vcache [max_seq, nkvh, dv] + + size_t seqlen = ntoken; + size_t hs = model->meta->hs; // hidden size + size_t nh = model->meta->nh; // num heads + size_t dh = model->meta->dh; // head dim + size_t nkvh = model->meta->nkvh; // num key-value heads + size_t di = model->meta->di; // mlp intermediate dim + size_t voc = model->meta->voc; // vocab size + size_t nlayer = model->meta->nlayer; + float scale = 1.0f / std::sqrt(static_cast(dh)); // scale for attention + float rms_eps = model->meta->epsilon; // epsilon for RMSNorm + float rope_theta = model->meta->theta; // theta for RoPE + + // 1. 
intput token_ids -> tensor + size_t input_tensor_shape[1] = {seqlen}; + + llaisysTensor_t input_tensor = tensorCreate(input_tensor_shape, 1, LLAISYS_DTYPE_I64, model->device, model->device_ids[0]); + + tensorLoad(input_tensor, token_ids); + + // 2. Embedding lookup: [seqlen] -> [seqlen, hs] + size_t output_embedding_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t output_embedding_tensor = tensorCreate(output_embedding_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysEmbedding(output_embedding_tensor, input_tensor, model->weights->in_embed); + + + + // output_hidden_layer_tensor is used to store the output of the hidden layer + llaisysTensor_t output_hidden_layer_tensor = output_embedding_tensor; + size_t position_shape[1] = {seqlen}; + llaisysTensor_t position_ids = tensorCreate(position_shape, 1, LLAISYS_DTYPE_I64, model->device, model->device_ids[0]); + int64_t* pos_data_cpu = new int64_t[seqlen]; + for (size_t i = 0; i < seqlen; i++) { + if(kv_cache_used) + pos_data_cpu[i] = (int64_t)(past_len + i); // When using KV cache, position ids continue from past_len + else + pos_data_cpu[i] = (int64_t) i; + } + tensorLoad(position_ids, pos_data_cpu); + delete[] pos_data_cpu; + + + + // 3. 
Transformer hidden layers + for (size_t i = 0; i < nlayer; i++) { + // 3.1 LayerNorm before Self-attention + llaisysTensor_t attn_norm_w = model->weights->attn_norm_w[i]; + size_t output_input_layernorm_shape[2] = {seqlen, hs}; + llaisysTensor_t output_input_layernorm_tensor = tensorCreate(output_input_layernorm_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRmsNorm(output_input_layernorm_tensor, output_hidden_layer_tensor, attn_norm_w, rms_eps); + + + + // 3.2 Q projection + llaisysTensor_t attn_q_w = model->weights->attn_q_w[i]; + llaisysTensor_t attn_q_b = model->weights->attn_q_b[i]; + size_t q_tensor_shape[2] = {seqlen, nh * dh}; + llaisysTensor_t q_tensor = tensorCreate(q_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(q_tensor, output_input_layernorm_tensor, attn_q_w, attn_q_b); + + size_t q_tensor_reshape_shape[3] = {seqlen, nh, dh}; + q_tensor = tensorReshape(q_tensor, q_tensor_reshape_shape, 3); + llaisysTensor_t q_rope_tensor = tensorCreate(q_tensor_reshape_shape, 3, model->meta->dtype, model->device, model->device_ids[0]); + llaisysROPE(q_rope_tensor, q_tensor, position_ids, rope_theta); + + + + // 3.3 K projection + llaisysTensor_t attn_k_w = model->weights->attn_k_w[i]; + llaisysTensor_t attn_k_b = model->weights->attn_k_b[i]; + size_t k_tensor_shape[2] = {seqlen, nkvh * dh}; + llaisysTensor_t k_tensor = tensorCreate(k_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(k_tensor, output_input_layernorm_tensor, attn_k_w, attn_k_b); + + size_t k_tensor_reshape_shape[3] = {seqlen, nkvh, dh}; + k_tensor = tensorReshape(k_tensor, k_tensor_reshape_shape, 3); + llaisysTensor_t k_rope_tensor; + if (kv_cache_used) { + // When using KV cache, we need to update the kcache tensor + // kcache shape: [max_seq, nkvh, dh] + // Slice the kcache to get the current position to update + k_rope_tensor = tensorSlice(kcache[i], 0, past_len, past_len + seqlen); // 
Write to kcache + } else { + k_rope_tensor = tensorCreate(k_tensor_reshape_shape, 3, model->meta->dtype, model->device, model->device_ids[0]); + } + llaisysROPE(k_rope_tensor, k_tensor, position_ids, rope_theta); + + + + // 3.4 V projection + llaisysTensor_t attn_v_w = model->weights->attn_v_w[i]; + llaisysTensor_t attn_v_b = model->weights->attn_v_b[i]; + size_t v_tensor_shape[2] = {seqlen, nkvh * dh}; + llaisysTensor_t v_tensor; + if (kv_cache_used) { + // When using KV cache, we need to update the vcache tensor + // vcache shape: [max_seq, nkvh, dh] + // Slice the vcache to get the current position to update + v_tensor = tensorView(tensorSlice(vcache[i], 0, past_len, past_len + seqlen), v_tensor_shape, 2); + } else { + v_tensor = tensorCreate(v_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + } + llaisysLinear(v_tensor, output_input_layernorm_tensor, attn_v_w, attn_v_b); + + + // 3.5 Self-attention with Flash Attention + size_t output_self_attn_multihead_tensor_shape[3] = {seqlen, nh, dh}; + llaisysTensor_t output_self_attn_tensor = tensorCreate(output_self_attn_multihead_tensor_shape, 3, model->meta->dtype, model->device, model->device_ids[0]); + + // Prepare tensors for attention (with or without KV cache) + llaisysTensor_t k_for_attn, v_for_attn; + size_t kv_seq_len; + + if (kv_cache_used) { + // Use KV cache to speed up inference + // kcache [max_seq, nkvh, d], vcache [max_seq, nkvh, dv] + kv_seq_len = past_len + seqlen; + k_for_attn = tensorSlice(kcache[i], 0, 0, kv_seq_len); + v_for_attn = tensorSlice(vcache[i], 0, 0, kv_seq_len); + } else { + size_t v_tensor_reshape_shape[3] = {seqlen, nkvh, dh}; + v_for_attn = tensorReshape(v_tensor, v_tensor_reshape_shape, 3); + k_for_attn = k_rope_tensor; + kv_seq_len = seqlen; + } + + + llaisysSelfAttention(output_self_attn_tensor, q_rope_tensor, k_for_attn, v_for_attn, scale); + + size_t output_self_attn_tensor_shape[2] = {seqlen, nh * dh}; + output_self_attn_tensor = 
tensorReshape(output_self_attn_tensor, output_self_attn_tensor_shape, 2); + + + // 3.6 Self-attention output projection + llaisysTensor_t attn_o_w = model->weights->attn_o_w[i]; + size_t o_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t o_tensor = tensorCreate(o_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + + llaisysLinear(o_tensor, output_self_attn_tensor, attn_o_w, nullptr); + + // 3.7 Residual connection after attn + size_t output_res1_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t output_res1_tensor = tensorCreate(output_res1_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysAdd(output_res1_tensor, output_hidden_layer_tensor, o_tensor); + + + // 3.8 Post-attention LayerNorm + llaisysTensor_t post_attn_norm_w = model->weights->mlp_norm_w[i]; + size_t output_post_self_attn_layernorm_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t output_post_self_attn_layernorm_tensor = tensorCreate(output_post_self_attn_layernorm_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRmsNorm(output_post_self_attn_layernorm_tensor, output_res1_tensor, post_attn_norm_w, rms_eps); + + + // 3.9 MLP (Gate, Up, Down) + llaisysTensor_t mlp_gate_w = model->weights->mlp_gate_w[i]; + llaisysTensor_t mlp_up_w = model->weights->mlp_up_w[i]; + llaisysTensor_t mlp_down_w = model->weights->mlp_down_w[i]; + + size_t mlp_gate_tensor_shape[2] = {seqlen, di}; + llaisysTensor_t mlp_gate_tensor = tensorCreate(mlp_gate_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(mlp_gate_tensor, output_post_self_attn_layernorm_tensor, mlp_gate_w, nullptr); + + + size_t mlp_up_tensor_shape[2] = {seqlen, di}; + llaisysTensor_t mlp_up_tensor = tensorCreate(mlp_up_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(mlp_up_tensor, output_post_self_attn_layernorm_tensor, mlp_up_w, nullptr); + + + size_t swiglu_tensor_shape[2] = {seqlen, 
di}; + llaisysTensor_t swiglu_tensor = tensorCreate(swiglu_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysSwiGLU(swiglu_tensor, mlp_gate_tensor, mlp_up_tensor); + + + size_t mlp_down_tensor_shape[2] = {seqlen, hs}; + llaisysTensor_t mlp_down_tensor = tensorCreate(mlp_down_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(mlp_down_tensor, swiglu_tensor, mlp_down_w, nullptr); + + + + // 3.10 Residual connection after MLP + llaisysAdd(output_hidden_layer_tensor, output_res1_tensor, mlp_down_tensor); + + + + + + if(!kv_cache_used){ + tensorDestroy(k_rope_tensor); + tensorDestroy(v_tensor); + } + + // release intermediate tensors + tensorDestroy(output_input_layernorm_tensor); + tensorDestroy(q_tensor); + tensorDestroy(q_rope_tensor); + tensorDestroy(k_tensor); + tensorDestroy(output_self_attn_tensor); + tensorDestroy(o_tensor); + tensorDestroy(output_res1_tensor); + tensorDestroy(output_post_self_attn_layernorm_tensor); + tensorDestroy(mlp_gate_tensor); + tensorDestroy(mlp_up_tensor); + tensorDestroy(swiglu_tensor); + tensorDestroy(mlp_down_tensor); + } + // 4. Output LayerNorm + llaisysTensor_t final_layernorm_w = model->weights->out_norm_w; + size_t output_final_layernorm_shape[2] = {seqlen, hs}; + llaisysTensor_t output_final_layernorm_tensor = tensorCreate(output_final_layernorm_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRmsNorm(output_final_layernorm_tensor, output_hidden_layer_tensor, final_layernorm_w, rms_eps); + + + + // 5. 
Output [seqlen, voc] + size_t output_tensor_shape[2] = {seqlen, voc}; + llaisysTensor_t output_tensor = tensorCreate(output_tensor_shape, 2, model->meta->dtype, model->device, model->device_ids[0]); + llaisysLinear(output_tensor, output_final_layernorm_tensor, model->weights->out_embed, nullptr); + + + + // [seqlen, voc] -> [1, voc] + size_t output_tensor_slice_reshape_shape[1] = {voc}; + llaisysTensor_t output_tensor_slice = tensorSlice(output_tensor, 0, seqlen-1, seqlen); // only need the last token's logits + + + size_t index_shape[1] = {1}; + llaisysTensor_t index_tensor = tensorCreate(index_shape, 1, LLAISYS_DTYPE_I64, model->device, model->device_ids[0]); + llaisysTensor_t value_tensor = tensorCreate(index_shape, 1, model->meta->dtype, model->device, model->device_ids[0]); + llaisysRandomSample(index_tensor, value_tensor, tensorView(output_tensor_slice, output_tensor_slice_reshape_shape, 1), + temperature, top_k, top_p, seed); + + + // For NVIDIA device, need to copy data back to CPU to read it + int64_t index; + if (model->device == LLAISYS_DEVICE_NVIDIA) { + // Use C++ API to convert tensor to CPU + auto index_cpu_tensor = index_tensor->tensor->to(LLAISYS_DEVICE_CPU); + index = *((int64_t *) index_cpu_tensor->data()); + } else { + index = *((int64_t *) tensorGetData(index_tensor)); + } + + tensorDestroy(position_ids); + tensorDestroy(input_tensor); + tensorDestroy(output_embedding_tensor); + tensorDestroy(output_final_layernorm_tensor); + tensorDestroy(output_tensor); + tensorDestroy(index_tensor); + tensorDestroy(value_tensor); + + return index; + } +} \ No newline at end of file diff --git a/src/llaisys/tensor.cc b/src/llaisys/tensor.cc index 5e6e5012..0e6e6f48 100644 --- a/src/llaisys/tensor.cc +++ b/src/llaisys/tensor.cc @@ -93,4 +93,17 @@ __C { size_t end) { return new LlaisysTensor{tensor->tensor->slice(dim, start, end)}; } + llaisysTensor_t tensorReshape( + llaisysTensor_t tensor, + size_t * shape, + size_t ndim) { + std::vector shape_vec(shape, 
shape + ndim); + return new LlaisysTensor{tensor->tensor->reshape(shape_vec)}; + } + llaisysTensor_t tensorTo( + llaisysTensor_t tensor, + llaisysDeviceType_t device_type, + int device_id) { + return new LlaisysTensor{tensor->tensor->to(device_type, device_id)}; + } } diff --git a/src/ops/add/cuda/add_cuda.cu b/src/ops/add/cuda/add_cuda.cu new file mode 100644 index 00000000..cfd855d9 --- /dev/null +++ b/src/ops/add/cuda/add_cuda.cu @@ -0,0 +1,66 @@ +#include "add_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include +#include + +namespace llaisys::ops::cuda { + +template +__global__ void add_kernel(T *c, const T *a, const T *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + c[idx] = a[idx] + b[idx]; +} + +__global__ void add_kernel_fp16(__half *c, const __half *a, const __half *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } + c[idx] = __hadd(a[idx], b[idx]); +} + +__global__ void add_kernel_bf16(__nv_bfloat16 *c, const __nv_bfloat16 *a, const __nv_bfloat16 *b, size_t numel) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= numel) { + return; + } +#if __CUDA_ARCH__ >= 800 + c[idx] = __hadd(a[idx], b[idx]); +#else + c[idx] = __float2bfloat16(__bfloat162float(a[idx]) + __bfloat162float(b[idx])); +#endif +} + +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t dtype, size_t numel) { + dim3 block(256); + dim3 grid((numel + block.x - 1) / block.x); + + switch (dtype) { + case LLAISYS_DTYPE_F32: + add_kernel<<>>(reinterpret_cast(c), reinterpret_cast(a), + reinterpret_cast(b), numel); + break; + case LLAISYS_DTYPE_F16: + add_kernel_fp16<<>>(reinterpret_cast<__half *>(c), reinterpret_cast(a), + reinterpret_cast(b), numel); + break; + case LLAISYS_DTYPE_BF16: + add_kernel_bf16<<>>(reinterpret_cast<__nv_bfloat16 *>(c), + reinterpret_cast(a), + reinterpret_cast(b), 
numel); + break; + default: + EXCEPTION_UNSUPPORTED_DATATYPE(dtype); + } + + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "add cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/add/cuda/add_cuda.cuh b/src/ops/add/cuda/add_cuda.cuh new file mode 100644 index 00000000..9802a736 --- /dev/null +++ b/src/ops/add/cuda/add_cuda.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void add(std::byte *c, const std::byte *a, const std::byte *b, llaisysDataType_t dtype, size_t numel); +} diff --git a/src/ops/add/op.cpp b/src/ops/add/op.cpp index a057330d..c39021ec 100644 --- a/src/ops/add/op.cpp +++ b/src/ops/add/op.cpp @@ -4,6 +4,9 @@ #include "../../utils.hpp" #include "cpu/add_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/add_cuda.cuh" +#endif namespace llaisys::ops { void add(tensor_t c, tensor_t a, tensor_t b) { @@ -25,7 +28,7 @@ void add(tensor_t c, tensor_t a, tensor_t b) { return cpu::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); #ifdef ENABLE_NVIDIA_API case LLAISYS_DEVICE_NVIDIA: - TO_BE_IMPLEMENTED(); + return cuda::add(c->data(), a->data(), b->data(), c->dtype(), c->numel()); return; #endif default: diff --git a/src/ops/argmax/cpu/argmax_cpu.cpp b/src/ops/argmax/cpu/argmax_cpu.cpp new file mode 100644 index 00000000..e1e97e88 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.cpp @@ -0,0 +1,45 @@ +#include "argmax_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void argmax_(int64_t *mi, T *mv, const T *v, size_t numel) { + mi[0]=0; + if constexpr (std::is_same_v || std::is_same_v) { + mv[0] = llaisys::utils::cast(llaisys::utils::cast(v[0])); + + for (size_t i = 1; i < numel; i++) { + if(llaisys::utils::cast(mv[0]) < llaisys::utils::cast(v[i])){ + mi[0] = static_cast(i); + mv[0] = v[i]; + } + } + + }else{ + mv[0] = v[0]; + for (size_t i = 1; i < numel; i++) { + if(mv[0] < v[i]){ + mi[0] = static_cast(i); + mv[0] = v[i]; + 
} + } + } +} +namespace llaisys::ops::cpu { +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t numel) { + switch (type) { + case LLAISYS_DTYPE_F32: + return argmax_(reinterpret_cast(mi), reinterpret_cast(mv), reinterpret_cast(v), numel); + case LLAISYS_DTYPE_BF16: + return argmax_(reinterpret_cast(mi), reinterpret_cast(mv), + reinterpret_cast(v), numel); + case LLAISYS_DTYPE_F16: + return argmax_(reinterpret_cast(mi), reinterpret_cast(mv), + reinterpret_cast(v), numel); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/argmax/cpu/argmax_cpu.hpp b/src/ops/argmax/cpu/argmax_cpu.hpp new file mode 100644 index 00000000..3ba45392 --- /dev/null +++ b/src/ops/argmax/cpu/argmax_cpu.hpp @@ -0,0 +1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t size); +} \ No newline at end of file diff --git a/src/ops/argmax/cuda/argmax_cuda.cu b/src/ops/argmax/cuda/argmax_cuda.cu new file mode 100644 index 00000000..d0ddec2a --- /dev/null +++ b/src/ops/argmax/cuda/argmax_cuda.cu @@ -0,0 +1,35 @@ +#include "argmax_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void argmax_kernel(int64_t *mi, void *mv, const void *v, llaisysDataType_t type, size_t numel) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + size_t best_idx = 0; + float best_val = load_as_float(v, 0, type); + for (size_t i = 1; i < numel; ++i) { + float cur = load_as_float(v, i, type); + if (cur > best_val) { + best_val = cur; + best_idx = i; + } + } + + mi[0] = static_cast(best_idx); + store_from_float(mv, 0, best_val, type); +} + +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t numel) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || 
type == LLAISYS_DTYPE_BF16, + "argmax cuda only supports F32/F16/BF16"); + argmax_kernel<<<1, 1>>>(reinterpret_cast(mi), mv, v, type, numel); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "argmax cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/argmax/cuda/argmax_cuda.cuh b/src/ops/argmax/cuda/argmax_cuda.cuh new file mode 100644 index 00000000..fc733aec --- /dev/null +++ b/src/ops/argmax/cuda/argmax_cuda.cuh @@ -0,0 +1,9 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void argmax(std::byte *mi, std::byte *mv, const std::byte *v, llaisysDataType_t type, size_t numel); +} diff --git a/src/ops/argmax/op.cpp b/src/ops/argmax/op.cpp index 6dc37d42..1c78dcc9 100644 --- a/src/ops/argmax/op.cpp +++ b/src/ops/argmax/op.cpp @@ -1,7 +1,56 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/argmax_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/argmax_cuda.cuh" +#endif namespace llaisys::ops { void argmax(tensor_t max_idx, tensor_t max_val, tensor_t vals) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(max_idx, max_val, vals); + + // 验证vals是1D张量 + ASSERT(vals->shape().size() == 1, "Argmax: vals must be a 1D tensor"); + + //验证vals连续性 + ASSERT(vals->isContiguous(),"Argmax: vals must be contiguous."); + + // 验证max_idx是包含单个元素的1D张量 + ASSERT(max_idx->shape().size() == 1 && max_idx->shape()[0] == 1, + "Argmax: max_idx must be a 1D tensor with single element"); + + // 验证max_val是包含单个元素的1D张量 + ASSERT(max_val->shape().size() == 1 && max_val->shape()[0] == 1, + "Argmax: max_val must be a 1D tensor with single element"); + + // 验证max_idx是Int64类型 + ASSERT(max_idx->dtype() == LLAISYS_DTYPE_I64, + "Argmax: max_idx must be Int64 type"); + + // 验证max_val类型与vals相同 + ASSERT(max_val->dtype() == vals->dtype(), + "Argmax: max_val must have same type as vals"); + + // always support cpu calculation + if (vals->deviceType() == LLAISYS_DEVICE_CPU) { + 
return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); + } + + llaisys::core::context().setDevice(vals->deviceType(), vals->deviceId()); + + switch (vals->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::argmax(max_idx->data(), max_val->data(), vals->data(), + vals->dtype(), vals->numel()); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::argmax(max_idx->data(), max_val->data(), vals->data(), vals->dtype(), vals->numel()); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/embedding/cpu/embedding_cpu.cpp b/src/ops/embedding/cpu/embedding_cpu.cpp new file mode 100644 index 00000000..3400ab62 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.cpp @@ -0,0 +1,28 @@ +#include "embedding_cpu.hpp" + +#include "../../../utils.hpp" + +#include + +template +void embedding_(T *out, const int64_t *index, const T *weight, size_t num , size_t dim ) { + for(size_t i = 0;i < num; i++) + for(size_t j = 0;j < dim ; j++) + out[i * dim + j] = weight[index[i] * dim + j]; +} +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num , size_t dim) { + switch (type) { + case LLAISYS_DTYPE_F32: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), reinterpret_cast(weight), num , dim); + case LLAISYS_DTYPE_BF16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num , dim); + case LLAISYS_DTYPE_F16: + return embedding_(reinterpret_cast(out), reinterpret_cast(index), + reinterpret_cast(weight), num , dim); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu diff --git a/src/ops/embedding/cpu/embedding_cpu.hpp b/src/ops/embedding/cpu/embedding_cpu.hpp new file mode 100644 index 00000000..29788657 --- /dev/null +++ b/src/ops/embedding/cpu/embedding_cpu.hpp @@ -0,0 
+1,8 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t type, size_t num , size_t dim); +} \ No newline at end of file diff --git a/src/ops/embedding/cuda/embedding_cuda.cu b/src/ops/embedding/cuda/embedding_cuda.cu new file mode 100644 index 00000000..b936ff18 --- /dev/null +++ b/src/ops/embedding/cuda/embedding_cuda.cu @@ -0,0 +1,36 @@ +#include "embedding_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void embedding_kernel(void *out, const int64_t *index, const void *weight, llaisysDataType_t dtype, size_t num, + size_t dim) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = num * dim; + if (idx >= total) { + return; + } + + size_t row = idx / dim; + size_t col = idx % dim; + int64_t token = index[row]; + size_t src_idx = static_cast(token) * dim + col; + float val = load_as_float(weight, src_idx, dtype); + store_from_float(out, idx, val, dtype); +} + +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t dtype, size_t num, + size_t dim) { + ASSERT(dtype == LLAISYS_DTYPE_F32 || dtype == LLAISYS_DTYPE_F16 || dtype == LLAISYS_DTYPE_BF16, + "embedding cuda only supports F32/F16/BF16"); + size_t total = num * dim; + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + embedding_kernel<<>>(out, reinterpret_cast(index), weight, dtype, num, dim); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "embedding cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/embedding/cuda/embedding_cuda.cuh b/src/ops/embedding/cuda/embedding_cuda.cuh new file mode 100644 index 00000000..bb0d7556 --- /dev/null +++ b/src/ops/embedding/cuda/embedding_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace 
llaisys::ops::cuda { +void embedding(std::byte *out, const std::byte *index, const std::byte *weight, llaisysDataType_t dtype, size_t num, + size_t dim); +} diff --git a/src/ops/embedding/op.cpp b/src/ops/embedding/op.cpp index 84b9a5d0..ce471fca 100644 --- a/src/ops/embedding/op.cpp +++ b/src/ops/embedding/op.cpp @@ -1,7 +1,70 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/embedding_cpu.hpp" +#include +#ifdef ENABLE_NVIDIA_API +#include "cuda/embedding_cuda.cuh" +#endif namespace llaisys::ops { void embedding(tensor_t out, tensor_t index, tensor_t weight) { - TO_BE_IMPLEMENTED(); -} + CHECK_SAME_DEVICE(out, index, weight); + + ASSERT(index->shape().size() == 1, "Embedding: index must be 1D tensor"); + ASSERT(index->dtype() == LLAISYS_DTYPE_I64, "embedding: index must be Int64 type"); + + + ASSERT(weight->shape().size() == 2, "Embedding: weight must be 2D tensor"); + + + ASSERT(out->shape().size() == 2, "Embedding: out must be 2D tensor"); + + + ASSERT(out->shape()[0] == index->shape()[0], "Embedding: out and index first dimension must match"); + ASSERT(out->shape()[1] == weight->shape()[1], "Embedding: out and weight second dimension must match"); + + size_t vocab_size = weight->shape()[0]; + std::vector host_index(index->numel()); + if (index->deviceType() == LLAISYS_DEVICE_CPU) { + auto index_data = reinterpret_cast(index->data()); + for (size_t i = 0; i < index->numel(); i++) { + host_index[i] = index_data[i]; + } + } else { + llaisys::core::context().setDevice(index->deviceType(), index->deviceId()); + llaisys::core::context().runtime().api()->memcpy_sync( + host_index.data(), + index->data(), + index->numel() * sizeof(int64_t), + LLAISYS_MEMCPY_D2H); + } + + for (size_t i = 0; i < host_index.size(); i++) { + ASSERT(host_index[i] >= 0 && static_cast(host_index[i]) < vocab_size, + "Embedding: index out of bounds"); + } + + // always support cpu calculation + if (out->deviceType() == LLAISYS_DEVICE_CPU) { 
+ return cpu::embedding(out->data(), index->data(), weight->data(), + out->dtype(), index->numel(), weight->shape()[1]); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::embedding(out->data(), index->data(), weight->data(), + out->dtype(), index->numel(), weight->shape()[1]); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::embedding(out->data(), index->data(), weight->data(), out->dtype(), index->numel(), weight->shape()[1]); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} + } // namespace llaisys::ops diff --git a/src/ops/linear/cpu/linear_cpu.cpp b/src/ops/linear/cpu/linear_cpu.cpp new file mode 100644 index 00000000..2b1b19e3 --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.cpp @@ -0,0 +1,75 @@ +#include "linear_cpu.hpp" + +#include "../../../utils.hpp" + +template +void linear_(T *out, const T *in, const T *weight, const T *bias, + size_t batch_size, size_t in_features, size_t out_features) { + + for (size_t b = 0; b < batch_size; b++) { + for (size_t o = 0; o < out_features; o++) { + if constexpr (std::is_same_v || std::is_same_v) { + float sum_float = 0.0f; + + for (size_t i = 0; i < in_features; i++) { + float x_float = llaisys::utils::cast(in[b * in_features + i]); + float w_float = llaisys::utils::cast(weight[o * in_features + i]); + sum_float += x_float * w_float; + } + + if (bias != nullptr) { + sum_float += llaisys::utils::cast(bias[o]); + } + + out[b * out_features + o] = llaisys::utils::cast(sum_float); + } else { + + T sum = T(0); + + for (size_t i = 0; i < in_features; i++) { + sum += in[b * in_features + i] * weight[o * in_features + i]; + } + + if (bias != nullptr) { + sum += bias[o]; + } + + out[b * out_features + o] = sum; + } + } + } +} + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t 
batch_size, size_t in_features, size_t out_features) { + switch (type) { + case LLAISYS_DTYPE_F32: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + bias ? reinterpret_cast(bias) : nullptr, + batch_size, in_features, out_features + ); + case LLAISYS_DTYPE_BF16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + bias ? reinterpret_cast(bias) : nullptr, + batch_size, in_features, out_features + ); + case LLAISYS_DTYPE_F16: + return linear_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + bias ? reinterpret_cast(bias) : nullptr, + batch_size, in_features, out_features + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +}//// namespace llaisys::ops \ No newline at end of file diff --git a/src/ops/linear/cpu/linear_cpu.hpp b/src/ops/linear/cpu/linear_cpu.hpp new file mode 100644 index 00000000..fb491ccd --- /dev/null +++ b/src/ops/linear/cpu/linear_cpu.hpp @@ -0,0 +1,10 @@ + +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, + llaisysDataType_t type, size_t batch_size, size_t in_features, size_t out_features); +} \ No newline at end of file diff --git a/src/ops/linear/cuda/linear_cuda.cu b/src/ops/linear/cuda/linear_cuda.cu new file mode 100644 index 00000000..e3ceee00 --- /dev/null +++ b/src/ops/linear/cuda/linear_cuda.cu @@ -0,0 +1,43 @@ +#include "linear_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void linear_kernel(void *out, const void *in, const void *weight, const void *bias, llaisysDataType_t type, + size_t batch_size, size_t in_features, size_t out_features, bool has_bias) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = batch_size * out_features; + if (idx >= total) { + return; + } + + size_t b 
= idx / out_features; + size_t o = idx % out_features; + + float sum = 0.0f; + for (size_t i = 0; i < in_features; ++i) { + float x = load_as_float(in, b * in_features + i, type); + float w = load_as_float(weight, o * in_features + i, type); + sum += x * w; + } + if (has_bias) { + sum += load_as_float(bias, o, type); + } + store_from_float(out, idx, sum, type); +} + +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t batch_size, size_t in_features, size_t out_features) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + "linear cuda only supports F32/F16/BF16"); + size_t total = batch_size * out_features; + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + linear_kernel<<>>(out, in, weight, bias, type, batch_size, in_features, out_features, bias != nullptr); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "linear cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/linear/cuda/linear_cuda.cuh b/src/ops/linear/cuda/linear_cuda.cuh new file mode 100644 index 00000000..c46077ba --- /dev/null +++ b/src/ops/linear/cuda/linear_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void linear(std::byte *out, const std::byte *in, const std::byte *weight, const std::byte *bias, llaisysDataType_t type, + size_t batch_size, size_t in_features, size_t out_features); +} diff --git a/src/ops/linear/op.cpp b/src/ops/linear/op.cpp index 97d1f865..ba6e9222 100644 --- a/src/ops/linear/op.cpp +++ b/src/ops/linear/op.cpp @@ -1,7 +1,75 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/linear_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/linear_cuda.cuh" +#endif namespace llaisys::ops { void linear(tensor_t out, tensor_t in, tensor_t weight, tensor_t bias) { - TO_BE_IMPLEMENTED(); + 
CHECK_SAME_DEVICE(out, in, weight); + if (bias) { + CHECK_SAME_DEVICE(out, bias); + } + + // Check data types + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + if (bias) { + CHECK_SAME_DTYPE(out->dtype(), bias->dtype()); + } + + // Check contiguity + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "Linear: input tensors must be contiguous"); + if (bias) { + ASSERT(bias->isContiguous(), "Linear: bias must be contiguous"); + } + + // Check dimensions + ASSERT(in->ndim() == 2, "Linear: input must be 2D tensor"); + ASSERT(weight->ndim() == 2, "Linear: weight must be 2D tensor"); + ASSERT(out->ndim() == 2, "Linear: output must be 2D tensor"); + if (bias) { + ASSERT(bias->ndim() == 1, "Linear: bias must be 1D tensor"); + } + + // Get dimensions + size_t batch_size = in->shape()[0]; + size_t in_features = in->shape()[1]; + size_t out_features = weight->shape()[0]; + + // Check shapes are compatible + ASSERT(weight->shape()[1] == in_features, "Linear: weight input dimension must match input features"); + ASSERT(out->shape()[0] == batch_size, "Linear: output batch size must match input batch size"); + ASSERT(out->shape()[1] == out_features, "Linear: output features must match weight output features"); + if (bias) { + ASSERT(bias->shape()[0] == out_features, "Linear: bias size must match output features"); + } + + // always support cpu calculation + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::linear(out->data(), in->data(), weight->data(), + bias ? bias->data() : nullptr, + out->dtype(), batch_size, in_features, out_features); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::linear(out->data(), in->data(), weight->data(), + bias ? 
bias->data() : nullptr, + out->dtype(), batch_size, in_features, out_features); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::linear(out->data(), in->data(), weight->data(), bias ? bias->data() : nullptr, + out->dtype(), batch_size, in_features, out_features); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/random_sample/cpu/random_sample_cpu.cpp b/src/ops/random_sample/cpu/random_sample_cpu.cpp new file mode 100644 index 00000000..b43a84a0 --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.cpp @@ -0,0 +1,109 @@ +#include "random_sample_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include +#include +#include +#include + +template +void random_sample_(int64_t *sample_idx, T *sample_val, const T *logits, size_t vocab_size, float temperature, + int top_k, float top_p, int64_t seed) { + if (top_p <= 0.0f) { + top_k = 1; + } + + std::vector scaled_logits(vocab_size); + float inv_temperature = 1.0f / std::max(temperature, 1e-6f); + + for (size_t i = 0; i < vocab_size; ++i) { + scaled_logits[i] = llaisys::utils::cast(logits[i]) * inv_temperature; + } + + std::vector sorted_indices(vocab_size); + std::iota(sorted_indices.begin(), sorted_indices.end(), 0); + std::sort(sorted_indices.begin(), sorted_indices.end(), + [&](size_t a, size_t b) { return scaled_logits[a] > scaled_logits[b]; }); + + if (top_k <= 0 || static_cast(top_k) > vocab_size) { + top_k = static_cast(vocab_size); + } + sorted_indices.resize(static_cast(top_k)); + + std::vector candidate_probs(sorted_indices.size()); + float max_logit = scaled_logits[sorted_indices[0]]; + float sum_exp = 0.0f; + for (size_t i = 0; i < sorted_indices.size(); ++i) { + float p = std::exp(scaled_logits[sorted_indices[i]] - max_logit); + candidate_probs[i] = p; + sum_exp += p; + } + for (float &p : candidate_probs) { + p /= sum_exp; + } + + float top_p_clamped = std::min(1.0f, std::max(0.0f, top_p)); + size_t 
keep_count = sorted_indices.size(); + if (top_p_clamped > 0.0f && top_p_clamped < 1.0f) { + float cumulative = 0.0f; + keep_count = 0; + for (size_t i = 0; i < candidate_probs.size(); ++i) { + cumulative += candidate_probs[i]; + keep_count = i + 1; + if (cumulative >= top_p_clamped) { + break; + } + } + keep_count = std::max(1, keep_count); + } + + sorted_indices.resize(keep_count); + candidate_probs.resize(keep_count); + + float renorm_sum = 0.0f; + for (float p : candidate_probs) { + renorm_sum += p; + } + for (float &p : candidate_probs) { + p /= renorm_sum; + } + + std::discrete_distribution distribution(candidate_probs.begin(), candidate_probs.end()); + size_t sampled_local_idx = 0; + if (seed >= 0) { + std::mt19937_64 generator(static_cast(seed)); + sampled_local_idx = distribution(generator); + } else { + static thread_local std::mt19937_64 generator(std::random_device{}()); + sampled_local_idx = distribution(generator); + } + size_t sampled_vocab_idx = sorted_indices[sampled_local_idx]; + + sample_idx[0] = static_cast(sampled_vocab_idx); + sample_val[0] = logits[sampled_vocab_idx]; +} + +namespace llaisys::ops::cpu { +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, float top_p, int64_t seed) { + switch (type) { + case LLAISYS_DTYPE_F32: + return random_sample_(reinterpret_cast(sample_idx), reinterpret_cast(sample_val), + reinterpret_cast(logits), vocab_size, temperature, top_k, top_p, seed); + case LLAISYS_DTYPE_BF16: + return random_sample_(reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(logits), vocab_size, temperature, top_k, + top_p, seed); + case LLAISYS_DTYPE_F16: + return random_sample_(reinterpret_cast(sample_idx), + reinterpret_cast(sample_val), + reinterpret_cast(logits), vocab_size, temperature, top_k, + top_p, seed); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace 
llaisys::ops::cpu diff --git a/src/ops/random_sample/cpu/random_sample_cpu.hpp b/src/ops/random_sample/cpu/random_sample_cpu.hpp new file mode 100644 index 00000000..88fc3bcc --- /dev/null +++ b/src/ops/random_sample/cpu/random_sample_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, float top_p, int64_t seed); +} diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cu b/src/ops/random_sample/cuda/random_sample_cuda.cu new file mode 100644 index 00000000..07883cd1 --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample_cuda.cu @@ -0,0 +1,155 @@ +#include "random_sample_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include +#include + +#include +#include +#include + +#include +#include +#include + +namespace llaisys::ops::cuda { + +__global__ void init_indices_kernel(int64_t *indices, size_t n) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) { + return; + } + indices[idx] = static_cast(idx); +} + +__global__ void scale_logits_kernel(float *scaled, const void *logits, llaisysDataType_t type, size_t vocab_size, + float inv_temperature) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= vocab_size) { + return; + } + float x = load_as_float(logits, idx, type); + scaled[idx] = x * inv_temperature; +} + +__global__ void random_sample_kernel(int64_t *sample_idx, void *sample_val, const void *logits, + const float *sorted_scaled_logits, const int64_t *sorted_indices, + llaisysDataType_t type, size_t top_k, float top_p, + unsigned long long seed) { + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + if (top_k == 0) { + sample_idx[0] = 0; + store_from_float(sample_val, 0, load_as_float(logits, 0, type), type); + return; + } + + top_p = fminf(1.0f, 
fmaxf(0.0f, top_p)); + if (top_p <= 0.0f) { + top_k = 1; + top_p = 1.0f; + } + + const float max_logit = sorted_scaled_logits[0]; + float sum_exp = 0.0f; + for (size_t i = 0; i < top_k; ++i) { + sum_exp += expf(sorted_scaled_logits[i] - max_logit); + } + + size_t keep_count = top_k; + if (top_p < 1.0f) { + float cumulative = 0.0f; + keep_count = 0; + for (size_t i = 0; i < top_k; ++i) { + float p = expf(sorted_scaled_logits[i] - max_logit) / sum_exp; + cumulative += p; + keep_count = i + 1; + if (cumulative >= top_p) { + break; + } + } + if (keep_count == 0) { + keep_count = 1; + } + } + + float renorm = 0.0f; + for (size_t i = 0; i < keep_count; ++i) { + renorm += expf(sorted_scaled_logits[i] - max_logit) / sum_exp; + } + + curandStatePhilox4_32_10_t rng_state; + curand_init(seed, 0ULL, 0ULL, &rng_state); + float r = curand_uniform(&rng_state); + + float running = 0.0f; + size_t sampled_i = keep_count - 1; + for (size_t i = 0; i < keep_count; ++i) { + float p = (expf(sorted_scaled_logits[i] - max_logit) / sum_exp) / renorm; + running += p; + if (r <= running) { + sampled_i = i; + break; + } + } + + const int64_t sampled_vocab_idx = sorted_indices[sampled_i]; + sample_idx[0] = sampled_vocab_idx; + const float sampled_val = load_as_float(logits, static_cast(sampled_vocab_idx), type); + store_from_float(sample_val, 0, sampled_val, type); +} + +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, float top_p, int64_t seed) { + static std::atomic seed_counter{0}; + unsigned long long sample_seed = 0; + if (seed >= 0) { + sample_seed = static_cast(seed); + } else { + sample_seed = + static_cast( + std::chrono::high_resolution_clock::now().time_since_epoch().count()) ^ + (seed_counter.fetch_add(1, std::memory_order_relaxed) + 0x9e3779b97f4a7c15ULL); + } + + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + 
"random_sample cuda only supports F32/F16/BF16"); + + const float inv_temperature = 1.0f / fmaxf(temperature, 1e-6f); + size_t k = (top_k <= 0 || static_cast(top_k) > vocab_size) ? vocab_size : static_cast(top_k); + + float *d_scaled = nullptr; + int64_t *d_indices = nullptr; + auto err = cudaMalloc(&d_scaled, sizeof(float) * vocab_size); + ASSERT(err == cudaSuccess, "random_sample cuda malloc d_scaled failed"); + err = cudaMalloc(&d_indices, sizeof(int64_t) * vocab_size); + ASSERT(err == cudaSuccess, "random_sample cuda malloc d_indices failed"); + + dim3 block(256); + dim3 grid((vocab_size + block.x - 1) / block.x); + init_indices_kernel<<>>(d_indices, vocab_size); + err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "random_sample init_indices kernel launch failed"); + + scale_logits_kernel<<>>(d_scaled, logits, type, vocab_size, inv_temperature); + err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "random_sample scale_logits kernel launch failed"); + + thrust::device_ptr scaled_ptr(d_scaled); + thrust::device_ptr indices_ptr(d_indices); + thrust::sort_by_key(thrust::device, scaled_ptr, scaled_ptr + vocab_size, indices_ptr, thrust::greater()); + + random_sample_kernel<<<1, 1>>>(reinterpret_cast(sample_idx), sample_val, logits, d_scaled, d_indices, + type, k, top_p, sample_seed); + err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "random_sample sample kernel launch failed"); + + cudaFree(d_scaled); + cudaFree(d_indices); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/random_sample/cuda/random_sample_cuda.cuh b/src/ops/random_sample/cuda/random_sample_cuda.cuh new file mode 100644 index 00000000..d3e2ec5a --- /dev/null +++ b/src/ops/random_sample/cuda/random_sample_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void random_sample(std::byte *sample_idx, std::byte *sample_val, const std::byte *logits, llaisysDataType_t type, + size_t vocab_size, float temperature, int top_k, 
float top_p, int64_t seed); +} diff --git a/src/ops/random_sample/op.cpp b/src/ops/random_sample/op.cpp new file mode 100644 index 00000000..540bf698 --- /dev/null +++ b/src/ops/random_sample/op.cpp @@ -0,0 +1,51 @@ +#include "op.hpp" + +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/random_sample_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/random_sample_cuda.cuh" +#endif + +namespace llaisys::ops { +void random_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t logits, float temperature, int top_k, float top_p, + int64_t seed) { + CHECK_SAME_DEVICE(sample_idx, sample_val, logits); + CHECK_SAME_DTYPE(sample_val->dtype(), logits->dtype()); + + ASSERT(logits->ndim() == 1, "RandomSample: logits must be a 1D tensor"); + ASSERT(logits->numel() > 0, "RandomSample: logits cannot be empty"); + ASSERT(sample_idx->ndim() == 1 && sample_idx->shape()[0] == 1, + "RandomSample: sample_idx must be a 1D tensor with single element"); + ASSERT(sample_val->ndim() == 1 && sample_val->shape()[0] == 1, + "RandomSample: sample_val must be a 1D tensor with single element"); + + ASSERT(logits->isContiguous() && sample_idx->isContiguous() && sample_val->isContiguous(), + "RandomSample: tensors must be contiguous"); + ASSERT(sample_idx->dtype() == LLAISYS_DTYPE_I64, "RandomSample: sample_idx must be Int64 type"); + ASSERT(logits->dtype() == LLAISYS_DTYPE_F32 || logits->dtype() == LLAISYS_DTYPE_F16 || logits->dtype() == LLAISYS_DTYPE_BF16, + "RandomSample: logits dtype must be Float32/Float16/BFloat16"); + + if (logits->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::random_sample(sample_idx->data(), sample_val->data(), logits->data(), logits->dtype(), logits->numel(), + temperature, top_k, top_p, seed); + } + + llaisys::core::context().setDevice(logits->deviceType(), logits->deviceId()); + + switch (logits->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::random_sample(sample_idx->data(), sample_val->data(), logits->data(), 
logits->dtype(), logits->numel(), + temperature, top_k, top_p, seed); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::random_sample(sample_idx->data(), sample_val->data(), logits->data(), logits->dtype(), logits->numel(), + temperature, top_k, top_p, seed); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } +} +} // namespace llaisys::ops diff --git a/src/ops/random_sample/op.hpp b/src/ops/random_sample/op.hpp new file mode 100644 index 00000000..81025545 --- /dev/null +++ b/src/ops/random_sample/op.hpp @@ -0,0 +1,8 @@ +#pragma once + +#include "../../tensor/tensor.hpp" + +namespace llaisys::ops { +void random_sample(tensor_t sample_idx, tensor_t sample_val, tensor_t logits, float temperature, int top_k, float top_p, + int64_t seed); +} diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.cpp b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp new file mode 100644 index 00000000..0c59afbd --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.cpp @@ -0,0 +1,70 @@ +#include "rms_norm_cpu.hpp" + +#include "../../../utils.hpp" + +#include +template +void rms_norm_(T *out, const T *in, const T *weight, + size_t batch_size, size_t feature_dim, float eps) { + if constexpr (std::is_same_v || std::is_same_v) + for(size_t i = 0 ; i < batch_size ; i++){ + float sum = 0.0f; + auto base = in + i * feature_dim ; + auto baseout = out +i *feature_dim ; + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = llaisys::utils::cast(base[j]); + sum += u*u; + } + float factor = std::sqrt(sum / static_cast(feature_dim) +eps); + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = llaisys::utils::cast(base[j]); + baseout[j] = llaisys::utils::cast( llaisys::utils::cast(weight[j])*u / factor); + } + }else{ + for(size_t i = 0 ; i < batch_size ; i++){ + float sum = 0.0f; + auto base = in + i * feature_dim ; + auto baseout = out +i *feature_dim ; + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = static_cast(base[j]); + sum += u*u; + } + float factor = std::sqrt(sum / 
static_cast(feature_dim) +eps); + for(size_t j = 0 ; j < feature_dim ;j++){ + float u = static_cast(base[j]); + baseout[j] = static_cast(static_cast(weight[j])*u / factor ); + } + } + } + +} +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t batch_size, size_t feature_dim, float eps) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rms_norm_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + batch_size, feature_dim, eps + ); + case LLAISYS_DTYPE_BF16: + return rms_norm_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + batch_size, feature_dim, eps + ); + case LLAISYS_DTYPE_F16: + return rms_norm_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(weight), + batch_size, feature_dim, eps + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } + } +} \ No newline at end of file diff --git a/src/ops/rms_norm/cpu/rms_norm_cpu.hpp b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp new file mode 100644 index 00000000..d4a5fa07 --- /dev/null +++ b/src/ops/rms_norm/cpu/rms_norm_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, + llaisysDataType_t type, size_t batch_size, size_t feature_dim, float eps); +} \ No newline at end of file diff --git a/src/ops/rms_norm/cuda/rms_norm_cuda.cu b/src/ops/rms_norm/cuda/rms_norm_cuda.cu new file mode 100644 index 00000000..acd56845 --- /dev/null +++ b/src/ops/rms_norm/cuda/rms_norm_cuda.cu @@ -0,0 +1,48 @@ +#include "rms_norm_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +namespace llaisys::ops::cuda { + +__global__ void rms_norm_kernel(void *out, const void *in, const void *weight, llaisysDataType_t type, size_t feature_dim, + float eps) { + size_t row = blockIdx.x; + extern __shared__ float sdata[]; + + 
float local_sum = 0.0f; + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float val = load_as_float(in, row * feature_dim + col, type); + local_sum += val * val; + } + sdata[threadIdx.x] = local_sum; + __syncthreads(); + + for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1) { + if (threadIdx.x < stride) { + sdata[threadIdx.x] += sdata[threadIdx.x + stride]; + } + __syncthreads(); + } + + float inv_rms = rsqrtf(sdata[0] / static_cast<float>(feature_dim) + eps); + for (size_t col = threadIdx.x; col < feature_dim; col += blockDim.x) { + float x = load_as_float(in, row * feature_dim + col, type); + float w = load_as_float(weight, col, type); + store_from_float(out, row * feature_dim + col, x * w * inv_rms, type); + } +} + +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, llaisysDataType_t type, size_t batch_size, + size_t feature_dim, float eps) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + "rms_norm cuda only supports F32/F16/BF16"); + dim3 block(256); + dim3 grid(batch_size); + size_t smem = block.x * sizeof(float); + rms_norm_kernel<<<grid, block, smem>>>(out, in, weight, type, feature_dim, eps); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "rms_norm cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/rms_norm/cuda/rms_norm_cuda.cuh b/src/ops/rms_norm/cuda/rms_norm_cuda.cuh new file mode 100644 index 00000000..57180f82 --- /dev/null +++ b/src/ops/rms_norm/cuda/rms_norm_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include <cstddef> + +namespace llaisys::ops::cuda { +void rms_norm(std::byte *out, const std::byte *in, const std::byte *weight, llaisysDataType_t type, size_t batch_size, + size_t feature_dim, float eps); +} diff --git a/src/ops/rms_norm/op.cpp b/src/ops/rms_norm/op.cpp index 529553d9..327526db 100644 --- a/src/ops/rms_norm/op.cpp +++ b/src/ops/rms_norm/op.cpp @@ -1,7 +1,54 @@ #include "op.hpp" 
+#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rms_norm_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/rms_norm_cuda.cuh" +#endif + namespace llaisys::ops { void rms_norm(tensor_t out, tensor_t in, tensor_t weight, float eps) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, weight); + + + CHECK_SAME_DTYPE(out->dtype(), in->dtype(), weight->dtype()); + + + ASSERT(out->isContiguous() && in->isContiguous() && weight->isContiguous(), + "RMS Norm: all tensors must be contiguous"); + + + ASSERT(in->ndim() == 2, "RMS Norm: input must be 2D tensor"); + ASSERT(weight->ndim() == 1, "RMS Norm: weight must be 1D tensor"); + ASSERT(out->ndim() == 2, "RMS Norm: output must be 2D tensor"); + + + size_t batch_size = in->shape()[0]; + size_t feature_dim = in->shape()[1]; + ASSERT(out->shape()[0] == batch_size, "RMS Norm: output batch size must match input"); + ASSERT(out->shape()[1] == feature_dim, "RMS Norm: output feature dim must match input"); + ASSERT(weight->shape()[0] == feature_dim, "RMS Norm: weight size must match feature dim"); + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rms_norm(out->data(), in->data(), weight->data(), + out->dtype(), batch_size, feature_dim, eps); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rms_norm(out->data(), in->data(), weight->data(), + out->dtype(), batch_size, feature_dim, eps); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::rms_norm(out->data(), in->data(), weight->data(), out->dtype(), batch_size, feature_dim, eps); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/rope/cpu/rope_cpu.cpp b/src/ops/rope/cpu/rope_cpu.cpp new file mode 100644 index 00000000..195fd12a --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.cpp @@ -0,0 +1,88 @@ +#include "rope_cpu.hpp" + +#include 
"../../../utils.hpp" + +#include +#include + + +template +void rope_(T *out, const T *in, const int64_t *pos_ids, + size_t seq_len, size_t n_heads, size_t head_dim, float theta) { + auto half_dim = head_dim / 2; + if constexpr (std::is_same_v || std::is_same_v) { + for(size_t s = 0 ; s < seq_len ; s++){ + + auto position = pos_ids[s]; + + for(size_t h = 0 ; h < n_heads ; h++){ + + auto in_a = in + s* n_heads * head_dim + h * head_dim ; + auto in_b = in + s* n_heads * head_dim + h * head_dim + half_dim; + auto out_a = out + s* n_heads * head_dim + h * head_dim; + auto out_b = out + s* n_heads * head_dim + h * head_dim + half_dim; + + for(size_t d = 0 ; d < half_dim ; d++){ + + float phi = position /(std::pow(theta , 2.0f * static_cast(d) /static_cast(head_dim) )) ; + float cosval = std::cos(phi); + float sinval = std::sin(phi); + out_a[d] = llaisys::utils::cast(llaisys::utils::cast(in_a[d]) *cosval - llaisys::utils::cast(in_b[d]) * sinval); + out_b[d] = llaisys::utils::cast(llaisys::utils::cast(in_b[d]) *cosval + llaisys::utils::cast(in_a[d]) * sinval); + } + } + } + } else{ + for(size_t s = 0 ; s < seq_len ; s++){ + + auto position = pos_ids[s]; + + for(size_t h = 0 ; h < n_heads ; h++){ + + auto in_a = in + s* n_heads * head_dim + h * head_dim ; + auto in_b = in + s* n_heads * head_dim + h * head_dim + half_dim; + auto out_a = out + s* n_heads * head_dim + h * head_dim; + auto out_b = out + s* n_heads * head_dim + h * head_dim + half_dim; + + for(size_t d = 0 ; d < half_dim ; d++){ + + float phi = position /(std::pow(theta , 2.0f * static_cast(d) /static_cast(head_dim) )) ; + float cosval = std::cos(phi); + float sinval = std::sin(phi); + out_a[d] = static_cast(static_cast(in_a[d]) *cosval - static_cast(in_b[d]) * sinval); + out_b[d] = static_cast(static_cast(in_b[d]) *cosval + static_cast(in_a[d]) * sinval); + } + } + } + } +} +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t 
seq_len, size_t n_heads, size_t head_dim, float theta) { + switch (type) { + case LLAISYS_DTYPE_F32: + return rope_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seq_len, n_heads, head_dim, theta + ); + case LLAISYS_DTYPE_BF16: + return rope_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seq_len, n_heads, head_dim, theta + ); + case LLAISYS_DTYPE_F16: + return rope_( + reinterpret_cast(out), + reinterpret_cast(in), + reinterpret_cast(pos_ids), + seq_len, n_heads, head_dim, theta + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} \ No newline at end of file diff --git a/src/ops/rope/cpu/rope_cpu.hpp b/src/ops/rope/cpu/rope_cpu.hpp new file mode 100644 index 00000000..705c248d --- /dev/null +++ b/src/ops/rope/cpu/rope_cpu.hpp @@ -0,0 +1,9 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, + llaisysDataType_t type, size_t seq_len, size_t n_heads, size_t head_dim, float theta); +} \ No newline at end of file diff --git a/src/ops/rope/cuda/rope_cuda.cu b/src/ops/rope/cuda/rope_cuda.cu new file mode 100644 index 00000000..ddc4a5c9 --- /dev/null +++ b/src/ops/rope/cuda/rope_cuda.cu @@ -0,0 +1,49 @@ +#include "rope_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include + +namespace llaisys::ops::cuda { + +__global__ void rope_kernel(void *out, const void *in, const int64_t *pos_ids, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, float theta) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t half_dim = head_dim / 2; + size_t total = seq_len * n_heads * half_dim; + if (idx >= total) { + return; + } + + size_t d = idx % half_dim; + size_t h = (idx / half_dim) % n_heads; + size_t s = idx / (half_dim * n_heads); + + float pos = static_cast(pos_ids[s]); + float phi = pos / powf(theta, 2.0f * static_cast(d) / 
static_cast<float>(head_dim)); + float c = cosf(phi); + float sn = sinf(phi); + + size_t base = (s * n_heads + h) * head_dim; + float a = load_as_float(in, base + d, type); + float b = load_as_float(in, base + half_dim + d, type); + + store_from_float(out, base + d, a * c - b * sn, type); + store_from_float(out, base + half_dim + d, b * c + a * sn, type); +} + +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, float theta) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + "rope cuda only supports F32/F16/BF16"); + size_t total = seq_len * n_heads * (head_dim / 2); + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + rope_kernel<<<grid, block>>>(out, in, reinterpret_cast<const int64_t *>(pos_ids), type, seq_len, n_heads, head_dim, + theta); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "rope cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/rope/cuda/rope_cuda.cuh b/src/ops/rope/cuda/rope_cuda.cuh new file mode 100644 index 00000000..38e3aae8 --- /dev/null +++ b/src/ops/rope/cuda/rope_cuda.cuh @@ -0,0 +1,10 @@ +#pragma once + +#include "llaisys.h" + +#include <cstddef> + +namespace llaisys::ops::cuda { +void rope(std::byte *out, const std::byte *in, const std::byte *pos_ids, llaisysDataType_t type, size_t seq_len, + size_t n_heads, size_t head_dim, float theta); +} diff --git a/src/ops/rope/op.cpp b/src/ops/rope/op.cpp index d60dbe64..02ced154 100644 --- a/src/ops/rope/op.cpp +++ b/src/ops/rope/op.cpp @@ -1,7 +1,59 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" + +#include "cpu/rope_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/rope_cuda.cuh" +#endif + namespace llaisys::ops { void rope(tensor_t out, tensor_t in, tensor_t pos_ids, float theta) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(out, in, pos_ids); + + CHECK_SAME_DTYPE(out->dtype(), 
in->dtype()); + ASSERT(pos_ids->dtype() == LLAISYS_DTYPE_I64, "RoPE: pos_ids must be int64"); + + + ASSERT(out->isContiguous() && in->isContiguous() && pos_ids->isContiguous(), + "RoPE: all tensors must be contiguous"); + + + ASSERT(in->ndim() == 3, "RoPE: input must be 3D tensor [seq_len, n_heads, head_dim]"); + ASSERT(out->ndim() == 3, "RoPE: output must be 3D tensor [seq_len, n_heads, head_dim]"); + ASSERT(pos_ids->ndim() == 1, "RoPE: pos_ids must be 1D tensor [seq_len]"); + + + size_t seq_len = in->shape()[0]; + size_t n_heads = in->shape()[1]; + size_t head_dim = in->shape()[2]; + + ASSERT(out->shape()[0] == seq_len, "RoPE: output seq_len must match input"); + ASSERT(out->shape()[1] == n_heads, "RoPE: output n_heads must match input"); + ASSERT(out->shape()[2] == head_dim, "RoPE: output head_dim must match input"); + ASSERT(pos_ids->shape()[0] == seq_len, "RoPE: pos_ids length must match seq_len"); + ASSERT(head_dim % 2 == 0, "RoPE: head_dim must be even"); + + + if (out->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::rope(out->data(), in->data(), pos_ids->data(), + out->dtype(), seq_len, n_heads, head_dim, theta); + } + + llaisys::core::context().setDevice(out->deviceType(), out->deviceId()); + + switch (out->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::rope(out->data(), in->data(), pos_ids->data(), + out->dtype(), seq_len, n_heads, head_dim, theta); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::rope(out->data(), in->data(), pos_ids->data(), out->dtype(), seq_len, n_heads, head_dim, theta); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git a/src/ops/self_attention/cpu/self_attention_cpu.cpp b/src/ops/self_attention/cpu/self_attention_cpu.cpp new file mode 100644 index 00000000..7b46a057 --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.cpp @@ -0,0 +1,136 @@ +#include "self_attention_cpu.hpp" + +#include "../../../utils.hpp" + +#include +#include 
+#include + +template +void self_attention_(T *attn_val, const T *q, const T *k, const T *v, + size_t seqlen, size_t totlen, size_t nh, size_t nkvh, size_t d, size_t dv, float scale) { + auto shared = nh/nkvh ; + if constexpr( std::is_same_v || std::is_same_v) + for(size_t s = 0 ; s(baseq[d_])*llaisys::utils::cast(basek[d_]); + } + attn[t] *= scale; + if(t > s + totlen - seqlen){ + attn[t] = -std::numeric_limits::infinity(); + } + } + float max_attn = -std::numeric_limits::infinity(); + for (size_t t= 0; t < totlen; t++) { + if (attn[t] > max_attn) { + max_attn = attn[t]; + } + } + float sum_exp = 0.0f; + for (size_t t = 0; t < totlen; t++) { + attn[t] = std::exp(attn[t] - max_attn); + sum_exp += attn[t]; + } + for (size_t t = 0; t < totlen; t++) { + attn[t] /= sum_exp; + } + for (size_t dv_ = 0;dv_ < dv;dv_++){ + float val = 0; + for(size_t t = 0; t < totlen; t++){ + auto basev = v+ t* nkvh * dv + kvhead * dv; + val+=llaisys::utils::cast( basev[dv_])*attn[t]; + } + baseout[dv_]=llaisys::utils::cast(val); + } + delete[]attn; + } + } + else{ + for(size_t s = 0 ; s(baseq[d_])*static_cast(basek[d_]); + } + attn[t]*= scale; + if(t > s + totlen - seqlen){ + attn[t] = -std::numeric_limits::infinity(); + } + } + float max_attn = -std::numeric_limits::infinity(); + for (size_t t= 0; t < totlen; t++) { + if (attn[t] > max_attn) { + max_attn = attn[t]; + } + } + float sum_exp = 0.0f; + for (size_t t = 0; t < totlen; t++) { + attn[t] = std::exp(attn[t] - max_attn); + sum_exp += attn[t]; + } + for (size_t t = 0; t < totlen; t++) { + attn[t] /= sum_exp; + } + for (size_t dv_ = 0;dv_ < dv;dv_++){ + float val = 0; + for(size_t t = 0; t < totlen; t++){ + auto basev = v+ t* nkvh * dv + kvhead * dv; + val+=static_cast( basev[dv_])*attn[t]; + } + baseout[dv_]=llaisys::utils::cast(val); + } + delete[]attn; + } + } + } + } + + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, 
size_t seqlen, size_t totlen, size_t nh, size_t nkvh, + size_t d, size_t dv , float scale) { + switch (type) { + case LLAISYS_DTYPE_F32: + return self_attention_( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, totlen, nh, nkvh, d, dv , scale + ); + case LLAISYS_DTYPE_BF16: + return self_attention_( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, totlen, nh, nkvh, d, dv , scale + ); + case LLAISYS_DTYPE_F16: + return self_attention_( + reinterpret_cast(attn_val), + reinterpret_cast(q), + reinterpret_cast(k), + reinterpret_cast(v), + seqlen, totlen, nh, nkvh, d, dv , scale + ); + default: + EXCEPTION_UNSUPPORTED_DATATYPE(type); + } +} +} // namespace llaisys::ops::cpu \ No newline at end of file diff --git a/src/ops/self_attention/cpu/self_attention_cpu.hpp b/src/ops/self_attention/cpu/self_attention_cpu.hpp new file mode 100644 index 00000000..f7b5be9a --- /dev/null +++ b/src/ops/self_attention/cpu/self_attention_cpu.hpp @@ -0,0 +1,10 @@ +#pragma once +#include "llaisys.h" + +#include + +namespace llaisys::ops::cpu { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, + size_t d, size_t dv, float scale); +} diff --git a/src/ops/self_attention/cuda/self_attention_cuda.cu b/src/ops/self_attention/cuda/self_attention_cuda.cu new file mode 100644 index 00000000..be9dfa0a --- /dev/null +++ b/src/ops/self_attention/cuda/self_attention_cuda.cu @@ -0,0 +1,91 @@ +#include "self_attention_cuda.cuh" + +#include "../../../utils.hpp" +#include "../../../utils/check.hpp" + +#include + +namespace llaisys::ops::cuda { + +__global__ void self_attention_kernel(void *attn_val, const void *q, const void *k, const void *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, + size_t d, size_t dv, float scale) { 
+ size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total = seqlen * nh * dv; + if (idx >= total) { + return; + } + + size_t dv_i = idx % dv; + size_t h = (idx / dv) % nh; + size_t s = idx / (dv * nh); + + size_t shared = nh / nkvh; + size_t kvh = h / shared; + + float max_score = -INFINITY; + for (size_t t = 0; t < totlen; ++t) { + float score = 0.0f; + size_t qbase = (s * nh + h) * d; + size_t kbase = (t * nkvh + kvh) * d; + for (size_t di = 0; di < d; ++di) { + score += load_as_float(q, qbase + di, type) * load_as_float(k, kbase + di, type); + } + score *= scale; + if (t > s + totlen - seqlen) { + score = -INFINITY; + } + if (score > max_score) { + max_score = score; + } + } + + float sum_exp = 0.0f; + for (size_t t = 0; t < totlen; ++t) { + float score = 0.0f; + size_t qbase = (s * nh + h) * d; + size_t kbase = (t * nkvh + kvh) * d; + for (size_t di = 0; di < d; ++di) { + score += load_as_float(q, qbase + di, type) * load_as_float(k, kbase + di, type); + } + score *= scale; + if (t > s + totlen - seqlen) { + score = -INFINITY; + } + sum_exp += expf(score - max_score); + } + + float out_val = 0.0f; + for (size_t t = 0; t < totlen; ++t) { + float score = 0.0f; + size_t qbase = (s * nh + h) * d; + size_t kbase = (t * nkvh + kvh) * d; + for (size_t di = 0; di < d; ++di) { + score += load_as_float(q, qbase + di, type) * load_as_float(k, kbase + di, type); + } + score *= scale; + if (t > s + totlen - seqlen) { + score = -INFINITY; + } + float p = expf(score - max_score) / sum_exp; + size_t vbase = (t * nkvh + kvh) * dv; + out_val += p * load_as_float(v, vbase + dv_i, type); + } + + store_from_float(attn_val, idx, out_val, type); +} + +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, size_t d, size_t dv, + float scale) { + ASSERT(type == LLAISYS_DTYPE_F32 || type == LLAISYS_DTYPE_F16 || type == LLAISYS_DTYPE_BF16, + 
"self_attention cuda only supports F32/F16/BF16"); + size_t total = seqlen * nh * dv; + dim3 block(128); + dim3 grid((total + block.x - 1) / block.x); + self_attention_kernel<<>>(attn_val, q, k, v, type, seqlen, totlen, nh, nkvh, d, dv, scale); + auto err = cudaGetLastError(); + ASSERT(err == cudaSuccess, "self_attention cuda kernel launch failed"); +} + +} // namespace llaisys::ops::cuda diff --git a/src/ops/self_attention/cuda/self_attention_cuda.cuh b/src/ops/self_attention/cuda/self_attention_cuda.cuh new file mode 100644 index 00000000..cf2a90bc --- /dev/null +++ b/src/ops/self_attention/cuda/self_attention_cuda.cuh @@ -0,0 +1,11 @@ +#pragma once + +#include "llaisys.h" + +#include + +namespace llaisys::ops::cuda { +void self_attention(std::byte *attn_val, const std::byte *q, const std::byte *k, const std::byte *v, + llaisysDataType_t type, size_t seqlen, size_t totlen, size_t nh, size_t nkvh, size_t d, size_t dv, + float scale); +} diff --git a/src/ops/self_attention/op.cpp b/src/ops/self_attention/op.cpp index 43d62014..3aed732c 100644 --- a/src/ops/self_attention/op.cpp +++ b/src/ops/self_attention/op.cpp @@ -1,7 +1,68 @@ #include "op.hpp" +#include "../../core/llaisys_core.hpp" +#include "../../utils.hpp" +#include "cpu/self_attention_cpu.hpp" +#ifdef ENABLE_NVIDIA_API +#include "cuda/self_attention_cuda.cuh" +#endif namespace llaisys::ops { void self_attention(tensor_t attn_val, tensor_t q, tensor_t k, tensor_t v, float scale) { - TO_BE_IMPLEMENTED(); + CHECK_SAME_DEVICE(attn_val, q, k, v); + + // Check data types + CHECK_SAME_DTYPE(attn_val->dtype(), q->dtype(), k->dtype(), v->dtype()); + + // Check contiguity + ASSERT(attn_val->isContiguous() && q->isContiguous() && k->isContiguous() && v->isContiguous(), + "Self-Attention: all tensors must be contiguous"); + + // Check dimensions + ASSERT(q->ndim() == 3, "Self-Attention: q must be 3D tensor [seqlen, nh, d]"); + ASSERT(k->ndim() == 3, "Self-Attention: k must be 3D tensor [totlen, nkvh, d]"); + 
ASSERT(v->ndim() == 3, "Self-Attention: v must be 3D tensor [totlen, nkvh, dv]"); + ASSERT(attn_val->ndim() == 3, "Self-Attention: attn_val must be 3D tensor [seqlen, nh, dv]"); + + // Get dimensions + size_t seqlen = q->shape()[0]; + size_t nh = q->shape()[1]; // number of query heads + size_t d = q->shape()[2]; // head dimension + size_t dv = v->shape()[2]; + size_t totlen = k->shape()[0]; // key/value sequence length + size_t nkvh = k->shape()[1]; // number of key/value heads + + // Check shapes are compatible + ASSERT(attn_val->shape()[0] == seqlen, "Self-Attention: attn_val seq_len must match q"); + ASSERT(attn_val->shape()[1] == nh, "Self-Attention: attn_val heads must match q"); + ASSERT(attn_val->shape()[2] == dv, "Self-Attention: attn_val head_dim must match v"); + + ASSERT(k->shape()[2] == d, "Self-Attention: k head_dim must match q"); + ASSERT(v->shape()[0] == totlen, "Self-Attention: v seq_len must match k"); + ASSERT(v->shape()[1] == nkvh, "Self-Attention: v heads must match k"); + + // Check Group Query Attention compatibility + ASSERT(nh % nkvh == 0, "Self-Attention: query heads must be divisible by key/value heads"); + + // always support cpu calculation + if (attn_val->deviceType() == LLAISYS_DEVICE_CPU) { + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, totlen, nh, nkvh, d,dv, scale); + } + + llaisys::core::context().setDevice(attn_val->deviceType(), attn_val->deviceId()); + + switch (attn_val->deviceType()) { + case LLAISYS_DEVICE_CPU: + return cpu::self_attention(attn_val->data(), q->data(), k->data(), v->data(), + attn_val->dtype(), seqlen, totlen, nh, nkvh, d,dv, scale); +#ifdef ENABLE_NVIDIA_API + case LLAISYS_DEVICE_NVIDIA: + return cuda::self_attention(attn_val->data(), q->data(), k->data(), v->data(), attn_val->dtype(), seqlen, + totlen, nh, nkvh, d, dv, scale); + return; +#endif + default: + EXCEPTION_UNSUPPORTED_DEVICE; + } } } // namespace llaisys::ops diff --git 
a/src/ops/swiglu/cpu/swiglu_cpu.cpp b/src/ops/swiglu/cpu/swiglu_cpu.cpp
new file mode 100644
index 00000000..d2b452bd
--- /dev/null
+++ b/src/ops/swiglu/cpu/swiglu_cpu.cpp
@@ -0,0 +1,52 @@
+#include "swiglu_cpu.hpp"
+
+#include "../../../utils.hpp"
+
+#include
+
+// SwiGLU: out_i = up_i * silu(gate_i)
+// where silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
+// (the formula x / (1 + exp(-x)) is SiLU/Swish, not sigmoid, which is 1 / (1 + exp(-x)))
+template