diff --git a/.claude/skills/nsys-profiling/SKILL.md b/.claude/skills/nsys-profiling/SKILL.md index 337fc8d7..c118f220 100644 --- a/.claude/skills/nsys-profiling/SKILL.md +++ b/.claude/skills/nsys-profiling/SKILL.md @@ -79,7 +79,7 @@ For profiling a server that's already running, use the launch/start/stop workflo ```bash # Terminal 1: launch the server under nsys control, but don't start collecting yet nsys launch --trace=cuda,nvtx --cuda-graph-trace=node \ - --session-new=my_session -- cargo run -r --bin pegainfer-server -- ... + --session-new=my_session -- cargo run -r --bin openinfer-server -- ... # Terminal 2: start/stop collection on demand nsys start --session=my_session diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5e88cb1e..d3d493fe 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -35,4 +35,4 @@ jobs: run: cargo metadata --locked --no-deps --format-version 1 - name: Run simulated frontend e2e tests - run: cargo test --release -p pegainfer-sim --test frontend_e2e + run: cargo test --release -p openinfer-sim --test frontend_e2e diff --git a/.gitmodules b/.gitmodules index a8ad50c1..7d91b809 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ -[submodule "pegainfer-kernels/third_party/flashinfer"] - path = pegainfer-kernels/third_party/flashinfer +[submodule "openinfer-kernels/third_party/flashinfer"] + path = openinfer-kernels/third_party/flashinfer url = https://github.com/flashinfer-ai/flashinfer -[submodule "pegainfer-kernels/third_party/DeepEP"] - path = pegainfer-kernels/third_party/DeepEP +[submodule "openinfer-kernels/third_party/DeepEP"] + path = openinfer-kernels/third_party/DeepEP url = https://github.com/deepseek-ai/DeepEP diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1a1ab018..618f0d6d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: - id: clippy args: [--release, --all-targets] - # The hook above lints `default-members` 
(pegainfer-server) with default + # The hook above lints `default-members` (openinfer-server) with default # features, so the feature-gated kimi-k2 crate is never compiled and never # linted. This local hook closes that hole — but only when kimi-k2 source # changes (the CUDA build is expensive), and `--no-deps` keeps it scoped to @@ -27,24 +27,24 @@ repos: hooks: - id: clippy-kimi-k2 name: clippy (kimi-k2, -D warnings) - entry: cargo clippy -p pegainfer-kimi-k2 --no-deps --release --features kimi-k2,kernel-report --all-targets -- -D warnings + entry: cargo clippy -p openinfer-kimi-k2 --no-deps --release --features kimi-k2,kernel-report --all-targets -- -D warnings language: system types: [rust] - files: ^pegainfer-kimi-k2/ + files: ^openinfer-kimi-k2/ pass_filenames: false # The kimi-k2 hook above is --no-deps, so the kimi code living in - # pegainfer-kernels (ops/kimi_k2 + csrc) is never linted by it. + # openinfer-kernels (ops/kimi_k2 + csrc) is never linted by it. - id: clippy-kernels-kimi name: clippy (kernels w/ kimi-k2, -D warnings) - entry: cargo clippy -p pegainfer-kernels --no-deps --release --features kimi-k2 --all-targets -- -D warnings + entry: cargo clippy -p openinfer-kernels --no-deps --release --features kimi-k2 --all-targets -- -D warnings language: system types: [rust] - files: ^pegainfer-kernels/ + files: ^openinfer-kernels/ pass_filenames: false - id: clippy-comm - name: clippy (pegainfer-comm, -D warnings) - entry: cargo clippy -p pegainfer-comm --no-deps --release --all-targets -- -D warnings + name: clippy (openinfer-comm, -D warnings) + entry: cargo clippy -p openinfer-comm --no-deps --release --all-targets -- -D warnings language: system types: [rust] - files: ^pegainfer-comm/ + files: ^openinfer-comm/ pass_filenames: false diff --git a/CLAUDE.md b/CLAUDE.md index f9e81f00..c7c30107 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,6 +1,6 @@ This file provides guidance to Coding Agent when working with code in this repository. 
-## What is pegainfer +## What is openinfer Pure Rust + CUDA LLM inference engine (~83K Rust, ~11K CUDA). No PyTorch, no frameworks. OpenAI-compatible `/v1/completions` API. @@ -8,11 +8,11 @@ Pure Rust + CUDA LLM inference engine (~83K Rust, ~11K CUDA). No PyTorch, no fra | Model | Crate | Feature flag | Architecture | |-------|-------|-------------|-------------| -| Qwen3-4B / 8B | `pegainfer-qwen3-4b` | always built | Full attention, TP support | -| Qwen3.5-4B | `pegainfer-qwen35-4b` | always built | 24 linear + 8 full attention | -| DeepSeek-V4 | `pegainfer-deepseek-v4` | `--features deepseek-v4` | MoE + compressor + indexer, 8-GPU | -| DeepSeek-V2-Lite | `pegainfer-deepseek-v2-lite` | `--features deepseek-v2-lite` | MoE + EP, 2-GPU | -| Kimi-K2 | `pegainfer-kimi-k2` | `--features kimi-k2` | MLA + MoE + Marlin INT4, 8-GPU EP | +| Qwen3-4B / 8B | `openinfer-qwen3-4b` | always built | Full attention, TP support | +| Qwen3.5-4B | `openinfer-qwen35-4b` | always built | 24 linear + 8 full attention | +| DeepSeek-V4 | `openinfer-deepseek-v4` | `--features deepseek-v4` | MoE + compressor + indexer, 8-GPU | +| DeepSeek-V2-Lite | `openinfer-deepseek-v2-lite` | `--features deepseek-v2-lite` | MoE + EP, 2-GPU | +| Kimi-K2 | `openinfer-kimi-k2` | `--features kimi-k2` | MLA + MoE + Marlin INT4, 8-GPU EP | ## Build & Run @@ -28,11 +28,11 @@ cargo run --release --features deepseek-v4 -- --model-path models/DeepSeek-V4 ``` **Key env vars:** -- `PEGAINFER_CUDA_SM` — GPU SM target override when `nvidia-smi` unavailable (e.g. `120` or `120,80`) -- `PEGAINFER_TRITON_PYTHON` — Python with Triton for build-time AOT kernel generation -- `PEGAINFER_TEST_MODEL_PATH` — override test model path (default: `models/Qwen3-4B`) -- `PEGAINFER_BUILD_TIMING=1` — print per-phase build timings (nvcc, Triton AOT, etc.) -- `PEGAINFER_NVCC_JOBS` — override parallel nvcc job count +- `OPENINFER_CUDA_SM` — GPU SM target override when `nvidia-smi` unavailable (e.g. 
`120` or `120,80`) +- `OPENINFER_TRITON_PYTHON` — Python with Triton for build-time AOT kernel generation +- `OPENINFER_TEST_MODEL_PATH` — override test model path (default: `models/Qwen3-4B`) +- `OPENINFER_BUILD_TIMING=1` — print per-phase build timings (nvcc, Triton AOT, etc.) +- `OPENINFER_NVCC_JOBS` — override parallel nvcc job count ## Tests @@ -41,9 +41,9 @@ cargo run --release --features deepseek-v4 -- --model-path models/DeepSeek-V4 cargo test --release --workspace --lib # Accuracy and integration tests — require GPU + model weights -PEGAINFER_TEST_MODEL_PATH=models/Qwen3-4B cargo test --release -p pegainfer-qwen3-4b --test hf_golden_gate -PEGAINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test hf_golden_gate -PEGAINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test e2e_scheduler +OPENINFER_TEST_MODEL_PATH=models/Qwen3-4B cargo test --release -p openinfer-qwen3-4b --test hf_golden_gate +OPENINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test hf_golden_gate +OPENINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test e2e_scheduler # Single test cargo test --release embedding_variants -- --nocapture @@ -58,13 +58,13 @@ HTTP Request → vLLM frontend → EngineHandle → per-model scheduler/executor │ ┌──────────────┬─────────────────┼─────────────────┬──────────────┐ │ │ │ │ │ - pegainfer- pegainfer- pegainfer- pegainfer- pegainfer- + openinfer- openinfer- openinfer- openinfer- openinfer- qwen3-4b qwen35-4b deepseek-v4 deepseek-v2- kimi-k2 (full attn) (linear+full) (MoE+indexer) lite (MoE+EP) (MLA+MoE) │ │ │ │ │ └──────────────┴─────────────────┼─────────────────┴──────────────┘ │ - pegainfer-core runtime + pegainfer-kernels + openinfer-core runtime + openinfer-kernels │ ┌────────────────┼────────────────┐ │ │ │ @@ -75,17 +75,17 @@ HTTP Request → vLLM frontend → EngineHandle → per-model scheduler/executor **Key 
abstractions:** -- **`pegainfer-core::engine`** — shared request/event contract (`EngineHandle`, `GenerateRequest`, `TokenEvent`) used by the server and model crates. +- **`openinfer-core::engine`** — shared request/event contract (`EngineHandle`, `GenerateRequest`, `TokenEvent`) used by the server and model crates. - **Per-model crates** — each model owns config, weights, prefill/decode execution, scheduler, tests, and benches. -- **`pegainfer-core::ops`** — shared GPU operator wrappers used by model crates. -- **`pegainfer-kernels`** — tensor/FFI/kernel build owner for CUDA, cuBLAS, FlashInfer, and Triton AOT. Model-specific kernels live in feature-gated submodules (`kimi_k2`, `deepseek_v4`). -- **`pegainfer-comm`** — EP all-to-all communication (GDR, NCCL, IB verbs). Requires CUDA + RDMA hardware to compile. +- **`openinfer-core::ops`** — shared GPU operator wrappers used by model crates. +- **`openinfer-kernels`** — tensor/FFI/kernel build owner for CUDA, cuBLAS, FlashInfer, and Triton AOT. Model-specific kernels live in feature-gated submodules (`kimi_k2`, `deepseek_v4`). +- **`openinfer-comm`** — EP all-to-all communication (GDR, NCCL, IB verbs). Requires CUDA + RDMA hardware to compile. - **CUDA Graph** — decode path captured inside model executors with pre-allocated buffers to preserve pointer stability. -- **KV state** — model schedulers own request state; shared paged-KV primitives live in `pegainfer-core`. +- **KV state** — model schedulers own request state; shared paged-KV primitives live in `openinfer-core`. -**Build system**: the virtual workspace root has no package build script. `pegainfer-kernels/build.rs` owns CUDA/Triton compilation: -1. Compiles `pegainfer-kernels/csrc/*.cu` with nvcc (auto-detects GPU SM targets) -2. Runs Triton AOT via `pegainfer-kernels/tools/triton/gen_triton_aot.py` for Qwen3.5 kernels +**Build system**: the virtual workspace root has no package build script. `openinfer-kernels/build.rs` owns CUDA/Triton compilation: +1. 
Compiles `openinfer-kernels/csrc/*.cu` with nvcc (auto-detects GPU SM targets) +2. Runs Triton AOT via `openinfer-kernels/tools/triton/gen_triton_aot.py` for Qwen3.5 kernels 3. Feature-gated: `deepseek-v4` triggers TileLang + CuTe DSL codegen; `kimi-k2` adds MLA/MoE/Marlin CUDA --- diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f76cbde7..0c9bf066 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -# Contributing to pegainfer +# Contributing to openinfer First off, thank you for considering contributing to our project! It's people like you that make this community great. diff --git a/Cargo.lock b/Cargo.lock index 611d1125..fdb8506a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3897,230 +3897,18 @@ dependencies = [ ] [[package]] -name = "openssl" -version = "0.10.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542" -dependencies = [ - "bitflags 2.11.1", - "cfg-if", - "foreign-types", - "libc", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - -[[package]] -name = "openssl-probe" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" - -[[package]] -name = "openssl-sys" -version = "0.9.115" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781" -dependencies = [ - "cc", - "libc", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "opentelemetry" -version = "0.31.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" -dependencies = [ - "futures-core", - "futures-sink", - "js-sys", - "pin-project-lite", - "thiserror 2.0.18", - "tracing", -] - -[[package]] -name = "option-ext" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" - -[[package]] -name = "ordered-float" -version = "4.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" -dependencies = [ - "num-traits", -] - -[[package]] -name = "os_info" -version = "3.15.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cf20a545b305cf1da722b236b5155c9bb35f1d5ceb28c048bd96ca842f41b5b" -dependencies = [ - "android_system_properties", - "log", - "nix 0.31.3", - "objc2", - "objc2-foundation", - "objc2-ui-kit", - "serde", - "windows-sys 0.61.2", -] - -[[package]] -name = "page_size" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "parking" -version = "2.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" - -[[package]] -name = "parking_lot" -version = "0.12.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-link", -] - 
-[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - -[[package]] -name = "pastey" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" - -[[package]] -name = "pcre2" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67" -dependencies = [ - "libc", - "log", - "pcre2-sys", -] - -[[package]] -name = "pcre2-sys" -version = "0.2.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - -[[package]] -name = "pegaflow-common" -version = "0.22.6" -source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" -dependencies = [ - "colored", - "libc", - "log", - "logforth", -] - -[[package]] -name = "pegaflow-core" -version = "0.22.6" -source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" -dependencies = [ - "ahash", - "bytesize", - "cudarc", - "dashmap", - "futures", - "hashlink", - "io-uring", - "libc", - "log", - "logforth", - "mea", - "offset-allocator", - "opentelemetry", - "parking_lot", - "pegaflow-common", - "pegaflow-proto", - "rand 0.10.1", - "shared_memory", - "smallvec", - "tokio", - "tonic", - "uuid", -] - -[[package]] -name = "pegaflow-proto" -version = "0.22.6" -source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" -dependencies = [ - "prost", - "tonic", - "tonic-prost", - "tonic-prost-build", -] - 
-[[package]] -name = "pegainfer-bench" +name = "openinfer-bench" version = "0.1.0" dependencies = [ "anyhow", "cudarc", - "pegainfer-kernels", + "openinfer-kernels", "serde", "serde_json", ] [[package]] -name = "pegainfer-comm" +name = "openinfer-comm" version = "0.0.0" dependencies = [ "anyhow", @@ -4128,71 +3916,71 @@ dependencies = [ "cudarc", "half", "log", - "pegainfer-comm-cuda-lib", - "pegainfer-comm-fabric-lib", - "pegainfer-comm-p2p-all-to-all", - "pegainfer-core", + "openinfer-comm-cuda-lib", + "openinfer-comm-fabric-lib", + "openinfer-comm-p2p-all-to-all", + "openinfer-core", "thiserror 2.0.18", ] [[package]] -name = "pegainfer-comm-a2a-kernels" +name = "openinfer-comm-a2a-kernels" version = "0.0.0" dependencies = [ "cc", "cxx", "cxx-build", - "pegainfer-comm-build-utils", + "openinfer-comm-build-utils", ] [[package]] -name = "pegainfer-comm-build-utils" +name = "openinfer-comm-build-utils" version = "0.0.0" [[package]] -name = "pegainfer-comm-cuda-lib" +name = "openinfer-comm-cuda-lib" version = "0.0.0" dependencies = [ "bincode 2.0.1", "libc", - "pegainfer-comm-cuda-sys", - "pegainfer-comm-cudart-sys", - "pegainfer-comm-gdrapi-sys", - "pegainfer-comm-proc-lib", + "openinfer-comm-cuda-sys", + "openinfer-comm-cudart-sys", + "openinfer-comm-gdrapi-sys", + "openinfer-comm-proc-lib", "thiserror 2.0.18", ] [[package]] -name = "pegainfer-comm-cuda-sys" +name = "openinfer-comm-cuda-sys" version = "0.0.0" dependencies = [ "bindgen 0.72.1", - "pegainfer-comm-build-utils", + "openinfer-comm-build-utils", ] [[package]] -name = "pegainfer-comm-cudart-sys" +name = "openinfer-comm-cudart-sys" version = "0.0.0" dependencies = [ "bindgen 0.72.1", - "pegainfer-comm-build-utils", + "openinfer-comm-build-utils", ] [[package]] -name = "pegainfer-comm-fabric-debug" +name = "openinfer-comm-fabric-debug" version = "0.0.0" dependencies = [ "anyhow", "bytes", - "pegainfer-comm-cuda-lib", - "pegainfer-comm-fabric-lib", - "pegainfer-comm-logging-lib", + "openinfer-comm-cuda-lib", 
+ "openinfer-comm-fabric-lib", + "openinfer-comm-logging-lib", "postcard", "serde", ] [[package]] -name = "pegainfer-comm-fabric-lib" +name = "openinfer-comm-fabric-lib" version = "0.0.0" dependencies = [ "anyhow", @@ -4203,10 +3991,10 @@ dependencies = [ "mockall", "once_cell", "oneshot", + "openinfer-comm-cuda-lib", + "openinfer-comm-libibverbs-sys", + "openinfer-comm-thread-lib", "parking_lot", - "pegainfer-comm-cuda-lib", - "pegainfer-comm-libibverbs-sys", - "pegainfer-comm-thread-lib", "postcard", "serde", "smallvec", @@ -4217,24 +4005,24 @@ dependencies = [ ] [[package]] -name = "pegainfer-comm-gdrapi-sys" +name = "openinfer-comm-gdrapi-sys" version = "0.0.0" dependencies = [ "bindgen 0.72.1", - "pegainfer-comm-build-utils", + "openinfer-comm-build-utils", ] [[package]] -name = "pegainfer-comm-libibverbs-sys" +name = "openinfer-comm-libibverbs-sys" version = "0.0.0" dependencies = [ "bindgen 0.72.1", "cc", - "pegainfer-comm-build-utils", + "openinfer-comm-build-utils", ] [[package]] -name = "pegainfer-comm-logging-lib" +name = "openinfer-comm-logging-lib" version = "0.0.0" dependencies = [ "anyhow", @@ -4247,41 +4035,41 @@ dependencies = [ ] [[package]] -name = "pegainfer-comm-p2p-all-to-all" +name = "openinfer-comm-p2p-all-to-all" version = "0.0.0" dependencies = [ "anyhow", "nvtx", "oneshot", - "pegainfer-comm-a2a-kernels", - "pegainfer-comm-cuda-lib", - "pegainfer-comm-fabric-lib", - "pegainfer-comm-thread-lib", + "openinfer-comm-a2a-kernels", + "openinfer-comm-cuda-lib", + "openinfer-comm-fabric-lib", + "openinfer-comm-thread-lib", "tracing", ] [[package]] -name = "pegainfer-comm-proc-lib" +name = "openinfer-comm-proc-lib" version = "0.0.0" dependencies = [ - "pegainfer-comm-cudart-sys", + "openinfer-comm-cudart-sys", "proc-macro2", "quote", ] [[package]] -name = "pegainfer-comm-python-ext" +name = "openinfer-comm-python-ext" version = "0.0.0" dependencies = [ "bincode 2.0.1", "bytes", + "openinfer-comm-cuda-lib", + "openinfer-comm-fabric-lib", + 
"openinfer-comm-logging-lib", + "openinfer-comm-p2p-all-to-all", + "openinfer-comm-thread-lib", + "openinfer-comm-torch-lib", "parking_lot", - "pegainfer-comm-cuda-lib", - "pegainfer-comm-fabric-lib", - "pegainfer-comm-logging-lib", - "pegainfer-comm-p2p-all-to-all", - "pegainfer-comm-thread-lib", - "pegainfer-comm-torch-lib", "postcard", "pyo3", "serde", @@ -4289,7 +4077,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-comm-thread-lib" +name = "openinfer-comm-thread-lib" version = "0.0.0" dependencies = [ "libc", @@ -4298,18 +4086,18 @@ dependencies = [ ] [[package]] -name = "pegainfer-comm-torch-lib" +name = "openinfer-comm-torch-lib" version = "0.0.0" dependencies = [ "cxx", "cxx-build", - "pegainfer-comm-cuda-lib", + "openinfer-comm-cuda-lib", "pkg-config", "pyo3", ] [[package]] -name = "pegainfer-core" +name = "openinfer-core" version = "0.1.0" dependencies = [ "anyhow", @@ -4320,23 +4108,23 @@ dependencies = [ "log", "logforth", "memmap2", + "openinfer-engine", + "openinfer-kernels", "parking_lot", - "pegainfer-engine", - "pegainfer-kernels", "safetensors", "serde_json", "tokio", ] [[package]] -name = "pegainfer-cupti" +name = "openinfer-cupti" version = "0.1.0" dependencies = [ "cc", ] [[package]] -name = "pegainfer-deepseek-v2-lite" +name = "openinfer-deepseek-v2-lite" version = "0.1.0" dependencies = [ "anyhow", @@ -4346,8 +4134,8 @@ dependencies = [ "libloading 0.9.0", "memmap2", "nvtx", - "pegainfer-core", - "pegainfer-engine", + "openinfer-core", + "openinfer-engine", "safetensors", "serde", "serde_json", @@ -4357,7 +4145,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-deepseek-v4" +name = "openinfer-deepseek-v4" version = "0.1.0" dependencies = [ "anyhow", @@ -4368,9 +4156,9 @@ dependencies = [ "libc", "log", "memmap2", - "pegainfer-comm", - "pegainfer-core", - "pegainfer-kernels", + "openinfer-comm", + "openinfer-core", + "openinfer-kernels", "safetensors", "serde", "serde_json", @@ -4379,14 +4167,14 @@ dependencies = [ ] [[package]] 
-name = "pegainfer-engine" +name = "openinfer-engine" version = "0.1.0" dependencies = [ "tokio", ] [[package]] -name = "pegainfer-kernels" +name = "openinfer-kernels" version = "0.1.0" dependencies = [ "anyhow", @@ -4397,7 +4185,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-kimi-k2" +name = "openinfer-kimi-k2" version = "0.1.0" dependencies = [ "anyhow", @@ -4408,10 +4196,10 @@ dependencies = [ "half", "log", "memmap2", - "pegainfer-bench", - "pegainfer-core", - "pegainfer-kernels", - "pegainfer-kv-cache", + "openinfer-bench", + "openinfer-core", + "openinfer-kernels", + "openinfer-kv-cache", "rand 0.10.1", "safetensors", "serde", @@ -4420,7 +4208,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-kv-cache" +name = "openinfer-kv-cache" version = "0.1.0" dependencies = [ "anyhow", @@ -4428,24 +4216,24 @@ dependencies = [ "dynamo-kv-hashing", "half", "kvbm-logical", - "pegainfer-kernels", + "openinfer-kernels", ] [[package]] -name = "pegainfer-kv-offload" +name = "openinfer-kv-offload" version = "0.1.0" dependencies = [ "anyhow", "cudarc", "half", "log", + "openinfer-kv-cache", "pegaflow-core", - "pegainfer-kv-cache", "tokio", ] [[package]] -name = "pegainfer-qwen3-4b" +name = "openinfer-qwen3-4b" version = "0.1.0" dependencies = [ "anyhow", @@ -4457,13 +4245,13 @@ dependencies = [ "half", "hex", "log", - "pegainfer-bench", - "pegainfer-core", - "pegainfer-cupti", - "pegainfer-kernels", - "pegainfer-kv-cache", - "pegainfer-kv-offload", - "pegainfer-vllm-support", + "openinfer-bench", + "openinfer-core", + "openinfer-cupti", + "openinfer-kernels", + "openinfer-kv-cache", + "openinfer-kv-offload", + "openinfer-vllm-support", "rand 0.10.1", "safetensors", "serde", @@ -4475,7 +4263,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-qwen35-4b" +name = "openinfer-qwen35-4b" version = "0.1.0" dependencies = [ "anyhow", @@ -4484,9 +4272,9 @@ dependencies = [ "fastrace", "half", "log", - "pegainfer-core", - "pegainfer-kernels", - "pegainfer-vllm-support", + 
"openinfer-core", + "openinfer-kernels", + "openinfer-vllm-support", "rand 0.10.1", "safetensors", "serde", @@ -4497,7 +4285,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-server" +name = "openinfer-server" version = "0.1.0" dependencies = [ "anyhow", @@ -4510,14 +4298,14 @@ dependencies = [ "half", "log", "logforth", - "pegainfer-core", - "pegainfer-deepseek-v2-lite", - "pegainfer-deepseek-v4", - "pegainfer-kimi-k2", - "pegainfer-qwen3-4b", - "pegainfer-qwen35-4b", - "pegainfer-vllm-frontend", - "pegainfer-vllm-support", + "openinfer-core", + "openinfer-deepseek-v2-lite", + "openinfer-deepseek-v4", + "openinfer-kimi-k2", + "openinfer-qwen3-4b", + "openinfer-qwen35-4b", + "openinfer-vllm-frontend", + "openinfer-vllm-support", "rand 0.10.1", "serde", "serde_json", @@ -4527,13 +4315,13 @@ dependencies = [ ] [[package]] -name = "pegainfer-sim" +name = "openinfer-sim" version = "0.1.0" dependencies = [ "anyhow", "clap", - "pegainfer-engine", - "pegainfer-vllm-frontend", + "openinfer-engine", + "openinfer-vllm-frontend", "reqwest", "serde_json", "tokio", @@ -4541,13 +4329,13 @@ dependencies = [ ] [[package]] -name = "pegainfer-vllm-frontend" +name = "openinfer-vllm-frontend" version = "0.1.0" dependencies = [ "anyhow", "axum", "log", - "pegainfer-engine", + "openinfer-engine", "rmp-serde", "rmpv", "serde", @@ -4562,7 +4350,7 @@ dependencies = [ ] [[package]] -name = "pegainfer-vllm-support" +name = "openinfer-vllm-support" version = "0.1.0" dependencies = [ "once_cell", @@ -4571,6 +4359,218 @@ dependencies = [ "vllm-tokenizer", ] +[[package]] +name = "openssl" +version = "0.10.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf0b434746ee2832f4f0baf10137e1cabb18cbe6912c69e2e33263c45250f542" +dependencies = [ + "bitflags 2.11.1", + "cfg-if", + "foreign-types", + "libc", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "158fe5b292746440aa6e7a7e690e55aeb72d41505e2804c23c6973ad0e9c9781" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "opentelemetry" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "pin-project-lite", + "thiserror 2.0.18", + "tracing", +] + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "ordered-float" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7bb71e1b3fa6ca1c61f383464aaf2bb0e2f8e772a1f01d486832464de363b951" +dependencies = [ + "num-traits", +] + +[[package]] +name = "os_info" +version = "3.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf20a545b305cf1da722b236b5155c9bb35f1d5ceb28c048bd96ca842f41b5b" +dependencies = [ + "android_system_properties", + "log", + "nix 0.31.3", + "objc2", + "objc2-foundation", + "objc2-ui-kit", + "serde", + "windows-sys 0.61.2", +] + +[[package]] +name = "page_size" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"30d5b2194ed13191c1999ae0704b7839fb18384fa22e49b57eeaa97d79ce40da" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35fb2e5f958ec131621fdd531e9fc186ed768cbe395337403ae56c17a74c68ec" + +[[package]] +name = "pcre2" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e970b0fcce0c7ee6ef662744ff711f21ccd6f11b7cf03cd187a80e89797fc67" +dependencies = [ + "libc", + "log", + "pcre2-sys", +] + +[[package]] +name = "pcre2-sys" +version = "0.2.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18b9073c1a2549bd409bf4a32c94d903bb1a09bf845bc306ae148897fa0760a4" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + +[[package]] +name = "pegaflow-common" +version = "0.22.6" +source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" +dependencies = [ + 
"colored", + "libc", + "log", + "logforth", +] + +[[package]] +name = "pegaflow-core" +version = "0.22.6" +source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" +dependencies = [ + "ahash", + "bytesize", + "cudarc", + "dashmap", + "futures", + "hashlink", + "io-uring", + "libc", + "log", + "logforth", + "mea", + "offset-allocator", + "opentelemetry", + "parking_lot", + "pegaflow-common", + "pegaflow-proto", + "rand 0.10.1", + "shared_memory", + "smallvec", + "tokio", + "tonic", + "uuid", +] + +[[package]] +name = "pegaflow-proto" +version = "0.22.6" +source = "git+https://github.com/novitalabs/pegaflow.git?rev=07cac7e50e8ae7be15ad1b9311401039c9ee439b#07cac7e50e8ae7be15ad1b9311401039c9ee439b" +dependencies = [ + "prost", + "tonic", + "tonic-prost", + "tonic-prost-build", +] + [[package]] name = "pem-rfc7468" version = "1.0.0" diff --git a/Cargo.toml b/Cargo.toml index 0a72b866..4750bbf9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,40 +1,40 @@ [workspace] resolver = "3" -default-members = ["pegainfer-server"] +default-members = ["openinfer-server"] members = [ - "pegainfer-engine", - "pegainfer-vllm-frontend", - "pegainfer-sim", - "pegainfer-vllm-support", - "pegainfer-server", - "pegainfer-core", - "pegainfer-cupti", - "pegainfer-kernels", - "pegainfer-bench", - "pegainfer-deepseek-v4", - "pegainfer-deepseek-v2-lite", - "pegainfer-kimi-k2", - "pegainfer-qwen3-4b", - "pegainfer-qwen35-4b", - "pegainfer-kv-cache", - "pegainfer-kv-offload", - # ---- pegainfer-comm (EP all-to-all) ---- - "pegainfer-comm", - "pegainfer-comm/crates/pegainfer-comm-build-utils", - "pegainfer-comm/crates/pegainfer-comm-cuda-sys", - "pegainfer-comm/crates/pegainfer-comm-cudart-sys", - "pegainfer-comm/crates/pegainfer-comm-gdrapi-sys", - "pegainfer-comm/crates/pegainfer-comm-libibverbs-sys", - "pegainfer-comm/crates/pegainfer-comm-cuda-lib", - "pegainfer-comm/crates/pegainfer-comm-thread-lib", - 
"pegainfer-comm/crates/pegainfer-comm-proc-lib", - "pegainfer-comm/crates/pegainfer-comm-logging-lib", - "pegainfer-comm/crates/pegainfer-comm-torch-lib", - "pegainfer-comm/crates/pegainfer-comm-fabric-lib", - "pegainfer-comm/crates/pegainfer-comm-fabric-debug", - "pegainfer-comm/crates/pegainfer-comm-a2a-kernels", - "pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all", - "pegainfer-comm/crates/pegainfer-comm-python-ext", + "openinfer-engine", + "openinfer-vllm-frontend", + "openinfer-sim", + "openinfer-vllm-support", + "openinfer-server", + "openinfer-core", + "openinfer-cupti", + "openinfer-kernels", + "openinfer-bench", + "openinfer-deepseek-v4", + "openinfer-deepseek-v2-lite", + "openinfer-kimi-k2", + "openinfer-qwen3-4b", + "openinfer-qwen35-4b", + "openinfer-kv-cache", + "openinfer-kv-offload", + # ---- openinfer-comm (EP all-to-all) ---- + "openinfer-comm", + "openinfer-comm/crates/openinfer-comm-build-utils", + "openinfer-comm/crates/openinfer-comm-cuda-sys", + "openinfer-comm/crates/openinfer-comm-cudart-sys", + "openinfer-comm/crates/openinfer-comm-gdrapi-sys", + "openinfer-comm/crates/openinfer-comm-libibverbs-sys", + "openinfer-comm/crates/openinfer-comm-cuda-lib", + "openinfer-comm/crates/openinfer-comm-thread-lib", + "openinfer-comm/crates/openinfer-comm-proc-lib", + "openinfer-comm/crates/openinfer-comm-logging-lib", + "openinfer-comm/crates/openinfer-comm-torch-lib", + "openinfer-comm/crates/openinfer-comm-fabric-lib", + "openinfer-comm/crates/openinfer-comm-fabric-debug", + "openinfer-comm/crates/openinfer-comm-a2a-kernels", + "openinfer-comm/crates/openinfer-comm-p2p-all-to-all", + "openinfer-comm/crates/openinfer-comm-python-ext", # ---- dynamo kvbm (ported, Apache-2.0) ---- "kvbm/dynamo-tokens", "kvbm/dynamo-memory", @@ -126,21 +126,21 @@ opentelemetry_sdk = { version = "0.31.0", features = ["trace", "logs", "rt-tokio ordered-float = "4" oneshot = "0.1.11" parking_lot = "0.12.5" -pegainfer-bench = { path = "pegainfer-bench" } -pegainfer-core = 
{ path = "pegainfer-core" } -pegainfer-kv-cache = { path = "pegainfer-kv-cache" } -pegainfer-kv-offload = { path = "pegainfer-kv-offload" } -pegainfer-cupti = { path = "pegainfer-cupti" } -pegainfer-deepseek-v4 = { path = "pegainfer-deepseek-v4" } -pegainfer-engine = { path = "pegainfer-engine" } -pegainfer-kernels = { path = "pegainfer-kernels" } -pegainfer-kimi-k2 = { path = "pegainfer-kimi-k2" } -pegainfer-qwen3-4b = { path = "pegainfer-qwen3-4b" } -pegainfer-qwen35-4b = { path = "pegainfer-qwen35-4b" } -pegainfer-deepseek-v2-lite = { path = "pegainfer-deepseek-v2-lite" } -pegainfer-vllm-frontend = { path = "pegainfer-vllm-frontend" } -pegainfer-vllm-support = { path = "pegainfer-vllm-support" } -pegainfer-comm = { path = "pegainfer-comm" } +openinfer-bench = { path = "openinfer-bench" } +openinfer-core = { path = "openinfer-core" } +openinfer-kv-cache = { path = "openinfer-kv-cache" } +openinfer-kv-offload = { path = "openinfer-kv-offload" } +openinfer-cupti = { path = "openinfer-cupti" } +openinfer-deepseek-v4 = { path = "openinfer-deepseek-v4" } +openinfer-engine = { path = "openinfer-engine" } +openinfer-kernels = { path = "openinfer-kernels" } +openinfer-kimi-k2 = { path = "openinfer-kimi-k2" } +openinfer-qwen3-4b = { path = "openinfer-qwen3-4b" } +openinfer-qwen35-4b = { path = "openinfer-qwen35-4b" } +openinfer-deepseek-v2-lite = { path = "openinfer-deepseek-v2-lite" } +openinfer-vllm-frontend = { path = "openinfer-vllm-frontend" } +openinfer-vllm-support = { path = "openinfer-vllm-support" } +openinfer-comm = { path = "openinfer-comm" } pkg-config = "0.3.32" prometheus = "0.14" postcard = { version = "1.1.3", features = ["alloc", "use-std"] } diff --git a/README.md b/README.md index e33efbbb..a556cb14 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@

- pegainfer logo + openinfer logo

-

pegainfer

+

openinfer

Pure Rust + CUDA LLM inference engine. No PyTorch. No model framework runtime. @@ -18,7 +18,7 @@ --- -pegainfer is a from-scratch LLM inference engine written in **~9.6K lines of Rust**, **~2.6K lines of CUDA**, and **~1.4K lines of Triton GPU kernels**. No PyTorch, no ONNX, no model framework runtime — just Rust plus CUDA, Triton AOT, and generated compatibility kernels. +openinfer is a from-scratch LLM inference engine written in **~9.6K lines of Rust**, **~2.6K lines of CUDA**, and **~1.4K lines of Triton GPU kernels**. No PyTorch, no ONNX, no model framework runtime — just Rust plus CUDA, Triton AOT, and generated compatibility kernels. The goal is to understand every layer of the inference stack by building it from the ground up, and to explore what a Rust-native inference engine can look like. @@ -63,11 +63,11 @@ huggingface-cli download Qwen/Qwen3-4B --local-dir models/Qwen3-4B # Build & start server on port 8000 export CUDA_HOME=/usr/local/cuda -export PEGAINFER_TRITON_PYTHON=.venv/bin/python +export OPENINFER_TRITON_PYTHON=.venv/bin/python cargo run --release ``` -> **Note**: The server CLI is in `pegainfer-server`. Model crates such as `pegainfer-qwen3-4b`, `pegainfer-qwen35-4b`, and `pegainfer-deepseek-v4` contain model logic and diagnostics but are not server entrypoints. Use `cargo run --release` from the workspace root, or `cargo run --release -p pegainfer-server -- --model-path `. +> **Note**: The server CLI is in `openinfer-server`. Model crates such as `openinfer-qwen3-4b`, `openinfer-qwen35-4b`, and `openinfer-deepseek-v4` contain model logic and diagnostics but are not server entrypoints. Use `cargo run --release` from the workspace root, or `cargo run --release -p openinfer-server -- --model-path `. 
```bash # Try it @@ -92,7 +92,7 @@ cargo run --release -- --model-path models/Qwen3.5-4B # DeepSeek V4 Flash requires the feature-gated MP8 path and TileLang at build time uv pip install "tilelang==0.1.9" -export PEGAINFER_TILELANG_PYTHON=.venv/bin/python +export OPENINFER_TILELANG_PYTHON=.venv/bin/python cargo run --release --features deepseek-v4 -- --model-path models/DeepSeek-V4-Flash # Disable CUDA Graph (useful for debugging) @@ -104,9 +104,9 @@ cargo run --release -- --cuda-graph=false | Variable | Description | |----------|-------------| | `CUDA_HOME` | CUDA Toolkit path (default: `/usr/local/cuda`) | -| `PEGAINFER_TRITON_PYTHON` | Python with Triton for build-time AOT compilation | -| `PEGAINFER_TILELANG_PYTHON` | Python with TileLang for `deepseek-v4` build-time kernel generation | -| `PEGAINFER_CUDA_SM` | GPU SM target override when `nvidia-smi` unavailable (e.g. `120`) | +| `OPENINFER_TRITON_PYTHON` | Python with Triton for build-time AOT compilation | +| `OPENINFER_TILELANG_PYTHON` | Python with TileLang for `deepseek-v4` build-time kernel generation | +| `OPENINFER_CUDA_SM` | GPU SM target override when `nvidia-smi` unavailable (e.g. 
`120`) | @@ -117,10 +117,10 @@ cargo run --release -- --cuda-graph=false $env:CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.x" uv venv .venv --python 3.12 uv pip install "triton-windows<3.7" -$env:PEGAINFER_TRITON_PYTHON = ".venv\Scripts\python.exe" +$env:OPENINFER_TRITON_PYTHON = ".venv\Scripts\python.exe" cargo build --release -cargo run --release -p pegainfer-server -- --model-path models/Qwen3-4B +cargo run --release -p openinfer-server -- --model-path models/Qwen3-4B ``` @@ -136,7 +136,7 @@ cargo run --release -p pegainfer-server -- --model-path models/Qwen3-4B | [DeepSeek-V4-Flash](https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash) | MoE + sparse attention, MP8 checkpoint | 671B total / 37B active | Initial greedy, feature-gated, 8-GPU MP8 | | [Kimi-K2-Instruct](https://huggingface.co/moonshotai/Kimi-K2-Instruct) | MLA + MoE + Marlin INT4 | 1T total / 32B active | Feature-gated, `--features kimi-k2`, 8-GPU EP path | -Model type is auto-detected from `config.json` — just point `--model-path` at any supported model directory. Feature-gated model lines require rebuilding `pegainfer-server` with the matching `--features ...` flag before launch. +Model type is auto-detected from `config.json` — just point `--model-path` at any supported model directory. Feature-gated model lines require rebuilding `openinfer-server` with the matching `--features ...` flag before launch. DeepSeek V4 support is intentionally narrower than the Qwen paths in the initial PR: it requires `--features deepseek-v4`, uses CUDA devices `0..7`, serves greedy requests only, terminates unsupported logprobs and non-greedy sampling requests with an explicit `stop_reason`, and does not use CUDA Graph yet. 
@@ -162,12 +162,12 @@ HTTP / vLLM frontend → EngineHandle → per-model engine crate │ ┌───────────────────────┼───────────────────────┐ │ │ │ -pegainfer-qwen3-4b pegainfer-qwen35-4b pegainfer-deepseek-v4 +openinfer-qwen3-4b openinfer-qwen35-4b openinfer-deepseek-v4 (full attention) (24 linear + 8 full) (MP8 MoE + sparse attn) │ │ │ └───────────────────────┼───────────────────────┘ │ - pegainfer-core runtime + pegainfer-kernels + openinfer-core runtime + openinfer-kernels │ CUDA / cuBLAS / Triton / TileLang / FlashInfer ``` @@ -179,7 +179,7 @@ pegainfer-qwen3-4b pegainfer-qwen35-4b pegainfer-deepseek-v4 - **Fused operators where mature** — Qwen decode paths use fused attention/MLP kernels; DeepSeek V4 is currently a multi-stage MP8 path with TileLang kernels, NCCL reductions, and CUDA glue - **BF16 storage, FP32 accumulation** — numerical stability without memory overhead - **CUDA Graph** on Qwen decode paths — eliminates kernel launch overhead where enabled -- **Per-model crate boundary** — Qwen3-4B owns its config, weights, scheduler/executor, tests, benches, and kernel plan in `pegainfer-qwen3-4b` +- **Per-model crate boundary** — Qwen3-4B owns its config, weights, scheduler/executor, tests, benches, and kernel plan in `openinfer-qwen3-4b` **Model details:** @@ -200,17 +200,17 @@ pegainfer-qwen3-4b pegainfer-qwen35-4b pegainfer-deepseek-v4 cargo test --release --workspace --lib # Accuracy and integration tests (need GPU + model weights) -PEGAINFER_TEST_MODEL_PATH=models/Qwen3-4B cargo test --release -p pegainfer-qwen3-4b --test hf_golden_gate -PEGAINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test hf_golden_gate -PEGAINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test e2e_scheduler -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V4-Flash cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test e2e +OPENINFER_TEST_MODEL_PATH=models/Qwen3-4B cargo test --release -p 
openinfer-qwen3-4b --test hf_golden_gate +OPENINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test hf_golden_gate +OPENINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test e2e_scheduler +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V4-Flash cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test e2e ``` ### Triton AOT Triton compiles the Qwen3.5 compatibility AOT kernels at build time. Qwen3-4B dense full-attention kernels are CUDA/cuBLAS/FlashInfer C++ wrappers. Runtime has no Python dependency — Triton is build-time only. -See `pegainfer-kernels/tools/triton/README.md` for setup and troubleshooting. +See `openinfer-kernels/tools/triton/README.md` for setup and troubleshooting. ### Source Layout @@ -220,36 +220,36 @@ See `pegainfer-kernels/tools/triton/README.md` for setup and troubleshooting. ``` Cargo.toml # Virtual workspace root -pegainfer-server/ # Product package: CLI, vLLM frontend, benchmarks +openinfer-server/ # Product package: CLI, vLLM frontend, benchmarks ├── src/main.rs # CLI + vLLM/OpenAI server startup ├── src/vllm_frontend.rs # vLLM engine-core bridge into a generic EngineHandle ├── src/server_engine.rs # Model detection and shared server helpers ├── src/scheduler.rs # Compatibility re-export of core engine request/event types ├── src/ops.rs # Compatibility re-export of shared GPU ops ├── src/ops/tests.rs # Server package operator coverage tests -├── src/tensor.rs # Re-export of pegainfer-kernels tensor types +├── src/tensor.rs # Re-export of openinfer-kernels tensor types ├── src/sampler.rs # Temperature, top-k, top-p sampling └── src/logging.rs # Runtime logging setup -pegainfer-core/ # Shared runtime API for model crates +openinfer-core/ # Shared runtime API for model crates ├── src/engine.rs # EngineHandle, GenerateRequest, TokenEvent ├── src/kv_pool.rs # Paged KV pool and request state -├── src/ops.rs # Shared op wrappers over pegainfer-kernels +├── 
src/ops.rs # Shared op wrappers over openinfer-kernels └── src/weight_loader.rs # Safetensors helpers shared by model crates -pegainfer-kernels/ # Shared GPU kernel/runtime crate +openinfer-kernels/ # Shared GPU kernel/runtime crate ├── KERNELS.md # LLM routing index for model op -> wrapper -> FFI -> source ├── src/ # GPU tensor types, FFI, paged KV layout, Rust ops ├── csrc/ # Hand-written CUDA / FlashInfer C++ wrappers └── tools/triton/ # Triton AOT kernels (build-time compiled) -pegainfer-qwen3-4b/ # Qwen3-4B model-owned engine crate +openinfer-qwen3-4b/ # Qwen3-4B model-owned engine crate ├── src/ # Config, weights, prefill/decode/unified, scheduler/executor ├── tests/ # Qwen3 HF logits gate and integration coverage ├── benches/ # Qwen3 model-level benchmarks └── src/kernel_plan.rs # Model DAG phase -> kernel routing index -pegainfer-qwen35-4b/ # Qwen3.5-4B model-owned engine crate +openinfer-qwen35-4b/ # Qwen3.5-4B model-owned engine crate ├── src/ # Config, weights, prefill/decode/unified, recurrent state, scheduler ├── tests/ # Qwen3.5 HF logits gate and scheduler integration └── benches/ # Qwen3.5 recurrent/norm operator benchmarks diff --git a/docs/benchmarks/accuracy-eval-results.md b/docs/benchmarks/accuracy-eval-results.md index 35f0086e..b628dc1f 100644 --- a/docs/benchmarks/accuracy-eval-results.md +++ b/docs/benchmarks/accuracy-eval-results.md @@ -5,31 +5,31 @@ | Model | Backend | GSM8K 8-shot (strict-match) | GSM8K 8-shot (flexible-extract) | Delta vs HF | Status | |-------|---------|----------------------------:|--------------------------------:|:-----------:|:------:| | Qwen3-4B | HF transformers | 85.82% | 85.82% | — | baseline | -| Qwen3-4B | pegainfer | 85.37% | 85.44% | -0.45 pp / -0.38 pp | PASS | +| Qwen3-4B | openinfer | 85.37% | 85.44% | -0.45 pp / -0.38 pp | PASS | | Qwen3.5-4B | HF transformers | 79.45% | 79.45% | — | baseline | -| Qwen3.5-4B | pegainfer (historical) | 1.97% | 10.61% | -77.48 pp / -68.84 pp | FAIL | -| Qwen3.5-4B | 
pegainfer (#250 RoPE cache fix) | 79.38% | 79.30% | -0.07 pp / -0.15 pp | PASS | +| Qwen3.5-4B | openinfer (historical) | 1.97% | 10.61% | -77.48 pp / -68.84 pp | FAIL | +| Qwen3.5-4B | openinfer (#250 RoPE cache fix) | 79.38% | 79.30% | -0.07 pp / -0.15 pp | PASS | **Pass criteria:** absolute delta < 1 percentage point. ## Qwen3-4B: PASS -Pegainfer and HF transformers produce near-identical results. The 0.45% delta is well within the 1% threshold and consistent with expected bf16 tie-sensitive rounding differences (2/13 token-level mismatches observed in prior token-level validation). +Openinfer and HF transformers produce near-identical results. The 0.45% delta is well within the 1% threshold and consistent with expected bf16 tie-sensitive rounding differences (2/13 token-level mismatches observed in prior token-level validation). ## Qwen3.5-4B: Historical FAIL — Recovered By #250 ### Symptoms -Before #250, pegainfer scored 10.61% (flexible) vs HF's 79.45% on GSM8K 8-shot. +Before #250, openinfer scored 10.61% (flexible) vs HF's 79.45% on GSM8K 8-shot. ### Root Cause -Before #250, Qwen3.5-4B produced divergent outputs in pegainfer vs HF transformers when processing long prompts (8-shot few-shot prefix, ~1771 input tokens): +Before #250, Qwen3.5-4B produced divergent outputs in openinfer vs HF transformers when processing long prompts (8-shot few-shot prefix, ~1771 input tokens): -- **0-shot (41 tokens):** pegainfer and HF output match — both generate `\n\n` followed by a correct answer. +- **0-shot (41 tokens):** openinfer and HF output match — both generate `\n\n` followed by a correct answer. - **8-shot (1771 tokens):** outputs diverge completely. 
- HF: ` Natalia sold 48 / 2 = <<48/2=24>>24` (correct format, correct answer) - - pegainfer: ` 168\n\nQuestion: Question: Question:...` (wrong number, degenerate repetition) + - openinfer: ` 168\n\nQuestion: Question: Question:...` (wrong number, degenerate repetition) The first generated token already differed, indicating the prefill logits diverged for long sequences. This did not affect Qwen3-4B (which uses a standard transformer architecture), only Qwen3.5-4B (which uses a hybrid Mamba-attention architecture with different prefill kernels). @@ -43,7 +43,7 @@ adds fail-closed cache coverage checks, and adds a long HF logits golden over ```bash export MODEL_PATH=/path/to/Qwen3.5-4B export LM_EVAL_BIN=/path/to/lm_eval -export RESULT_ROOT=results/qwen35-gsm8k-8shot-pegainfer-issue250 +export RESULT_ROOT=results/qwen35-gsm8k-8shot-openinfer-issue250 $LM_EVAL_BIN run \ --model local-completions \ @@ -55,7 +55,7 @@ $LM_EVAL_BIN run \ ``` Result file: -`results/qwen35-gsm8k-8shot-pegainfer-issue250/qwen35-eval/results_*.json` +`results/qwen35-gsm8k-8shot-openinfer-issue250/qwen35-eval/results_*.json` | Filter | exact_match | stderr | Delta vs HF 79.45% | | --- | ---: | ---: | ---: | @@ -76,7 +76,7 @@ lm-eval: 0.4.11 transformers: 5.4.0 torch: 2.11.0+cu128 GPU: NVIDIA GeForce RTX 5070 Ti (16GB) -pegainfer: commit 280e457 (main) +openinfer: commit 280e457 (main) ``` ### #250 Recovery Environment @@ -89,7 +89,7 @@ dataset: cached openai/gsm8k snapshot GPU: NVIDIA GeForce RTX 5090 (sm_120) CUDA: 12.8 Triton AOT Python: Triton 3.4.0 environment -pegainfer: issue #250 RoPE-cache fix branch +openinfer: issue #250 RoPE-cache fix branch ``` ### HF Baselines @@ -110,18 +110,18 @@ pegainfer: issue #250 RoPE-cache fix branch --output_path results/hf-qwen35-4b ``` -### Pegainfer Eval +### Openinfer Eval ```bash # Start server (one model at a time, single GPU) -PEGAINFER_TRITON_PYTHON=.venv/bin/python \ +OPENINFER_TRITON_PYTHON=.venv/bin/python \ cargo run --release -- --model-path 
models/Qwen3-4B --port 8000 --cuda-graph=false # Run eval (separate terminal, from repo root) .venv/bin/lm_eval run --model local-completions \ --model_args "model=Qwen3-4B,base_url=http://localhost:8000/v1/completions,tokenizer_backend=huggingface,tokenizer=models/Qwen3-4B,tokenized_requests=False" \ --tasks gsm8k --num_fewshot 8 --batch_size 1 \ - --output_path results/pegainfer-qwen3-4b + --output_path results/openinfer-qwen3-4b ``` **Note:** `local-completions` requires `tokenized_requests=False` and `base_url` pointing to the full `/v1/completions` endpoint. @@ -132,5 +132,5 @@ PEGAINFER_TRITON_PYTHON=.venv/bin/python \ |-----|----------| | HF Qwen3-4B | ~1h43m | | HF Qwen3.5-4B | ~2h11m | -| pegainfer Qwen3-4B | ~1h20m | -| pegainfer Qwen3.5-4B | ~1h16m | +| openinfer Qwen3-4B | ~1h20m | +| openinfer Qwen3.5-4B | ~1h16m | diff --git a/docs/benchmarks/bs1-4k64-vllm-pegainfer.md b/docs/benchmarks/bs1-4k64-vllm-openinfer.md similarity index 82% rename from docs/benchmarks/bs1-4k64-vllm-pegainfer.md rename to docs/benchmarks/bs1-4k64-vllm-openinfer.md index 1d024abd..4c78b607 100644 --- a/docs/benchmarks/bs1-4k64-vllm-pegainfer.md +++ b/docs/benchmarks/bs1-4k64-vllm-openinfer.md @@ -1,8 +1,8 @@ -# bs1 4k/64 vLLM vs PegaInfer +# bs1 4k/64 vLLM vs OpenInfer **Created**: 2026-05-04 **Status**: complete -**TL;DR**: On RTX 5090, `bs=1`, `input_len=4096`, `output_len=64`, `num_prompts=20`, `max_concurrency=1`, no vLLM prefix cache: PegaInfer finished `5.7%` faster wall-clock, with TTFT median `177.1ms` vs vLLM `197.8ms`. Decode TPOT was slightly slower: `6.47ms` vs vLLM `6.36ms`. PegaInfer's streaming `usage.completion_tokens` is overreported through the vLLM frontend in this run, so output throughput should be recomputed from the fixed target length. +**TL;DR**: On RTX 5090, `bs=1`, `input_len=4096`, `output_len=64`, `num_prompts=20`, `max_concurrency=1`, no vLLM prefix cache: OpenInfer finished `5.7%` faster wall-clock, with TTFT median `177.1ms` vs vLLM `197.8ms`. 
Decode TPOT was slightly slower: `6.47ms` vs vLLM `6.36ms`. OpenInfer's streaming `usage.completion_tokens` is overreported through the vLLM frontend in this run, so output throughput should be recomputed from the fixed target length. ## Preparation @@ -14,12 +14,12 @@ - `docs/subsystems/scheduler/scheduler.md` showed fixed-length single-concurrency results should be interpreted as latency probes rather than full serving saturation claims. - **Plan**: 1. Use `vllm` as the client and vLLM server. - 2. Use the release `pegainfer` binary for the PegaInfer server. + 2. Use the release `openinfer` binary for the OpenInfer server. 3. Run `input_len=4096`, `output_len=64`, `num_prompts=20`, `max_concurrency=1`, `request_rate=inf`, after a 3-request warmup for each engine. 4. Save JSON/log artifacts under a timestamped result directory and compare TTFT/TPOT/throughput. - **Risks / open questions**: - vLLM prefix caching must be disabled for a fair random-prompt prefill comparison. - - PegaInfer's vLLM frontend may not report streaming usage with the exact same accounting as vLLM. + - OpenInfer's vLLM frontend may not report streaming usage with the exact same accounting as vLLM. ## Execution Log @@ -29,9 +29,9 @@ - `uv pip list --python /bin/python` showed `vllm 0.19.1`, `torch 2.10.0`, `flashinfer-python 0.6.6`, and `flashinfer-cubin 0.6.6`. - Confirmed model path: - ``, size `7.6G`. -- Built PegaInfer server binary in the validation worktree: - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TRITON_PYTHON=/bin/python cargo build --release -p pegainfer --bin pegainfer` - - The validation shell session hung after the build process ended, but `target/release/pegainfer` existed with timestamp `2026-05-04 21:11`. 
+- Built OpenInfer server binary in the validation worktree: + - `OPENINFER_CUDA_SM=120 OPENINFER_TRITON_PYTHON=/bin/python cargo build --release -p openinfer --bin openinfer` + - The validation shell session hung after the build process ended, but `target/release/openinfer` existed with timestamp `2026-05-04 21:11`. ### Step 2: vLLM run - First vLLM run used default prefix-cache behavior and showed prefix cache hits in the server log, so it was not used for the final comparison. @@ -52,12 +52,12 @@ - TPOT median `6.359ms`, p99 `6.366ms`. - ITL median `6.389ms`, p99 `6.638ms`. -### Step 3: PegaInfer run -- PegaInfer served model ID was ``, not `Qwen3-4B`, so the client model name was set to ``. +### Step 3: OpenInfer run +- OpenInfer served model ID was ``, not `Qwen3-4B`, so the client model name was set to ``. - Measured JSON: - - `pegainfer-in4096-out64-c1-n20.json` + - `openinfer-in4096-out64-c1-n20.json` - Command shape: - - server: `target/release/pegainfer --model-path --port 8000` + - server: `target/release/openinfer --model-path --port 8000` - client: same `vllm bench serve` shape as vLLM, except `--model `. - Raw results: - completed `20`, failed `0`. @@ -68,10 +68,10 @@ - ITL median `6.464ms`, p99 `6.546ms`. - Accounting caveat: - JSON `total_output_tokens` was `5312`, but the fixed workload was `20 * 64 = 1280` output tokens and the timing matches 64 generated tokens per request. - - For this run, PegaInfer output throughput should be recomputed as `1280 / 11.287s = 113.401 tok/s`, not the raw JSON `output_throughput`. + - For this run, OpenInfer output throughput should be recomputed as `1280 / 11.287s = 113.401 tok/s`, not the raw JSON `output_throughput`. ### Step 4: Comparison -- PegaInfer vs vLLM no-prefix: +- OpenInfer vs vLLM no-prefix: - Wall duration: `11.287s` vs `11.968s` (`5.7%` faster). - Request throughput: `1.772` vs `1.671 req/s` (`6.0%` higher). - Corrected output throughput: `113.401` vs `106.952 tok/s` (`6.0%` higher). 
@@ -83,12 +83,12 @@ ## Debrief -- **Outcome**: Completed the `bs=1`, `4k input`, `64 output` single-concurrency probe on RTX 5090. PegaInfer has better prefill/TTFT and slightly slower decode TPOT; wall-clock request throughput is higher because TTFT dominates this shape. +- **Outcome**: Completed the `bs=1`, `4k input`, `64 output` single-concurrency probe on RTX 5090. OpenInfer has better prefill/TTFT and slightly slower decode TPOT; wall-clock request throughput is higher because TTFT dominates this shape. - **Pitfalls encountered**: - The first vLLM measurement had prefix cache hits. It was rerun with `--no-enable-prefix-caching`. - The validation shell session can remain open after some long build/server scripts even when the validation work has finished; checking validation process state is necessary before assuming a command is still running. - - PegaInfer's vLLM frontend overreported streaming `completion_tokens` for this benchmark, so the raw output throughput field in `vllm bench` JSON is not reliable for PegaInfer here. + - OpenInfer's vLLM frontend overreported streaming `completion_tokens` for this benchmark, so the raw output throughput field in `vllm bench` JSON is not reliable for OpenInfer here. - **Lessons learned**: - - For fixed-output PegaInfer comparisons through `vllm bench serve`, trust TTFT/TPOT/ITL and recompute output throughput from requested output length until streaming usage accounting is fixed. + - For fixed-output OpenInfer comparisons through `vllm bench serve`, trust TTFT/TPOT/ITL and recompute output throughput from requested output length until streaming usage accounting is fixed. - Disable vLLM prefix caching for random synthetic prefill probes unless prefix-cache behavior is explicitly part of the experiment. - At this shape, the new Qwen3 prefill q64 path shows up as a TTFT advantage against vLLM, while decode remains essentially parity. 
diff --git a/docs/benchmarks/deepep-v2-vs-pplx-moe-backend.md b/docs/benchmarks/deepep-v2-vs-pplx-moe-backend.md index 4f5d3331..930853a1 100644 --- a/docs/benchmarks/deepep-v2-vs-pplx-moe-backend.md +++ b/docs/benchmarks/deepep-v2-vs-pplx-moe-backend.md @@ -1,6 +1,6 @@ -# DeepEP V2 vs PegaInfer PPLX EP on H20 x8 +# DeepEP V2 vs OpenInfer PPLX EP on H20 x8 -> **TL;DR** On an 8x H20 node, DeepEP V2 ElasticBuffer/NCCL Gin is clearly ahead of the current PegaInfer PPLX EP microbenchmark on the tested MoE exchange shapes. In the paired run here, the directional dispatch+combine ratio is about 2.5x to 5.3x; against the earlier PPLX snapshot, it is about 2.4x to 4.5x. This is a backend direction check, not a dtype-identical replacement gate. +> **TL;DR** On an 8x H20 node, DeepEP V2 ElasticBuffer/NCCL Gin is clearly ahead of the current OpenInfer PPLX EP microbenchmark on the tested MoE exchange shapes. In the paired run here, the directional dispatch+combine ratio is about 2.5x to 5.3x; against the earlier PPLX snapshot, it is about 2.4x to 4.5x. This is a backend direction check, not a dtype-identical replacement gate. 
Last touched: 2026-05-25 @@ -8,8 +8,8 @@ Last touched: 2026-05-25 | Component | Revision | | --- | --- | -| PegaInfer paired run | `f071baa` | -| PegaInfer historical PPLX snapshot | `ec514ef` | +| OpenInfer paired run | `f071baa` | +| OpenInfer historical PPLX snapshot | `ec514ef` | | DeepEP | `723716f` | ## Hardware And Software @@ -23,10 +23,10 @@ Last touched: 2026-05-25 ## Method -PegaInfer paired-run command: +OpenInfer paired-run command: ```bash -cargo run -r -p pegainfer-comm --bin pplx_a2a_bench -- --sweep --warmup 20 --repeats 100 +cargo run -r -p openinfer-comm --bin pplx_a2a_bench -- --sweep --warmup 20 --repeats 100 ``` DeepEP V2 command template: @@ -51,11 +51,11 @@ Sweep inputs: DeepEP was run with `--test-first-only`, so the measured case is the first elastic EP case: copy enabled, expert alignment 128, FP8 dispatch enabled, BF16 combine, no previous event, synchronous path. Correctness checks were skipped with `--skip-check`; this run is latency-only. -PegaInfer reports event-timed `max_rank_split_sum_us` for the full dispatch_send -> dispatch_recv -> combine_send -> combine_recv cycle. DeepEP reports profiler averages for ordinary dispatch and ordinary combine. For comparison, this note takes the ordinary dispatch line and ordinary combine line, sums dispatch+combine by rank, and reports both the worst rank and the mean rank. Because these are not identical timing harnesses, all ratios below are directional. +OpenInfer reports event-timed `max_rank_split_sum_us` for the full dispatch_send -> dispatch_recv -> combine_send -> combine_recv cycle. DeepEP reports profiler averages for ordinary dispatch and ordinary combine. For comparison, this note takes the ordinary dispatch line and ordinary combine line, sums dispatch+combine by rank, and reports both the worst rank and the mean rank. Because these are not identical timing harnesses, all ratios below are directional. 
## Results -| Config | PegaInfer paired p50 us | PegaInfer paired mean us | DeepEP V2 worst-rank sum us | DeepEP V2 mean-rank sum us | Directional ratio vs PegaInfer p50 | +| Config | OpenInfer paired p50 us | OpenInfer paired mean us | DeepEP V2 worst-rank sum us | DeepEP V2 mean-rank sum us | Directional ratio vs OpenInfer p50 | | --- | ---: | ---: | ---: | ---: | ---: | | dsv4/tok=1 | 87.5 | 91.0 | 23.815 | 23.632 | 3.7x | | dsv4/tok=4 | 95.9 | 97.4 | 24.094 | 23.801 | 4.0x | @@ -87,9 +87,9 @@ PegaInfer reports event-timed `max_rank_split_sum_us` for the full dispatch_send | kimi-k2/tok=128 | 32.042 | 42.672 | 74.572 | | kimi-k2/tok=256 | 50.735 | 71.921 | 122.617 | -## PegaInfer Baseline Drift +## OpenInfer Baseline Drift -The table above uses the PegaInfer run taken in the same benchmarking session as the DeepEP run. The earlier PPLX benchmark snapshot in `docs/benchmarks/pplx-ep-a2a-h20-nvlink.md` was captured at `ec514ef`. Those two PegaInfer snapshots differ enough that the comparison should not pretend to be a precise speedup gate. +The table above uses the OpenInfer run taken in the same benchmarking session as the DeepEP run. The earlier PPLX benchmark snapshot in `docs/benchmarks/pplx-ep-a2a-h20-nvlink.md` was captured at `ec514ef`. Those two OpenInfer snapshots differ enough that the comparison should not pretend to be a precise speedup gate. Positive delta means the paired run here is slower than the historical snapshot. @@ -113,13 +113,13 @@ Using the historical PPLX p50s instead of the paired run gives a directional rat ## Interpretation Guardrails - DeepEP V2 was measured through the elastic EP path: ElasticBuffer with the NCCL Gin backend. The repository still builds legacy NVSHMEM pieces, but this V2 path is the one relevant to the current comparison. -- The measured DeepEP V2 case uses FP8 dispatch and BF16 combine. PegaInfer PPLX currently benchmarks a BF16 payload. Treat the table as a backend signal, not an exact dtype-to-dtype gate. 
-- DeepEP correctness checks were skipped in this latency run. A replacement decision needs a correctness run in the integrated PegaInfer path. -- DeepEP `num_tokens` is a max-per-rank input; the test uses slightly different actual token counts across ranks. PegaInfer uses the fixed max token count per rank. -- DeepEP numbers are profiler kernel averages. PegaInfer numbers are CUDA event timings around the benchmark cycle. The delta is large enough to be actionable, but integration work should add one apples-to-apples harness before replacing backend policy. +- The measured DeepEP V2 case uses FP8 dispatch and BF16 combine. OpenInfer PPLX currently benchmarks a BF16 payload. Treat the table as a backend signal, not an exact dtype-to-dtype gate. +- DeepEP correctness checks were skipped in this latency run. A replacement decision needs a correctness run in the integrated OpenInfer path. +- DeepEP `num_tokens` is a max-per-rank input; the test uses slightly different actual token counts across ranks. OpenInfer uses the fixed max token count per rank. +- DeepEP numbers are profiler kernel averages. OpenInfer numbers are CUDA event timings around the benchmark cycle. The delta is large enough to be actionable, but integration work should add one apples-to-apples harness before replacing backend policy. ## Read -DeepEP V2 is especially strong at low token counts: the tested DSV4 and Kimi-K2 shapes sit around 24-34 us for tok <= 32, while the paired PegaInfer PPLX path is roughly 96-147 us. At larger payloads, DeepEP still holds about a 2.5x to 3.1x directional advantage in the paired run. +DeepEP V2 is especially strong at low token counts: the tested DSV4 and Kimi-K2 shapes sit around 24-34 us for tok <= 32, while the paired OpenInfer PPLX path is roughly 96-147 us. At larger payloads, DeepEP still holds about a 2.5x to 3.1x directional advantage in the paired run. 
-The next useful gate is a strict integration benchmark with the same payload dtype, token distribution, correctness checks, and PegaInfer scheduler-facing API cost included. +The next useful gate is a strict integration benchmark with the same payload dtype, token distribution, correctness checks, and OpenInfer scheduler-facing API cost included. diff --git a/docs/benchmarks/pplx-ep-a2a-h20-nvlink.md b/docs/benchmarks/pplx-ep-a2a-h20-nvlink.md index 7ea51956..b3c39f01 100644 --- a/docs/benchmarks/pplx-ep-a2a-h20-nvlink.md +++ b/docs/benchmarks/pplx-ep-a2a-h20-nvlink.md @@ -17,7 +17,7 @@ Last touched: 2026-05 ## Benchmark -Binary: `pplx_a2a_bench --sweep` (in `pegainfer-comm`). +Binary: `pplx_a2a_bench --sweep` (in `openinfer-comm`). Each config bootstraps a fresh pplx-garden EP backend (CUMem + fabric MR + NVLink peer-map), runs 20 warmup + 100 measured iterations of the full dispatch_send → dispatch_recv → combine_send → combine_recv cycle, and reports `max_rank_split_sum_us` — the per-iteration maximum across all 8 ranks of the four-stage sum. diff --git a/docs/conventions/coding-style.md b/docs/conventions/coding-style.md index d70b0c65..0eff624b 100644 --- a/docs/conventions/coding-style.md +++ b/docs/conventions/coding-style.md @@ -6,4 +6,4 @@ Don't test for the sake of testing. Prefer integration tests over unit tests — ## Logging -Log through `pegainfer-core::logging`. The text layout already prints each record's module target, so don't prefix messages with a module or model name — no `kimi-k2:`, no `Qwen3.5 `. Error messages in `anyhow!` / `bail!` keep their prefix; they surface to callers without a target. +Log through `openinfer-core::logging`. The text layout already prints each record's module target, so don't prefix messages with a module or model name — no `kimi-k2:`, no `Qwen3.5 `. Error messages in `anyhow!` / `bail!` keep their prefix; they surface to callers without a target. 
diff --git a/docs/index.md b/docs/index.md index 9f1a4a33..b833cad6 100644 --- a/docs/index.md +++ b/docs/index.md @@ -25,10 +25,10 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | Path | TL;DR | | --- | --- | | `models/qwen3/roadmap.md` | Qwen3-4B roadmap (2026-06 review): line is the maturity bar; #220 RoPE OOB now fixed (sized cache + admission guard + kernel trap, gated by reject + in-window ITs); open set is per-row batch sampling, zero TP coverage, zero-adapter-only LoRA gate, dropped prefix-cache observability, stale docs, YaRN #8 follow-up. Sequenced Now/Next/Later + cleanup ledger. | -| `models/qwen3/model-crate.md` | `pegainfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. | +| `models/qwen3/model-crate.md` | `openinfer-qwen3-4b` owns Qwen3 config/weights/executor/scheduler/tests/kernel plan; root sees generic `EngineHandle`; split-K retuned to `256/64`, with 4k/64 serving TPOT p50 at `6.46ms` on RTX 5090. | | `models/qwen3/prefix-cache.md` | Prefix caching on by default for Qwen3-4B: full-block kvbm radix matching at the executor, suffix-only prefill. Repeated ~1900-token prompt TTFT 141.8 → 16.3ms p50 (8.7×); warm TTFT ≈ TPOT + ~5ms setup. Includes the RoPE scalar-path corruption fix and the drain-the-stream TTFT measurement pitfall. | | `models/qwen3/accuracy-gate.md` | Qwen3-4B instance of the logits golden gate (`tests/hf_golden_gate.rs`): 48 teacher-forced sequences / 816 positions vs a stored HF bf16 golden, replayed over bs=1 / batched eager / CUDA-graph. Strict guards: regret check + mean ≤ 0.06 + p99 ≤ 0.20; absolute max printed but not asserted (coverage-unstable). Methodology in `subsystems/correctness/`. 
| -| `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `pegainfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. | +| `models/qwen3/kernels-crate.md` | Phase 1 split implemented and 5090-verified: Qwen3-4B kernel surface lives in `openinfer-kernels`; release build, test-target compile, accuracy gate, and bench snapshot pass. | | `models/qwen3/tp-design.md` | Qwen3 tensor-parallel design: `TP=2` milestone scope plus the controller/worker broadcast execution model, request identity, and coarse-grained step protocol for future TP/MoE work. | | `models/qwen3/kv-pressure-hang.md` | Issue #85 Qwen3-4B KV pressure hang fixed by full-lifetime scheduler KV admission, waiting-queue deferral, cleanup on disconnect/error, impossible-request errors, scheduler/bridge gates, and real `vllm bench serve` QPS=2 `500/500` pass with post-pressure completion healthy. | @@ -40,8 +40,8 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | `models/qwen35/kv-admission.md` | Issue #254 complete: Qwen3.5 now uses full-lifetime KV admission, deferred pressure handling, impossible-request rejection, explicit error semantics, direct rejection-event coverage, RTX 5090 e2e, and real HTTP pressure/post-pressure validation. | | `models/qwen35/optimization.md` | Hybrid 24 linear + 8 full attn. At parity with vLLM: TTFT 234ms (+2%), TPOT 11.77ms (+1%). Post-accuracy-fix GDR decode kernel restore (#9). | | `models/qwen35/accuracy.md` | Qwen3.5-4B HF bf16 logits goldens through `past_key_values`: short replay covers sequential graph, bucket-straddling batched graph, and slot-compaction; long replay covers 4097/8192-token prompts; full GSM8K 8-shot now matches the HF baseline within 0.15 percentage points. | -| `models/qwen35/model-crate.md` | `pegainfer-qwen35-4b` owns Qwen3.5 model/scheduler/recurrent ops/tests/benches; root loads it through `EngineHandle`. 
Build/check/clippy, root bench sanity check, historical Qwen3.5 e2e, and scheduler e2e records live here. | -| `models/qwen35/kernel-plan.md` | Qwen3.5-4B has a `pegainfer_qwen35_4b::kernel_plan()` static descriptor mirroring the qwen3 module — enumerates every prefill/decode/unified op with its Rust call site, backend, and notes, so you can dump the active kernel mix without reading call sites. Pure refactor (issue #256), no kernel behavior change. | +| `models/qwen35/model-crate.md` | `openinfer-qwen35-4b` owns Qwen3.5 model/scheduler/recurrent ops/tests/benches; root loads it through `EngineHandle`. Build/check/clippy, root bench sanity check, historical Qwen3.5 e2e, and scheduler e2e records live here. | +| `models/qwen35/kernel-plan.md` | Qwen3.5-4B has an `openinfer_qwen35_4b::kernel_plan()` static descriptor mirroring the qwen3 module — enumerates every prefill/decode/unified op with its Rust call site, backend, and notes, so you can dump the active kernel mix without reading call sites. Pure refactor (issue #256), no kernel behavior change. | ## models / deepseek-v4 @@ -56,7 +56,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | `models/deepseek-v4/moe-ag-rs.md` | Decode MoE now uses GPU AG/RS, GPU route compaction, and grouped TileLang FP4 local experts; no route/expert D2H in hot path. Current 1x32 TPOT avg `105.54ms`, exact E2E `20/20`. | | `models/deepseek-v4/moe-tilelang-review.md` | Persistent rank workers + decode-only direct top-k MoE cut 1x32 steady TPOT to `80.49ms/token`; remaining cost is rank arrival skew before `107` f32 collectives/token. | | `models/deepseek-v4/pplx-ep-integration.md` | DeepSeek V4 PPLX EP integration: pplx-garden decode MoE path, EP8 bootstrap, common NUMA rank-slice placement, and H200 steady TPOT p50 `66.65ms`. | -| `models/deepseek-v4/kernel-paths.md` | DeepSeek V4 CUDA sources, TileLang generator path, and `pegainfer-kernels/KERNELS.md` routing index are organized.
| +| `models/deepseek-v4/kernel-paths.md` | DeepSeek V4 CUDA sources, TileLang generator path, and `openinfer-kernels/KERNELS.md` routing index are organized. | ## models / deepseek-v2-lite @@ -74,24 +74,24 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | --- | --- | | `models/kimi-k2/roadmap.md` | Cross-cutting Kimi-K2 plan, re-verified 2026-06-08 on 8×H200. Decode leads vLLM on the active TP1/DP8 **DeepEP** line (bs64 graph TPOT `26.3 ms` p50 / `30.5` p99); M1 serving contract (sampling/EOS/admission) + M2 accuracy gate shipped and green teacher-forced. Live frontier = serving perf: the "+51% HTTP" (#225) was a **bench/metric artifact** (measured: identical prompts under-measure decode ~7–15% via the Marlin expert GEMM; transport ≈0) — floor ~34 ms, a2a ~30% GPU (#228); TTFT 4.5×/31× behind vLLM (#224). Open correctness debt: tests (#222), concurrent mispick (#286), graph-replay gate (#300). | | `models/kimi-k2/accuracy-gate.md` | vLLM-golden accuracy gate (#223):`tests/vllm_golden_gate.rs` + committed K2.6 fixture,teacher-forced regret sweep + free-greedy decode parity,走真实 serving path(TP1/DP8/EP8 PPLX);两档 regret 规则(自信位 0.30 / 平分布位 1.25 且每 pass 限 2 个),缺模型/fixture 显式 fail。 | -| `models/kimi-k2/deepep-migration.md` | PPLX→DeepEP 迁移已实现:kimi 路径 PPLX 全删(moe_pplx.rs 没了,kimi crate 不再依赖 pegainfer-comm);decode `expand=true`+`cpu_sync=false` 零 host 同步/分配(graph-ready,#227 capture 仍关);Marlin 原地消费 recv buffer(alignment 8 == block size,identity routing + sentinel);router scale 在 residual 处应用,combine 提前一步 bf16 取整。待 8×H200 数值 gate + serving bench。 | +| `models/kimi-k2/deepep-migration.md` | PPLX→DeepEP 迁移已实现:kimi 路径 PPLX 全删(moe_pplx.rs 没了,kimi crate 不再依赖 openinfer-comm);decode `expand=true`+`cpu_sync=false` 零 host 同步/分配(graph-ready,#227 capture 仍关);Marlin 原地消费 recv buffer(alignment 8 == block size,identity routing + sentinel);router scale 在 residual 处应用,combine 提前一步 bf16 取整。待 8×H200 数值 gate + serving bench。 | | `models/kimi-k2/sampling.md` | 
Sampling param surface + design (#237):TP1/DP8 上 temperature/top_k/top_p 经单次 batched FlashInfer pass 生效(greedy 行保持 in-graph argmax,零开销),TP8 显式拒绝非 greedy;OpenAI 参数表逐项标注 honored/rejected/ignored,无静默路径;8×H200 已验证 e2e + TPOT 无回归。 | | `models/kimi-k2/kv-cache-design.md` | KV cache 接入 qwen3 paged 栈 (#239→#230/#231),单 PR 落地:kimi kernel 层本就 paged,kernel 零改动;kvbm `BlockPool` per rank 取代静态 slot→pages 映射,full-lifetime reservation admission + 超界显式 Rejected,per-request cap 2048→8192(DP prompt 仍 ≤2048,PPLX fabric buffer 约束);#230/#231 的 substrate,8×H200 验证待做。 | | `models/kimi-k2/optimization.md` | Kimi-K2 model card + decode 优化主线。Active mainline 是 TP1+DP8+EP8 PPLX(decode batch cap 64,buckets `[1,2,4,8,16,32,64]`,bs64 output `1336 tok/s`);下半篇的 TP8+EP8 NCCL bs4 graph TPOT `14.39ms` 路径是历史 bring-up 记录,保留以解释 MLA/MoE/collective kernel 结构。 | | `models/kimi-k2/bringup-history.md` | Kimi-K2 text-only bring-up 压缩史(合并自旧 support-analysis/changelog/operator-todo trio):HF probe → 文本 manifest → TP8/EP8 sliced loader → MLA + Marlin WNA16 routed expert → NCCL bridge → bs4 wave decode → 整段 CUDA Graph → vLLM top-20 gate。持有 still-load-bearing 的 checkpoint/INT4/Marlin layout facts 与 #234 tombstone(expert-major CUTLASS 删除、weight_shape 不再加载、bs4 cap → 64)。 | | `models/kimi-k2/vllm-path-comparison.md` | Kimi-K2 decode 路径对照:vLLM-style fused qkv_a、MoE shared/routed compute overlap、shared/dense gate-up fusion、routed scaled-add 和 bridge microbench 已过 H20 gate;output64 avg/p50/p99 均在 `15ms` 内,vLLM TP-only MoE final all-reduce BF16/F32 两版均慢于当前 RS bridge。 | -| `models/kimi-k2/vllm-h20-baseline.md` | vLLM 0.19.0 H20 ×8 TP1+DP8+EP8 decode-heavy baseline:bs 1..256 扫描,bs=8 拐点 TPOT med `26.4ms` / aggregate `308 tok/s`,bs=256 拉到 `1131 tok/s`;同 client 下 pegainfer TP8+EP8 bs=4 TPOT `19.13ms` 比 vLLM 低 23%,但 HTTP 口径比 in-process 高 33%,frontend overhead 待查。 | +| `models/kimi-k2/vllm-h20-baseline.md` | vLLM 0.19.0 H20 ×8 TP1+DP8+EP8 decode-heavy baseline:bs 1..256 扫描,bs=8 拐点 TPOT med `26.4ms` / aggregate `308 tok/s`,bs=256 
拉到 `1131 tok/s`;同 client 下 openinfer TP8+EP8 bs=4 TPOT `19.13ms` 比 vLLM 低 23%,但 HTTP 口径比 in-process 高 33%,frontend overhead 待查。 | | `models/kimi-k2/pplx-ep-decode.md` | PPLX EP decode bs=1 TPOT 37ms → 17.94ms(−52%),超过 NCCL no-graph 18.52ms。根因是 expert_padding=64 导致 Marlin 98% 计算浪费 + <<<1,1>>> 串行 routing kernel。含完整优化 log、failed approaches、nsys 对比数据。 | | `models/kimi-k2/pplx-ep-correctness.md` | TP8/EP8 PPLX correctness baseline:H20 64-token token trace 与 TP8/EP8 NCCL 完全一致,hash `4920f088c2338236`;记录 recv capacity、routed-row top-k weight、F32 combine 边界。 | | `models/kimi-k2/tp1-dp8-ep8-performance.md` | TP1 DP8 EP8 性能优化 ledger:O1 prompt_len1 decode admission 过 vLLM bs64 gate;O2 落地 5 个 decode kernel cherry-pick(cuBLASLt fixed-shape GEMM、argmax split、router fusion),精度由 base-vs-opt prefill logits A/B 压在 bf16 ULP 底,PPLX Marlin small-N tile 因 `-inf`/SIGSEGV 被定性为原分支精度破坏点并拒绝;bs64 TPOT 噪声内持平(p50 `40.58→40.09ms`)。 | -| `models/kimi-k2/source-layout.md` | Kimi-K2 source files over 1k lines were split by responsibility; the largest Rust file under `pegainfer-kimi-k2/src` is now `layers/attention.rs` at 950 lines. | +| `models/kimi-k2/source-layout.md` | Kimi-K2 source files over 1k lines were split by responsibility; the largest Rust file under `openinfer-kimi-k2/src` is now `layers/attention.rs` at 950 lines. | | `models/kimi-k2/dp-design.md` | TP×DP 可配置并行:每 DP rank 是独立 decode engine,EP all-to-all 天然 sync,轻量 load balancer 做 request 路由。首批 TP1×DP8 + TP8×DP1。 | ## subsystems / runtime | Path | TL;DR | | --- | --- | -| `subsystems/runtime/runtime.md` | Runtime complexity is controlled by a shared `pegainfer-core` that owns the generation contract and orchestration; per-model crates implement `ModelForward` so prefill/decode and hybrid attention stay hidden from the caller. State (`&mut`) is separated from weights (`&self`) for future bs > 1. 
| +| `subsystems/runtime/runtime.md` | Runtime complexity is controlled by a shared `openinfer-core` that owns the generation contract and orchestration; per-model crates implement `ModelForward` so prefill/decode and hybrid attention stay hidden from the caller. State (`&mut`) is separated from weights (`&self`) for future bs > 1. | | `subsystems/runtime/kv-cache-design.md` | Dynamo 式 logical/physical 分层 KV cache:BlockManager 管 block 生命周期和 admission,PhysicalBackend trait 管 GPU 内存和布局(FullAttention / MLA)。支持 TP / DP。基于 vLLM/Dynamo/pegaflow 调研。 | | `subsystems/runtime/pegaflow-offload-integration.md` | 把 `pegaflow-core` 当进程内 Rust 库做 KV 卸载物理后端(HBM→DRAM/SSD/RDMA),补 kvbm 没写的卸载层。**Qwen3-4B full-attn 首发,端到端已在真实 GPU 跑通并验证**(async SAVE+LOAD 接进 executor/scheduler,纯 CPU-hit 与 GPU+CPU 组合 hit 恢复后 logits 与冷算一致)。pegaflow 经 git rev pin(#331+#333)。默认关,未接 server CLI。linear 排除,sparse 暂缓。 | @@ -106,7 +106,7 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | Path | TL;DR | | --- | --- | | `subsystems/frontend/simulated-inference-engine.md` | CPU-only simulated model crate for vLLM/OpenAI frontend and `vllm bench serve` validation without CUDA, real model weights, or real-model performance claims. | -| `subsystems/frontend/cpu-profiling-baseline.md` | Frontend CPU profiling baseline using `pegainfer-sim` with fixed TTFT=5ms/TPOT=12ms: 200 req / concurrency=16 shows ~150ms TTFT overhead (no dominant hotspot), heap allocation ~10%, stream polling ~7.5%, IPC ~1%; reproducible benchmark command and perf evidence documented. | +| `subsystems/frontend/cpu-profiling-baseline.md` | Frontend CPU profiling baseline using `openinfer-sim` with fixed TTFT=5ms/TPOT=12ms: 200 req / concurrency=16 shows ~150ms TTFT overhead (no dominant hotspot), heap allocation ~10%, stream polling ~7.5%, IPC ~1%; reproducible benchmark command and perf evidence documented. 
| ## subsystems / correctness @@ -118,16 +118,16 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | Path | TL;DR | | --- | --- | -| `subsystems/kernels/pegainfer-kernels-boundary.md` | Architecture decision: pegainfer should use reusable frontend/runtime/data-plane layers plus per-model engines; kernels become first-class assets through a ledger, simulator, and request tracing. | +| `subsystems/kernels/openinfer-kernels-boundary.md` | Architecture decision: openinfer should use reusable frontend/runtime/data-plane layers plus per-model engines; kernels become first-class assets through a ledger, simulator, and request tracing. | | `subsystems/kernels/kernel-op-reports.md` | Qwen3 kernel/report tooling is feature-gated: `qwen3_kernel_report` covers per-op kernel reports, and `qwen3_model_report` emits runtime-traced eager-DAG decode operator rollups with TensorSpec `KernelCall`s, latency stats, tables, and Graphviz DOT; measured FA2 `CTA_TILE_Q=64` prefill default in place. | -| `subsystems/kernels/typed-forward-pipeline.md` | Reusable typed tensor pipeline macro in `pegainfer-kernels` so model crates can express common `typed_ops` chains without model-specific wrapper macros. | +| `subsystems/kernels/typed-forward-pipeline.md` | Reusable typed tensor pipeline macro in `openinfer-kernels` so model crates can express common `typed_ops` chains without model-specific wrapper macros. | ## playbooks | Path | TL;DR | | --- | --- | | `playbooks/developer-onboarding.md` | New-developer onboarding — toolchain, unified venv, build, tests, quick benchmark validation. | -| `playbooks/bench-vs-vllm.md` | pegainfer vs vLLM comparative benchmarking: method, workflow, typical configs, gotchas. | +| `playbooks/bench-vs-vllm.md` | openinfer vs vLLM comparative benchmarking: method, workflow, typical configs, gotchas. 
| | `playbooks/model-optimization-pipeline.md` | Per-model optimization methodology: 2 standard profiles, vLLM baseline, e2e dashboard + append-only optimization log. | | `playbooks/profiling-guide.md` | GPU profiling playbook: nsys pitfalls, diagnostic paths, measured kernel comparisons. | | `playbooks/accuracy-parity-playbook.md` | Accuracy debugging playbook: truth-source rules, first-diff workflow, bf16 rounding traps, and verified Qwen3.5 parity commands. | @@ -147,10 +147,10 @@ Organized by domain (model line / subsystem / playbook / lesson) instead of by l | Path | TL;DR | | --- | --- | -| `benchmarks/bs1-4k64-vllm-pegainfer.md` | RTX 5090 single-concurrency probe: `input_len=4096`, `output_len=64`, no vLLM prefix cache. PegaInfer TTFT median `177ms` vs vLLM `198ms`; TPOT median `6.47ms` vs `6.36ms`; corrected output throughput `+6%` for PegaInfer. | -| `benchmarks/accuracy-eval-results.md` | Phase 1 GSM8K: Qwen3-4B PASS (pegainfer 85.37% vs HF 85.82%, delta -0.45 pp). Qwen3.5-4B historical FAIL recovered by #250 (strict 79.38%, flexible 79.30% vs HF 79.45%). | +| `benchmarks/bs1-4k64-vllm-openinfer.md` | RTX 5090 single-concurrency probe: `input_len=4096`, `output_len=64`, no vLLM prefix cache. OpenInfer TTFT median `177ms` vs vLLM `198ms`; TPOT median `6.47ms` vs `6.36ms`; corrected output throughput `+6%` for OpenInfer. | +| `benchmarks/accuracy-eval-results.md` | Phase 1 GSM8K: Qwen3-4B PASS (openinfer 85.37% vs HF 85.82%, delta -0.45 pp). Qwen3.5-4B historical FAIL recovered by #250 (strict 79.38%, flexible 79.30% vs HF 79.45%). | | `benchmarks/pplx-ep-a2a-h20-nvlink.md` | pplx EP all-to-all latency on 8× H20 NV18 NVLink: DSV4 & Kimi-K2 shapes, tok=1..256. tok=1 p50 ~82μs, tok=256 p50 ~204/303μs. 
| -| `benchmarks/deepep-v2-vs-pplx-moe-backend.md` | H20 x8 DeepEP V2 vs current PegaInfer PPLX EP backend comparison: ElasticBuffer/NCCL Gin shows a directional 2.5x-5.3x paired-run ratio on tested DSV4 and Kimi-K2 MoE exchange shapes, with dtype, correctness, harness, and PPLX baseline-drift caveats recorded. | +| `benchmarks/deepep-v2-vs-pplx-moe-backend.md` | H20 x8 DeepEP V2 vs current OpenInfer PPLX EP backend comparison: ElasticBuffer/NCCL Gin shows a directional 2.5x-5.3x paired-run ratio on tested DSV4 and Kimi-K2 MoE exchange shapes, with dtype, correctness, harness, and PPLX baseline-drift caveats recorded. | ## conventions diff --git a/docs/lessons/exact-match-gate-thread-cublas.md b/docs/lessons/exact-match-gate-thread-cublas.md index 9a93216b..9181c5c4 100644 --- a/docs/lessons/exact-match-gate-thread-cublas.md +++ b/docs/lessons/exact-match-gate-thread-cublas.md @@ -7,7 +7,7 @@ ## Scope -This note records a cross-cutting runtime/correctness lesson, not a Qwen3.5-only story. It was lifted from the original Qwen3.5 debugging debrief because the concrete fix shipped, but the transferable lessons still matter. The triggering bug was fixed in `pegainfer-qwen35-4b`, but the takeaways apply to any model crate that moves a model onto a worker thread or guards greedy decode with an exact-text gate. +This note records a cross-cutting runtime/correctness lesson, not a Qwen3.5-only story. It was lifted from the original Qwen3.5 debugging debrief because the concrete fix shipped, but the transferable lessons still matter. The triggering bug was fixed in `openinfer-qwen35-4b`, but the takeaways apply to any model crate that moves a model onto a worker thread or guards greedy decode with an exact-text gate. ## Background @@ -18,7 +18,7 @@ The regression first appeared at `6a5b826` after cuBLAS handles became thread-lo - **Read**: - `docs/index.md` - Qwen3.5 accuracy and optimization docs are the relevant references. 
- `docs/models/qwen35/model-crate.md` - confirmed the model-crate split reproduced the same Qwen3.5 e2e failure on old HEAD. - - `git log -- pegainfer-qwen35-4b pegainfer-kernels ...` - identified Qwen3.5 and sampling-related commits since the last accuracy work (the historical bisect ran against the pre-split `src/model/qwen35` layout). + - `git log -- openinfer-qwen35-4b openinfer-kernels ...` - identified Qwen3.5 and sampling-related commits since the last accuracy work (the historical bisect ran against the pre-split `src/model/qwen35` layout). - **Relevant history**: - `docs/models/qwen35/model-crate.md` - old HEAD and the model-crate split both fail all 10 Qwen3.5 e2e cases with similar gibberish. - Commit history has a suspicious sampling change: `020970b refactor(sampling): switch greedy decode to flashinfer top1 (#73)`. @@ -27,7 +27,7 @@ The regression first appeared at `6a5b826` after cuBLAS handles became thread-lo ### Step 1: Reproduce and bisect through history - Created a temporary worktree so the active model-crate diff stayed untouched. -- Older commits needed the current local FlashInfer third-party tree copied into `third_party/flashinfer` and `PEGAINFER_TRITON_PYTHON` pointed at a Python with Triton. +- Older commits needed the current local FlashInfer third-party tree copied into `third_party/flashinfer` and `OPENINFER_TRITON_PYTHON` pointed at a Python with Triton. - Results: - `24be186 refactor(embedding): keep token ids unsigned end-to-end (#71)` passed Qwen3.5 e2e. - `020970b refactor(sampling): switch greedy decode to flashinfer top1 (#73)` failed a few cases with normal text, matching baseline drift rather than gibberish. @@ -41,7 +41,7 @@ The regression first appeared at `6a5b826` after cuBLAS handles became thread-lo - That showed logits/sampling were already wrong at the first sampled token after prefill; decode KV accumulation was not the primary cause. 
### Step 3: Fix scheduler thread binding -- Updated `pegainfer-qwen35-4b/src/scheduler.rs` so the scheduler thread: +- Updated `openinfer-qwen35-4b/src/scheduler.rs` so the scheduler thread: - calls `cuda_set_device` for the model device, - binds the existing `CudaContext` to the scheduler thread, - initializes thread-local cuBLAS handles on that thread, @@ -59,14 +59,14 @@ The regression first appeared at `6a5b826` after cuBLAS handles became thread-lo - That exact-text e2e and `test_data/Qwen3.5-4B.json` are historical now. The current accuracy gate is the HF logits gate; `e2e_scheduler` remains a scheduler integration test for request-flow behavior rather than an exact-text replacement. ### Step 5: Validation -- Passed (set `PEGAINFER_CUDA_SM` only when overriding SM auto-detection): +- Passed (set `OPENINFER_CUDA_SM` only when overriding SM auto-detection): - `cargo fmt --all --check` - `cargo check --release --workspace --all-targets` - `cargo clippy --release --workspace --all-targets -- -D warnings` - Two-run same-seed regen comparison with a temporary model alias while evaluating FlashInfer top1 behavior. 
- - `cargo test --release -p pegainfer test_gpu_sample -- --nocapture` - - `PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen35-4b --test e2e -- --nocapture` - - `PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen35-4b --test e2e_scheduler -- --nocapture` + - `cargo test --release -p openinfer test_gpu_sample -- --nocapture` + - `OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen35-4b --test e2e -- --nocapture` + - `OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen35-4b --test e2e_scheduler -- --nocapture` - `git diff --check` ## Debrief diff --git a/docs/lessons/moe-zero-prefill-long-prefill.md b/docs/lessons/moe-zero-prefill-long-prefill.md index 7beddb3f..73d840e2 100644 --- a/docs/lessons/moe-zero-prefill-long-prefill.md +++ b/docs/lessons/moe-zero-prefill-long-prefill.md @@ -3,16 +3,16 @@ > **TL;DR:** ZeRO-Prefill gives us a boundary for a future long-prefill cluster, not a router design or an implementation commitment. > > - **Want:** a long-P engine path that maximizes batch throughput once an external router has already selected long-prefill work. -> - **Avoid:** putting long/delta classification, batch admission policy, or router state inside pegainfer. +> - **Avoid:** putting long/delta classification, batch admission policy, or router state inside openinfer. > - **Why:** long prefill can provide enough compute to hide expert-weight movement, while decode and delta-prefill have different latency and state constraints. > > **Status:** Discussion record. No implementation, measurement threshold, or connector API is committed here. ## Scope -This note records what we learned from "ZeRO-Prefill: Zero Redundancy Overheads in MoE Prefill Serving" ([arXiv:2605.02960](https://arxiv.org/abs/2605.02960)) and how it should shape future PegaFlow/PegaInfer planning for large MoE serving. 
+This note records what we learned from "ZeRO-Prefill: Zero Redundancy Overheads in MoE Prefill Serving" ([arXiv:2605.02960](https://arxiv.org/abs/2605.02960)) and how it should shape future PegaFlow/OpenInfer planning for large MoE serving. -The assumed product shape is P/D separation with an external router. The router is responsible for deciding whether work belongs to long prefill, delta prefill, or decode. This document only describes what pegainfer should care about after the router has already handed it a long-prefill batch. +The assumed product shape is P/D separation with an external router. The router is responsible for deciding whether work belongs to long prefill, delta prefill, or decode. This document only describes what openinfer should care about after the router has already handed it a long-prefill batch. The goal is a reusable boundary record, not an implementation plan. Exact backend design, telemetry fields, measurement thresholds, and connector protocols are outside this document. @@ -49,7 +49,7 @@ ZeRO-Prefill includes a KV-cache-free mode for prefill-only workloads that direc The paper's waste sources matter most when a long-prefill batch has enough work to make compute the dominant resource. In our P/D-separated roadmap, short delta-prefill and decode should not be assumed to satisfy the same condition. -For pegainfer, the first long-P goal is to keep selected long-prefill work compute-bound. Once the router has already selected a long-prefill batch, the engine should avoid fragmenting it into chunks that lose MFU or make expert transfer visible again. +For openinfer, the first long-P goal is to keep selected long-prefill work compute-bound. Once the router has already selected a long-prefill batch, the engine should avoid fragmenting it into chunks that lose MFU or make expert transfer visible again. **Want:** execution that preserves enough per-GPU prefill work to make long-P throughput the main objective. 
@@ -105,7 +105,7 @@ The future measurement spec should explain at least: ## Derivation: Future Reuse -When PegaFlow/PegaInfer planning revisits long-prefill MoE serving, use this note as the entry point: +When PegaFlow/OpenInfer planning revisits long-prefill MoE serving, use this note as the entry point: 1. Assume the router has already selected a long-prefill batch. 2. Evaluate whether the engine keeps that batch compute-bound. diff --git a/docs/models/deepseek-v2-lite/decode-attribution-gate.md b/docs/models/deepseek-v2-lite/decode-attribution-gate.md index a17c1bb0..5f3d207a 100644 --- a/docs/models/deepseek-v2-lite/decode-attribution-gate.md +++ b/docs/models/deepseek-v2-lite/decode-attribution-gate.md @@ -10,16 +10,16 @@ This gate deliberately stays model-specific and shape-specific: - Model: DeepSeek-V2-Lite. - Shape: batch size `1`, `4`, or `8`, prompt `Hello`, prompt token ids `[17464]`, output length `16`. -- Backends: default host-staged EP2 and `PEGAINFER_DSV2_LITE_EP_BACKEND=nccl`. +- Backends: default host-staged EP2 and `OPENINFER_DSV2_LITE_EP_BACKEND=nccl`. - Accuracy oracle: the same generated token/text/hash gate used by `hf-accuracy-gate.md`. - Attribution source: `DeepSeekV2LiteEp2Generator::generate_greedy_with_attribution` for `batch-size=1`, and `DeepSeekV2LiteEp2Generator::generate_greedy_batch_same_prompt_with_attribution` for `batch-size>1`. - GPU attribution source: CUDA events around selected stream sections in the explicit attribution path. -- NVTX source: set `PEGAINFER_DSV2_LITE_NVTX=1` to emit matching ranges for those selected sections during a profiler run. +- NVTX source: set `OPENINFER_DSV2_LITE_NVTX=1` to emit matching ranges for those selected sections during a profiler run. Out of scope: - sparse dispatch; -- pegainfer-comm / NVLink backend; +- openinfer-comm / NVLink backend; - multi-node or generic EP topology; - production continuous batching or broader prompts; - performance improvement or throughput claims. 
@@ -54,14 +54,14 @@ python tools/accuracy/hf_dump_dsv2_lite_ep2_greedy.py \ --output-len 16 \ --out target/accuracy/dsv2-lite-ep2/hf.json -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ -PEGAINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/host-staged.json \ - cargo test --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ +OPENINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/host-staged.json \ + cargo test --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ -PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ -PEGAINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/nccl.json \ - cargo test --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ +OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ +OPENINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/nccl.json \ + cargo test --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \ --hf target/accuracy/dsv2-lite-ep2/hf.json \ @@ -71,18 +71,18 @@ python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \ --require-all-exact ``` -Then collect attribution for the same two pegainfer backends. Use `--batch-size 1` for the original single-row gate, and `--batch-size 4` / `--batch-size 8` for the true-batch benchmark attribution shape: +Then collect attribution for the same two openinfer backends. 
Use `--batch-size 1` for the original single-row gate, and `--batch-size 4` / `--batch-size 8` for the true-batch benchmark attribution shape: ```bash -cargo run --release -p pegainfer-deepseek-v2-lite \ +cargo run --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite \ --bin dsv2_lite_ep2_decode_attribution \ -- --model-path models/DeepSeek-V2-Lite \ --batch-size 1 \ --out target/accuracy/dsv2-lite-ep2/host-staged-attribution.json -PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ - cargo run --release -p pegainfer-deepseek-v2-lite \ +OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ + cargo run --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite \ --bin dsv2_lite_ep2_decode_attribution \ -- --model-path models/DeepSeek-V2-Lite \ @@ -90,15 +90,15 @@ PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ --out target/accuracy/dsv2-lite-ep2/nccl-attribution.json for batch in 4 8; do - cargo run --release -p pegainfer-deepseek-v2-lite \ + cargo run --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite \ --bin dsv2_lite_ep2_decode_attribution \ -- --model-path models/DeepSeek-V2-Lite \ --batch-size "$batch" \ --out "target/accuracy/dsv2-lite-ep2/host-staged-batch${batch}-attribution.json" - PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ - cargo run --release -p pegainfer-deepseek-v2-lite \ + OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ + cargo run --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite \ --bin dsv2_lite_ep2_decode_attribution \ -- --model-path models/DeepSeek-V2-Lite \ @@ -107,13 +107,13 @@ for batch in 4 8; do done ``` -For an Nsight Systems pass, run the same attribution command under the profiler and set `PEGAINFER_DSV2_LITE_NVTX=1`; the JSON `coverage` row then records `nvtx_ranges=emitted`. The NVTX labels are correlation markers for the selected GPU/NCCL sections, not timing evidence by themselves. 
Their wall-clock span can include CPU-side wrapper work, event setup, and synchronization around the section, so compare JSON `by_gpu_*` rows only with CUDA event timing, not with raw NVTX range duration. +For an Nsight Systems pass, run the same attribution command under the profiler and set `OPENINFER_DSV2_LITE_NVTX=1`; the JSON `coverage` row then records `nvtx_ranges=emitted`. The NVTX labels are correlation markers for the selected GPU/NCCL sections, not timing evidence by themselves. Their wall-clock span can include CPU-side wrapper work, event setup, and synchronization around the section, so compare JSON `by_gpu_*` rows only with CUDA event timing, not with raw NVTX range duration. To inspect the CUDA Graph readiness boundary for the current NCCL backend, run the attribution binary with the optional smoke flag: ```bash -PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ - cargo run --release -p pegainfer-deepseek-v2-lite \ +OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ + cargo run --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite \ --bin dsv2_lite_ep2_decode_attribution \ -- --model-path models/DeepSeek-V2-Lite \ diff --git a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md index be7991a0..7ac35f70 100644 --- a/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md +++ b/docs/models/deepseek-v2-lite/device-resident-nccl-combine.md @@ -12,16 +12,16 @@ Last touched: 2026-06 - `docs/models/deepseek-v2-lite/decode-attribution-gate.md` - acceptance uses the `Hello` / 16-token HF / host-staged / NCCL gate plus graph-readiness blockers. - `docs/models/deepseek-v2-lite/hf-accuracy-gate.md` - same-host HF, host-staged, and NCCL token/text exactness is the correctness standard. - `docs/models/deepseek-v2-lite/source-layout.md` - runtime responsibilities are split, and issue #275 was intentionally left as follow-up work. 
- - `pegainfer-deepseek-v2-lite/src/runtime/moe.rs` - the pre-#275 NCCL combine path accumulated routed expert outputs in host `Vec` buffers, then copied H2D for NCCL and D2H before final H2D conversion. - - `pegainfer-deepseek-v2-lite/src/nccl_backend.rs` - the pre-#275 NCCL combine path allocated send/recv device buffers inside each call and synchronized both streams. - - `pegainfer-deepseek-v2-lite/src/runtime/readiness.rs` - the pre-#275 readiness report listed combine H2D, allocation, sync, and D2H blockers. - - `pegainfer-kernels/src/ops/elementwise.rs` and `pegainfer-kernels/csrc/shared/elementwise.cu` - existing device f32/bf16 conversion helpers could be reused, but there was no f32 accumulation helper for bf16 expert output. + - `openinfer-deepseek-v2-lite/src/runtime/moe.rs` - the pre-#275 NCCL combine path accumulated routed expert outputs in host `Vec` buffers, then copied H2D for NCCL and D2H before final H2D conversion. + - `openinfer-deepseek-v2-lite/src/nccl_backend.rs` - the pre-#275 NCCL combine path allocated send/recv device buffers inside each call and synchronized both streams. + - `openinfer-deepseek-v2-lite/src/runtime/readiness.rs` - the pre-#275 readiness report listed combine H2D, allocation, sync, and D2H blockers. + - `openinfer-kernels/src/ops/elementwise.rs` and `openinfer-kernels/csrc/shared/elementwise.cu` - existing device f32/bf16 conversion helpers could be reused, but there was no f32 accumulation helper for bf16 expert output. - **Relevant history**: - `docs/models/deepseek-v2-lite/status.md` - NCCL plus CUDA Graph is the preferred direction, but the current gate must not be described as production EP. - `docs/models/deepseek-v2-lite/source-layout.md` - local macOS checks are not enough for this path; remote 2-GPU validation is the real acceptance path. - **Implemented**: 1. Add a shared CUDA helper that accumulates a bf16 single-token expert output into a f32 device contribution buffer at a selected token row. - 2. 
Re-export that helper through `pegainfer-core::ops`. + 2. Re-export that helper through `openinfer-core::ops`. 3. Add reusable NCCL combine scratch buffers inside `NaiveNcclEp2Backend`, clear the f32 send scratch per MoE call, accumulate local/remote expert outputs on the owning device, all-reduce device buffers, and cast rank0 f32 result to bf16 on device. 4. Update graph-readiness blockers and attribution wording so removed combine H2D/D2H/allocation/sync blockers are no longer claimed, while remaining host routing and dense-exchange blockers stay explicit. 5. Run formatting and local compile gates, then use the provided remote GPU host for the DeepSeek-V2-Lite EP2 exactness and attribution gates. @@ -37,9 +37,9 @@ Validated 2026-06-08 on the provided 2x RTX 5090 host with DeepSeek-V2-Lite snap Commands run: ```bash -cargo test --offline --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 --no-run +cargo test --offline --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 --no-run -cargo clippy --offline --release -p pegainfer-deepseek-v2-lite \ +cargo clippy --offline --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite --bins --tests -- \ -D warnings \ -A clippy::option_option \ @@ -52,14 +52,14 @@ python tools/accuracy/hf_dump_dsv2_lite_ep2_greedy.py \ --output-len 16 \ --out target/accuracy/dsv2-lite-ep2/hf.json -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ -PEGAINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/host-staged.json \ - cargo test --offline --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ +OPENINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/host-staged.json \ + cargo test --offline --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ 
-PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ -PEGAINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/nccl-after-decouple-cleanup.json \ - cargo test --offline --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ +OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ +OPENINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/nccl-after-decouple-cleanup.json \ + cargo test --offline --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \ --hf target/accuracy/dsv2-lite-ep2/hf.json \ @@ -68,8 +68,8 @@ python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \ --out target/accuracy/dsv2-lite-ep2/comparison-after-decouple-cleanup.json \ --require-all-exact -PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ - cargo run --offline --release -p pegainfer-deepseek-v2-lite \ +OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ + cargo run --offline --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite \ --bin dsv2_lite_ep2_decode_attribution \ -- --model-path models/DeepSeek-V2-Lite \ @@ -84,14 +84,14 @@ Results: - Token SHA256: `4fb4c8825fe4d2c4a1d966da25c259abdf675f4de4548daa5d41aea7dfe30225`. - Text SHA256: `0eedf11429e9ac13bb799c31665c6e9f70a1ac4493a08a3f3da9ecf39c1ec347`. - Candidate NCCL attribution: `gpu_timing.sample_count=8384`, `failure_count=0`. -- Initial remote cleanup gate: package `--bins --tests` clippy passed with only three explicit allows for then-existing lints (`pegainfer-core::logging` `option_option`, and two `host_ops` test lints). +- Initial remote cleanup gate: package `--bins --tests` clippy passed with only three explicit allows for then-existing lints (`openinfer-core::logging` `option_option`, and two `host_ops` test lints). 
Follow-up review gate on 2026-06-09 after fixing those lints: ```bash cargo fmt --all --check -cargo clippy --release -p pegainfer-deepseek-v2-lite \ +cargo clippy --release -p openinfer-deepseek-v2-lite \ --features deepseek-v2-lite --bins --tests -- -D warnings ``` diff --git a/docs/models/deepseek-v2-lite/hf-accuracy-gate.md b/docs/models/deepseek-v2-lite/hf-accuracy-gate.md index e1e17d28..61c3680a 100644 --- a/docs/models/deepseek-v2-lite/hf-accuracy-gate.md +++ b/docs/models/deepseek-v2-lite/hf-accuracy-gate.md @@ -2,7 +2,7 @@ > **TL;DR:** HF comparison gate for issue #135 after PR #149 and PR #150. The remaining correctness question was not NCCL performance; it was whether the existing DeepSeek-V2-Lite EP=2 baseline matches Hugging Face `generate(use_cache=true)` greedy decode for `prompt="Hello"`, `batch=1`, `output_len=16`. > -> **Status:** Passing for the covered shape. The latest run is token-exact and text-exact across HF `generate(use_cache=true)` greedy, pegainfer host-staged EP2, and pegainfer NCCL EP2. +> **Status:** Passing for the covered shape. The latest run is token-exact and text-exact across HF `generate(use_cache=true)` greedy, openinfer host-staged EP2, and openinfer NCCL EP2. ## Scope @@ -10,7 +10,7 @@ In scope: - HF truth: `AutoTokenizer` and `AutoModelForCausalLM` with `trust_remote_code=True`, `torch_dtype=torch.bfloat16`, `model.eval()`, and `torch.no_grad()`. - Generation shape: batch `1`, prompt `Hello`, prompt token ids `[17464]`, output length `16`, greedy argmax only. -- Pegainfer paths: default host-staged EP2 backend and explicit `PEGAINFER_DSV2_LITE_EP_BACKEND=nccl`. +- Openinfer paths: default host-staged EP2 backend and explicit `OPENINFER_DSV2_LITE_EP_BACKEND=nccl`. - Result comparison: generated token ids, generated text, token sha256, text sha256, and first different generated-token index. 
Out of scope: @@ -24,15 +24,15 @@ Out of scope: | Issue / maintainer requirement | Covered by | Evidence | | --- | --- | --- | -| DeepSeek-V2-Lite config loads independently from DeepSeek V4 assumptions. | PR #149 | Dedicated `pegainfer-deepseek-v2-lite` config/weight/model crate. | +| DeepSeek-V2-Lite config loads independently from DeepSeek V4 assumptions. | PR #149 | Dedicated `openinfer-deepseek-v2-lite` config/weight/model crate. | | Single-node `ep_size=2` validates rank, expert ownership, and local expert count. | PR #149 | EP layout is fixed to rank 0 experts `0..31` and rank 1 experts `32..63`, with load-time validation. | | Each rank only loads its owned 32 routed experts. | PR #149 | Driver rank loads rank 0 experts; expert rank loads only rank 1 routed experts. | | Unsupported backend/topology reports explicit errors. | PR #149 / #150 | Unsupported device count, duplicate devices, cuda_graph, and backend names fail closed. | | Minimal dispatch/combine path exists for the first correctness gate. | PR #149 | Host-staged dispatch/combine path remains the default baseline. | -| Maintainer-requested naive NCCL backend exists before pegainfer-comm/NVLink work. | PR #150 | `PEGAINFER_DSV2_LITE_EP_BACKEND=nccl` path passes the same EP2 greedy E2E as host-staged. | +| Maintainer-requested naive NCCL backend exists before openinfer-comm/NVLink work. | PR #150 | `OPENINFER_DSV2_LITE_EP_BACKEND=nccl` path passes the same EP2 greedy E2E as host-staged. | | HF ground-truth accuracy comparison exists. | This gate | HF `generate(use_cache=true)` greedy, host-staged EP2, and NCCL EP2 are token/text exact for the covered shape. | -Together with PR #149 and PR #150, this gate covers issue #135's correctness-first acceptance surface for the narrow EP=2 milestone. Follow-up work should be tracked separately for sparse/GPU dispatch, pegainfer-comm/NVLink integration, performance evidence, long context, and broader prompts/batches. 
+Together with PR #149 and PR #150, this gate covers issue #135's correctness-first acceptance surface for the narrow EP=2 milestone. Follow-up work should be tracked separately for sparse/GPU dispatch, openinfer-comm/NVLink integration, performance evidence, long context, and broader prompts/batches. ## Commands @@ -47,14 +47,14 @@ python tools/accuracy/hf_dump_dsv2_lite_ep2_greedy.py \ --output-len 16 \ --out target/accuracy/dsv2-lite-ep2/hf.json -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ -PEGAINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/host-staged.json \ - cargo test --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ +OPENINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/host-staged.json \ + cargo test --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ -PEGAINFER_DSV2_LITE_EP_BACKEND=nccl \ -PEGAINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/nccl.json \ - cargo test --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V2-Lite \ +OPENINFER_DSV2_LITE_EP_BACKEND=nccl \ +OPENINFER_DSV2_LITE_E2E_JSON_OUT=target/accuracy/dsv2-lite-ep2/nccl.json \ + cargo test --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --test e2e_ep2 -- --nocapture python tools/accuracy/compare_dsv2_lite_ep2_outputs.py \ --hf target/accuracy/dsv2-lite-ep2/hf.json \ @@ -71,20 +71,20 @@ On Blackwell-class GPUs, make sure the selected NCCL runtime supports the device ## Interpretation - `all_token_text_exact`: HF, host-staged, and NCCL agree on generated token ids and generated text. -- `pegainfer_baseline_accuracy_gap`: host-staged and NCCL match each other, but both differ from HF. 
Treat this as a pegainfer baseline accuracy problem before touching NCCL transport. +- `openinfer_baseline_accuracy_gap`: host-staged and NCCL match each other, but both differ from HF. Treat this as an openinfer baseline accuracy problem before touching NCCL transport. - `nccl_transport_regression`: host-staged and NCCL differ. Debug the NCCL path before drawing any HF parity conclusion. ## Latest Evidence 2026-05-30, single-node 2 GPU validation with the same `models/DeepSeek-V2-Lite` snapshot for all three outputs. The model snapshot metadata recorded commit `604d5664dddd88a0433dbae533b7fe9472482de0`. The HF truth source used `AutoModelForCausalLM.generate(..., do_sample=false, use_cache=true)` with `torch==2.7.0+cu128` and `transformers==4.40.2` on 2x A800-SXM4-80GB: -The comparison gate must be run with an HF JSON dumped on the same model directory and runtime as the pegainfer outputs. The Rust E2E keeps known HF-confirmed hash pairs for this narrow `Hello`/16 shape because the same snapshot has produced different greedy text on RTX 5090 and A800 while still matching HF on each host. This does not claim a model-runtime improvement, a manual-loop root cause, or a transport issue. +The comparison gate must be run with an HF JSON dumped on the same model directory and runtime as the openinfer outputs. The Rust E2E keeps known HF-confirmed hash pairs for this narrow `Hello`/16 shape because the same snapshot has produced different greedy text on RTX 5090 and A800 while still matching HF on each host. This does not claim a model-runtime improvement, a manual-loop root cause, or a transport issue.
| Source | Backend | Tokens | Token SHA256 | Text SHA256 | Text | | --- | --- | ---: | --- | --- | --- | | HF | `generate(use_cache=true)` | 16 | `d05a7b0f0ac6435fb51040582a337d8b6d72844dd61194daa1b3090fa0e16ce8` | `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6` | `, I am a 20 year old female and I have been having a` | -| pegainfer | host-staged | 16 | `d05a7b0f0ac6435fb51040582a337d8b6d72844dd61194daa1b3090fa0e16ce8` | `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6` | `, I am a 20 year old female and I have been having a` | -| pegainfer | NCCL | 16 | `d05a7b0f0ac6435fb51040582a337d8b6d72844dd61194daa1b3090fa0e16ce8` | `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6` | `, I am a 20 year old female and I have been having a` | +| openinfer | host-staged | 16 | `d05a7b0f0ac6435fb51040582a337d8b6d72844dd61194daa1b3090fa0e16ce8` | `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6` | `, I am a 20 year old female and I have been having a` | +| openinfer | NCCL | 16 | `d05a7b0f0ac6435fb51040582a337d8b6d72844dd61194daa1b3090fa0e16ce8` | `4aaafbe4b3a46bc5b9ab5ea8d09d5fad71225006c2e234e87a928e3265b387c6` | `, I am a 20 year old female and I have been having a` | Known HF-confirmed static E2E pairs for snapshot `604d5664dddd88a0433dbae533b7fe9472482de0`: diff --git a/docs/models/deepseek-v2-lite/source-layout.md b/docs/models/deepseek-v2-lite/source-layout.md index 739778a5..65fb2f91 100644 --- a/docs/models/deepseek-v2-lite/source-layout.md +++ b/docs/models/deepseek-v2-lite/source-layout.md @@ -22,7 +22,7 @@ boundaries: ## Layout -`pegainfer-deepseek-v2-lite/src/runtime.rs` is now a facade that keeps the +`openinfer-deepseek-v2-lite/src/runtime.rs` is now a facade that keeps the public generator and result exports stable. Implementation moved into: | File | Responsibility | @@ -38,7 +38,7 @@ public generator and result exports stable. 
Implementation moved into: ## What Stayed -- Public exports from `pegainfer-deepseek-v2-lite/src/lib.rs` still expose +- Public exports from `openinfer-deepseek-v2-lite/src/lib.rs` still expose `DeepSeekV2LiteEp2Generator`, `GenerationResult`, `BatchedGenerationResult`, `GenerationStats`, and `DecodeGraphReadinessReport`. @@ -61,12 +61,12 @@ cargo fmt --all --check Both passed. Remote validation ran on Ubuntu 22.04 with 2x NVIDIA GeForce RTX 5090, driver -580.105.08, CUDA 12.8, `PEGAINFER_CUDA_SM=120`, and -`PEGAINFER_TRITON_PYTHON=/root/autodl-tmp/pegainfer-triton-venv/bin/python`. +580.105.08, CUDA 12.8, `OPENINFER_CUDA_SM=120`, and +`OPENINFER_TRITON_PYTHON=/root/autodl-tmp/openinfer-triton-venv/bin/python`. Passed gates: -- `cargo check --offline --release -p pegainfer-deepseek-v2-lite --features deepseek-v2-lite --lib --tests` +- `cargo check --offline --release -p openinfer-deepseek-v2-lite --features deepseek-v2-lite --lib --tests` - HF oracle dump with `tools/accuracy/hf_dump_dsv2_lite_ep2_greedy.py` - host-staged `tests/e2e_ep2.rs` - NCCL `tests/e2e_ep2.rs` using `LD_LIBRARY_PATH=/root/autodl-tmp/nccl-2.27.7/nvidia/nccl/lib` diff --git a/docs/models/deepseek-v2-lite/status.md b/docs/models/deepseek-v2-lite/status.md index 30c04559..2c6b1982 100644 --- a/docs/models/deepseek-v2-lite/status.md +++ b/docs/models/deepseek-v2-lite/status.md @@ -28,7 +28,7 @@ The retained correctness gate is deliberately narrow: - prompt token ids: `[17464]`; - output length: `16`; - generation mode: greedy; -- backends: host-staged and `PEGAINFER_DSV2_LITE_EP_BACKEND=nccl`. +- backends: host-staged and `OPENINFER_DSV2_LITE_EP_BACKEND=nccl`. The comparison gate must be run on the same model snapshot for HF, host-staged, and NCCL outputs. Same-host comparison remains strict: HF, host-staged, and NCCL must be token-exact and text-exact. Host-staged remains the baseline oracle for NCCL transport changes. 
@@ -62,12 +62,12 @@ PR #196 extends attribution for the same direct diagnostic shapes. The retained In response to issue #170's request for a vLLM TP2+EP2 or pure TP2 comparison, a manual same-model snapshot was collected with `vllm bench serve` concurrency pressure `1`, `4`, and `8`. -This table is retained only to document the current gap. It is not evidence of a complete, fair production-serving parity comparison, and `--max-concurrency` should be read as concurrent request pressure, not as proof of true internal PegaInfer batch size. +This table is retained only to document the current gap. It is not evidence of a complete, fair production-serving parity comparison, and `--max-concurrency` should be read as concurrent request pressure, not as proof of true internal OpenInfer batch size. | Engine | Mode | conc=1 TPOT ms | conc=4 TPOT ms | conc=8 TPOT ms | Output tok/s at 1/4/8 | | --- | --- | ---: | ---: | ---: | --- | -| PegaInfer | host-staged | 49.95 | 51.30 | 51.22 | 19.84 / 19.53 / 19.56 | -| PegaInfer | NCCL | 178.31 | 173.22 | 174.46 | 5.59 / 5.77 / 5.73 | +| OpenInfer | host-staged | 49.95 | 51.30 | 51.22 | 19.84 / 19.53 / 19.56 | +| OpenInfer | NCCL | 178.31 | 173.22 | 174.46 | 5.59 / 5.77 / 5.73 | | vLLM | TP2 default | 35.61 | 36.43 | 36.37 | 27.54 / 97.72 / 195.28 | | vLLM | TP2+EP2 default | 34.15 | 34.97 | 34.88 | 28.87 / 101.52 / 204.08 | @@ -75,7 +75,7 @@ Interpretation: - at single-concurrency TPOT, host-staged is closer to vLLM than the current NCCL backend; - NCCL remains a correctness-first backend and is still significantly slower than host-staged; -- PegaInfer HTTP throughput did not scale with concurrency in this snapshot, so serving batching remains open; +- OpenInfer HTTP throughput did not scale with concurrency in this snapshot, so serving batching remains open; - vLLM TP2+EP2 worked in this environment and should stay in future comparison matrices. 
## Claim Boundaries @@ -86,7 +86,7 @@ Use these labels consistently: | --- | --- | --- | | `direct single-row` | In-process batch `1` decode. | HTTP serving throughput. | | `direct same-prompt diagnostic batch` | Fixed same-prompt direct batch sizes `1/4/8`. | Production continuous batching or mixed-request scheduling. | -| `HTTP concurrency pressure` | `vllm bench serve --max-concurrency N` against an HTTP endpoint. | True PegaInfer batch size unless the engine path proves it. | +| `HTTP concurrency pressure` | `vllm bench serve --max-concurrency N` against an HTTP endpoint. | True OpenInfer batch size unless the engine path proves it. | Do not claim: @@ -113,8 +113,8 @@ The next implementation should be chosen from measured evidence: - judge issue #170 by whether it reduces NCCL decode overhead and makes the path more graph-friendly. 2. Keep a fair serving benchmark contract around future performance work. - - PegaInfer host-staged. - - PegaInfer NCCL. + - OpenInfer host-staged. + - OpenInfer NCCL. - vLLM TP2. - vLLM TP2+EP2 when supported. - default vLLM configuration plus a controlled configuration with cache/flag choices recorded. diff --git a/docs/models/deepseek-v4/decode-performance.md b/docs/models/deepseek-v4/decode-performance.md index ca114d91..160fc91b 100644 --- a/docs/models/deepseek-v4/decode-performance.md +++ b/docs/models/deepseek-v4/decode-performance.md @@ -37,7 +37,7 @@ Current evidence: | Requirement | Evidence | Status | | --- | --- | --- | | Main objective: stable sub-`25ms/token` DeepSeek V4 decode without bs=1 or seq_len=1 specialization | Best retained repeats reached the `26.28-27.31ms/token` band; fresh 5-run stability sweep after the latest rejected act_quant probe is `28.29-28.91ms` aggregate steady TPOT while another CPU load was running | Not achieved. Keep the goal active. 
| -| Fixed bench stable sub-30 with hash `6346f03343d75a65` | `$RESULT_ROOT/dsv4_stability_after_act_quant_revert_{1..5}.json` records 5 consecutive fixed bench runs, aggregate steady TPOT avg `28.291-28.912ms`, and all 15 per-iteration hashes `6346f03343d75a65`; reviewer rerun `$RESULT_ROOT/pegainfer_dev_pr101_bench_{1..5}.json` observed aggregate steady TPOT avg `27.552965-29.755957ms`, again with all 15 hashes `6346f03343d75a65` | Achieved for the retained tree. | +| Fixed bench stable sub-30 with hash `6346f03343d75a65` | `$RESULT_ROOT/dsv4_stability_after_act_quant_revert_{1..5}.json` records 5 consecutive fixed bench runs, aggregate steady TPOT avg `28.291-28.912ms`, and all 15 per-iteration hashes `6346f03343d75a65`; reviewer rerun `$RESULT_ROOT/openinfer_dev_pr101_bench_{1..5}.json` observed aggregate steady TPOT avg `27.552965-29.755957ms`, again with all 15 hashes `6346f03343d75a65` | Achieved for the retained tree. | | Exact E2E remains `20/20` | `$RESULT_ROOT/dsv4_fresh_e2e_after_w2_reduce_doc.log` records `All 20 DeepSeek V4 exact cases passed` | Achieved for the retained tree. | | Public vLLM/SGLang MoE decomposition is replicated first | Runtime uses routed FP4 `W13 grouped GEMM -> fused SwiGLU + W2 activation quant -> W2 grouped GEMM`; old split W1/W3/SwiGLU/W2 public and FFI paths are removed | Achieved. | | Deeper W13 accumulator -> SwiGLU -> W2-quant path is explored only after microbench/fuzz | TileLang W13 accumulator prototype was compiled after lowering fixes but failed the first active-expert fuzz shape, so it was removed before runtime integration | Explored and rejected; still open as a future true tensor-core epilogue project. | @@ -50,14 +50,14 @@ Audit conclusion: the goal is not complete because stable sub-`25ms/token` has n The goal is to copy the mature decomposition and validation discipline, not the framework surface. 
This table is the current "homework ledger" for DeepSeek V4 decode MoE: -| Source idea | vLLM/SGLang anchor | PegaInfer status | Decision | +| Source idea | vLLM/SGLang anchor | OpenInfer status | Decision | | --- | --- | --- | --- | | Experts core decomposition: `W13 grouped GEMM -> activation/quant -> W2 grouped GEMM` | vLLM `docs/design/fused_moe_modular_kernel.md`; vLLM `fused_moe/modular_kernel.py`; SGLang `srt/layers/moe/moe_runner/triton.py` | Retained as routed FP4 W13 grouped launch plus fused SwiGLU+W2 activation quant plus W2 grouped FP4 launch | Adopted. This is the baseline decomposition and the old split W1/W3/SwiGLU/W2 public path is removed. | -| Prepare/finalize can be separate from experts | vLLM `FusedMoEPrepareAndFinalizeModular`; SGLang EP MoE dispatcher/finalizer split | Our AG/RS, route mapping, local experts, partial combine, and reduce-scatter are explicit stages | Adopted selectively. We keep the simpler PegaInfer scheduler/worker structure rather than importing generic dispatch classes. | +| Prepare/finalize can be separate from experts | vLLM `FusedMoEPrepareAndFinalizeModular`; SGLang EP MoE dispatcher/finalizer split | Our AG/RS, route mapping, local experts, partial combine, and reduce-scatter are explicit stages | Adopted selectively. We keep the simpler OpenInfer scheduler/worker structure rather than importing generic dispatch classes. | | Async prepare/finalize enables shared expert overlap | vLLM `prepare_no_receive`; vLLM modular-kernel doc notes shared expert overlap during communication | MoE hidden/token all-gather and reduce-scatter run on a MoE NCCL stream while shared expert runs on the main compute stream | Adopted. Full shared-compute-stream overlap changed token hash and was rejected. 
| | `TopKWeightAndReduce` may live inside experts | vLLM `topk_weight_and_reduce.py`; vLLM `FusedMoEExpertsModular::finalize_weight_and_reduce_impl` | Atomic epilogue-shaped microbench was slower than current deterministic reduce | Rejected for current layout. This needs a token-major or deterministic W2 scheduler, not atomics bolted onto expert-major TileLang W2. | | W13 layout should match fused SwiGLU convention | vLLM `oracle/mxfp4.py`; vLLM `quantization/utils/flashinfer_utils.py`; SGLang `moe_runner/flashinfer_cutedsl.py` | Pair-interleaved `[up, gate]` standalone SwiGLU+quant was byte-identical but mostly flat and tiny | Rejected as standalone. Keep the note for a true W13 epilogue fusion. | -| FlashInfer/TRTLLM/CuteDSL FP4 MoE backends use specialized weight/scale reorder | vLLM `experts/trtllm_mxfp4_moe.py`; vLLM `oracle/mxfp4.py`; SGLang `quantization/mxfp4.py`; SGLang `moe_runner/flashinfer_trtllm.py` | Not integrated. Current PegaInfer weights are per-expert tensors and TileLang grouped GEMM takes pointer arrays; FlashInfer routes expect different packed/reordered layouts and runner-level metadata | Candidate, but only after a standalone grouped-GEMM microbench proves a real W13/W2 win on our exact shapes. Do not import the framework runner. | +| FlashInfer/TRTLLM/CuteDSL FP4 MoE backends use specialized weight/scale reorder | vLLM `experts/trtllm_mxfp4_moe.py`; vLLM `oracle/mxfp4.py`; SGLang `quantization/mxfp4.py`; SGLang `moe_runner/flashinfer_trtllm.py` | Not integrated. Current OpenInfer weights are per-expert tensors and TileLang grouped GEMM takes pointer arrays; FlashInfer routes expect different packed/reordered layouts and runner-level metadata | Candidate, but only after a standalone grouped-GEMM microbench proves a real W13/W2 win on our exact shapes. Do not import the framework runner. 
| | DeepGEMM-style deeper epilogue fusion | vLLM `experts/deep_gemm_moe.py`; SGLang DeepGEMM benchmarks under `benchmark/kernels/deepseek` | Scalar upper-bound microbench shows exact feasibility but absolute standalone delta is tiny | Candidate only as true tensor-core W13 epilogue work. Standalone SwiGLU/quant substitutions are no longer enough. | | FP4 quant before communication for high-throughput all-gather | SGLang `srt/layers/moe/utils.py::should_use_flashinfer_cutlass_moe_fp4_allgather` | Not adopted. Our current AG gathers BF16 hidden before routing; changing this means routing/dispatch protocol changes, not a local kernel swap | Future architecture work. Needs correctness design because router consumes hidden before expert dispatch. | | High-throughput bs>100 packed MoE layout | vLLM/SGLang packed FP4/MXFP4 backends and dispatcher/finalizer layouts | Not part of the current sub-25 latency patch. Current per-expert tensors are good for low-latency iteration but probably not the final throughput layout | Future architecture work. Design W13/W2 weight layout, FP4 scale layout, dispatch row layout, and combine/finalize together; keep conversion offline/load-time and avoid two production hot paths. | @@ -261,7 +261,7 @@ The later retained path uses the same MoE NCCL stream for the earlier hidden/tok 4. main stream waits on all-gather completion before router, local experts, and routed combine. 5. routed reduce-scatter still uses the MoE NCCL stream and the final add waits on its completion event. -This does not change route math, grouped GEMM shape, or batch/expert generality. It is not copied directly from vLLM/SGLang operator code; it is a PegaInfer scheduling step that becomes available once the vLLM/SGLang-style local expert path is exact and stable. The first reduce-scatter-only fixed bench run moved to `26.77-26.80ms/token`; two repeats landed in the `27.64-27.99ms/token` band; the post full-overlap revert calibration landed at `28.38-28.54ms/token`. 
Moving MoE all-gather to the same NCCL stream and overlapping shared expert with all-gather produced fresh repeated fixed benches at `26.28-27.31ms/token`, still with token hash `6346f03343d75a65`. Keep decision: retain. This is safer than the rejected full shared-expert overlap because shared expert stays on the main compute stream; only MoE collectives move to the MoE NCCL stream. +This does not change route math, grouped GEMM shape, or batch/expert generality. It is not copied directly from vLLM/SGLang operator code; it is an OpenInfer scheduling step that becomes available once the vLLM/SGLang-style local expert path is exact and stable. The first reduce-scatter-only fixed bench run moved to `26.77-26.80ms/token`; two repeats landed in the `27.64-27.99ms/token` band; the post full-overlap revert calibration landed at `28.38-28.54ms/token`. Moving MoE all-gather to the same NCCL stream and overlapping shared expert with all-gather produced fresh repeated fixed benches at `26.28-27.31ms/token`, still with token hash `6346f03343d75a65`. Keep decision: retain. This is safer than the rejected full shared-expert overlap because shared expert stays on the main compute stream; only MoE collectives move to the MoE NCCL stream. 
### Fused Q/KV RoPE projection @@ -366,7 +366,7 @@ Validation on 5090: | Check | Result | | --- | --- | | `cargo fmt --check` | passed | -| `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench run 1 | steady TPOT avg `33.330ms`, p50 `32.858ms`, p95 `35.274ms`, hash `6346f03343d75a65` | | fixed bench run 2 | steady TPOT avg `34.289ms`, p50 `33.979ms`, p95 `36.852ms`, hash `6346f03343d75a65` | @@ -388,7 +388,7 @@ blockIdx.x 0..15 -> W1 pointer arrays -> gate output blockIdx.x 16..31 -> W3 pointer arrays -> up output ``` -The C++ tool `pegainfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu` links the generated TileLang object directly and compares: +The C++ tool `openinfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu` links the generated TileLang object directly and compares: ```text baseline: grouped_gemm(W1) + grouped_gemm(W3) @@ -400,10 +400,10 @@ Fuzz uses BF16 random input, TileLang `act_quant_k4096`, random FP4 bytes and bo Verified compile command shape: ```bash -OUT_DIR=$(find target/release/build/pegainfer-kernels-* -maxdepth 1 -type d -name out -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) +OUT_DIR=$(find target/release/build/openinfer-kernels-* -maxdepth 1 -type d -name out -printf '%T@ %p\n' | sort -nr | head -1 | cut -d' ' -f2-) /usr/local/cuda/bin/nvcc -std=c++17 -O3 -arch=sm_120 \ -I/usr/local/cuda/include \ - pegainfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu \ + openinfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu \ "$OUT_DIR/libkernels_cuda.a" \ -lcudart \ -o $RESULT_ROOT/w13_grouped_fp4_bench @@ -441,7 +441,7 @@ Runtime validation on 5090: | Check | Result | | --- | --- | | `cargo fmt --check` | passed | -| `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| `cargo check 
--release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench text run | steady TPOT avg `34.22ms`, p50 `33.77ms`, p95 `36.53ms`, first decode avg `32.94ms` | | fixed bench JSON run | steady TPOT avg `31.986ms`, p50 `31.458ms`, p95 `34.052ms`, first decode avg `30.544ms`, hash `6346f03343d75a65` | @@ -461,11 +461,11 @@ Reference source positions: | SGLang | `$LOCAL_WORKSPACE/sglang/python/sglang/srt/layers/moe/moe_runner/deep_gemm.py` | `grouped_gemm_nt_f8f8bf16_masked` writes `gateup_output`, then `sglang_per_token_group_quant_8bit(..., fuse_silu_and_mul=True)`, then W2 grouped GEMM. | | SGLang C++ quant | `$LOCAL_WORKSPACE/sglang/sgl-kernel/csrc/gemm/per_token_group_quant_8bit_v2.cu` | The `fuse_silu_and_mul` path fuses activation with group quant, including masked expert layout. | -The next reusable lesson is their problem-size representation. vLLM builds `expert_offsets`, `blockscale_offsets`, `problem_sizes1`, and `problem_sizes2` before CUTLASS grouped GEMM. SGLang's masked path passes `masked_m` and `expected_m` into DeepGEMM. Both make the GEMM scheduler aware of per-expert logical M. PegaInfer currently has `expert_indptr`, but the TileLang grouped launch still uses `dim3 grid(out_tiles, ceil(rows / 32), local_experts)` and returns inside the kernel when `blockIdx.y * 32 >= expert_m`. That is correct and GPU-resident, but it still launches empty CTAs for short or empty experts. +The next reusable lesson is their problem-size representation. vLLM builds `expert_offsets`, `blockscale_offsets`, `problem_sizes1`, and `problem_sizes2` before CUTLASS grouped GEMM. SGLang's masked path passes `masked_m` and `expected_m` into DeepGEMM. Both make the GEMM scheduler aware of per-expert logical M. 
OpenInfer currently has `expert_indptr`, but the TileLang grouped launch still uses `dim3 grid(out_tiles, ceil(rows / 32), local_experts)` and returns inside the kernel when `blockIdx.y * 32 >= expert_m`. That is correct and GPU-resident, but it still launches empty CTAs for short or empty experts. The first active-tile design check found a launch-side constraint: a GPU-generated active tile list cannot by itself shrink the next CUDA launch because grid dimensions are chosen on the host. Using a device-side `active_tile_count` would require a D2H count, CUDA dynamic parallelism, or launching the original capacity grid and returning on `tile >= active_count`. The last option preserves correctness but not the desired launch reduction. A better target is the existing `local_count`: decode route mapping already computes the actual number of local routes on GPU, while runtime still carries `num_expanded = routed.seq_len * topk` (`8 * 6 = 48` for MP8 decode) through expand, activation quant, and grouped GEMM. The hard part is exploiting `local_count` without reintroducing route metadata D2H. 
-Historical PegaInfer path before the retained fused W2 activation-quant work: +Historical OpenInfer path before the retained fused W2 activation-quant work: ```text act_quant(expanded_input) @@ -520,7 +520,7 @@ Runtime validation on 5090: | Check | Result | | --- | --- | | `cargo fmt --check` | passed | -| `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON run 1 | steady TPOT avg `33.416ms`, p50 `32.884ms`, p95 `35.510ms`, first decode avg `31.885ms`, hash `6346f03343d75a65` | | fixed bench JSON run 2 | steady TPOT avg `31.180ms`, p50 `30.675ms`, p95 `33.151ms`, first decode avg `30.020ms`, hash `6346f03343d75a65` | @@ -559,9 +559,9 @@ Implementation notes: | Check | Result | | --- | --- | | local `cargo fmt --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | 5090 `cargo fmt --check` | passed | -| 5090 `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| 5090 `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON run 1 | steady TPOT avg `29.764ms`, p50 `29.296ms`, p95 `31.766ms`, first decode avg `28.575ms`, hash `6346f03343d75a65` | | fixed bench JSON run 2 | steady TPOT avg `31.592ms`, p50 `31.082ms`, p95 `33.699ms`, first decode avg `30.019ms`, hash `6346f03343d75a65` | @@ -600,7 +600,7 @@ These clears are semantic initialization, not removable allocation noise, but th | Check | Result | | --- | --- | | local `cargo fmt --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check 
--release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON | `29.862ms`, `29.969ms`, `29.874ms`, all hash `6346f03343d75a65` | | short nsys kernel summary | old `deepseek_moe_clear_i32_kernel` gone; `deepseek_moe_clear_mapping_kernel` appears once per mapping call | @@ -625,7 +625,7 @@ For MP8 decode, `route_elems = global_batch * topk`; with the fixed single-reque | Check | Result | | --- | --- | | local `cargo fmt --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON run 1 | `27.608ms`, `27.662ms`, `27.826ms`, all hash `6346f03343d75a65` | | fixed bench JSON run 2 | `27.698ms`, `27.693ms`, `27.644ms`, all hash `6346f03343d75a65` | @@ -656,7 +656,7 @@ This was exact-safe but not performance-safe. 
The route W13 kernel launched one | --- | --- | | local `cargo fmt --check` | passed | | local `git diff --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | 5090 release build for `bench_serving` and `deepseek_v4_e2e` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON | aggregate steady TPOT avg `33.217ms`, p50 `33.003ms`, p95 `34.584ms`, decode throughput `30.115 tok/s` | @@ -695,7 +695,7 @@ Runtime validation: | --- | --- | | local `cargo fmt --check` | passed | | local `git diff --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | 5090 release build for `bench_serving` and `deepseek_v4_e2e` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench run 1 | aggregate steady TPOT avg `27.807ms`; iterations `28.080ms`, `28.143ms`, `27.198ms`; all hash `6346f03343d75a65` | @@ -725,7 +725,7 @@ This preserved the no-D2H rule and kept W2 grouped GEMM semantics unchanged. It | --- | --- | | local `cargo fmt --check` | passed | | local `git diff --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | 5090 release build for `bench_serving` and `deepseek_v4_e2e` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON | aggregate steady TPOT avg `31.270ms`, p50 `30.660ms`, p95 `33.575ms` | @@ -735,7 +735,7 @@ Drop decision: do not retain. 
Skipping empty rows inside a tiny regular kernel i ### Rejected: shrink grouped GEMM row-tile launch by seq_len -vLLM's CUTLASS path passes logical per-expert `problem_sizes`, while PegaInfer's TileLang grouped FP4 launch uses a host grid of: +vLLM's CUTLASS path passes logical per-expert `problem_sizes`, while OpenInfer's TileLang grouped FP4 launch uses a host grid of: ```text grid.x = output tiles @@ -767,8 +767,8 @@ Runtime validation on 5090: | --- | --- | | local `cargo fmt --check` | passed | | local `git diff --check` | passed | -| local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | -| 5090 `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` | passed | +| local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | +| 5090 `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` | passed | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench JSON | per-iteration steady TPOT avg `28.504ms`, `28.460ms`, `28.735ms`; all hash `6346f03343d75a65` | @@ -945,7 +945,7 @@ Additional `32768` probes stayed bitwise: | active `8`, rows/active `8` | `0.122960ms -> 0.108667ms` | `0.063493ms -> 0.057330ms` | | active `16`, rows/active `4` | `0.315041ms -> 0.283986ms` | `0.122991ms -> 0.082393ms` | -Runtime change: grouped FP4 W13 and W2 wrappers generated by `pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py` now request `32768` dynamic shared bytes. Dense FP4/FP8 wrappers keep their existing requests. +Runtime change: grouped FP4 W13 and W2 wrappers generated by `openinfer-kernels/tools/tilelang/deepseek_v4/generate.py` now request `32768` dynamic shared bytes. Dense FP4/FP8 wrappers keep their existing requests. 
5090 validation: @@ -1040,7 +1040,7 @@ Full-runtime validation: | Check | Result | | --- | --- | -| release `cargo check -p pegainfer-deepseek-v4 --features deepseek-v4` | passed locally and on 5090 | +| release `cargo check -p openinfer-deepseek-v4 --features deepseek-v4` | passed locally and on 5090 | | release `deepseek_v4_e2e` | `All 20 DeepSeek V4 exact cases passed` | | fixed bench run 1 | aggregate steady TPOT avg `28.971ms`; per-iteration `28.727ms`, `28.963ms`, `29.224ms`; all hash `6346f03343d75a65` | | fixed bench repeat | aggregate steady TPOT avg `29.797ms`; per-iteration `29.913ms`, `29.764ms`, `29.713ms`; all hash `6346f03343d75a65` | @@ -1135,7 +1135,7 @@ Implementation notes: | Check | Result | | --- | --- | -| release `cargo check -p pegainfer-deepseek-v4 --features deepseek-v4` | passed locally and on 5090 after generator fixes | +| release `cargo check -p openinfer-deepseek-v4 --features deepseek-v4` | passed locally and on 5090 after generator fixes | | microbench first fuzz shape | active `1`, rows/active `8`, experts `32` | | fuzz result | FAIL: FP8 activation and E8M0 scale bytes differed from the current baseline | | log | `$RESULT_ROOT/dsv4_w13_swiglu_quant_bench.log` | @@ -1145,7 +1145,7 @@ Drop decision: do not retain. The generator can express the rough shape, but it Evidence required for each adoption step: - vLLM/SGLang source location and whether we copied the decomposition, the kernel shape, or only the validation idea. -- standalone microbench with fuzz against the current PegaInfer baseline. +- standalone microbench with fuzz against the current OpenInfer baseline. - exact E2E `20/20`. - fixed JSON bench with token hash `6346f03343d75a65`. - repeated TPOT range, not a single fast run. 
@@ -1168,7 +1168,7 @@ Earlier exploratory runs of the same BF16 direct shape landed at `26.194ms`, `30 Rejected variant: caching score-gate weights as F32 preserved exact E2E and token hash, but the fixed bench regressed to aggregate steady TPOT avg `29.148ms` with per-iteration `29.152ms`, `29.139ms`, and `29.155ms`. The extra F32 memory footprint and F32 math path were not worth keeping. -Rejected variant: direct CUDA BF16 router projection. SGLang has a `fused_moe_router_cudacore` route in `$LOCAL_WORKSPACE/sglang/python/sglang/srt/layers/moe/router.py`, and TileKernels has warp-level top-k/scoring kernels under `$LOCAL_WORKSPACE/TileKernels/tile_kernels/moe/`. We tested the analogous PegaInfer idea with a temporary standalone bench: keep the existing select/normalization semantics, but replace the cuBLAS BF16 projection with a direct CUDA dot-product kernel over `(seq_len, n_experts, hidden_dim)`. The bench source was deleted after rejection so it cannot be accidentally wired into runtime. +Rejected variant: direct CUDA BF16 router projection. SGLang has a `fused_moe_router_cudacore` route in `$LOCAL_WORKSPACE/sglang/python/sglang/srt/layers/moe/router.py`, and TileKernels has warp-level top-k/scoring kernels under `$LOCAL_WORKSPACE/TileKernels/tile_kernels/moe/`. We tested the analogous OpenInfer idea with a temporary standalone bench: keep the existing select/normalization semantics, but replace the cuBLAS BF16 projection with a direct CUDA dot-product kernel over `(seq_len, n_experts, hidden_dim)`. The bench source was deleted after rejection so it cannot be accidentally wired into runtime. 
5090 microbench: @@ -1715,7 +1715,7 @@ Standalone tool: -O3 \ -std=c++17 \ -arch=sm_120 \ - pegainfer-kernels/tools/deepseek_v4/score_select_bench.cu \ + openinfer-kernels/tools/deepseek_v4/score_select_bench.cu \ -o $RESULT_ROOT/dsv4_score_select_bench $RESULT_ROOT/dsv4_score_select_bench @@ -1823,8 +1823,8 @@ Verified command set for this PR: ```bash cargo fmt --check -cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4 -cargo check --release -p pegainfer-server --features deepseek-v4 +cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4 +cargo check --release -p openinfer-server --features deepseek-v4 gcc -shared -fPIC -O2 -Wall -Wextra -o $RESULT_ROOT/cuda_api_counter.so tools/cuda_api_counter.c -ldl ``` @@ -1833,8 +1833,8 @@ gcc -shared -fPIC -O2 -Wall -Wextra -o $RESULT_ROOT/cuda_api_counter.so tools/cu Local: - `cargo fmt --check` -- `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` -- `cargo check --release -p pegainfer-server --features deepseek-v4` +- `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` +- `cargo check --release -p openinfer-server --features deepseek-v4` - `gcc -shared -fPIC -O2 -Wall -Wextra -o $RESULT_ROOT/cuda_api_counter.so tools/cuda_api_counter.c -ldl` - `nm -D $RESULT_ROOT/cuda_api_counter.so` confirmed base and `_ptsz` wrappers - `git diff --check` @@ -1843,8 +1843,8 @@ Local: 5090: - `cargo fmt --check` -- `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` -- `cargo check --release -p pegainfer-server --features deepseek-v4` +- `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` +- `cargo check --release -p openinfer-server --features deepseek-v4` - release `deepseek_v4_e2e`: `All 20 DeepSeek V4 exact cases passed` - release fixed bench log `$RESULT_ROOT/dsv4_pr_driver_numa_bench.log`: steady TPOT avg `35.253ms`, p50 `34.800ms`, p95 `37.335ms`, first decode avg `33.743ms`, hash `6346f03343d75a65` - current clean 
fixed bench log `$RESULT_ROOT/dsv4_clean_tpot_now.log`: per-iteration steady TPOT avg `29.944ms`, `29.907ms`, `29.896ms`, all hash `6346f03343d75a65` @@ -1889,7 +1889,7 @@ Local: - post-revert hand act_quant exact E2E log `$RESULT_ROOT/dsv4_act_quant_restored_e2e.log`: `All 20 DeepSeek V4 exact cases passed` - post-revert hand act_quant fixed bench logs `$RESULT_ROOT/dsv4_act_quant_restored_bench.json` and `$RESULT_ROOT/dsv4_act_quant_restored_bench_repeat.json`: first run aggregate steady TPOT avg `28.249ms` but one iteration hash changed to `a278a8140c25b812`; repeat aggregate steady TPOT avg `29.277ms`, per-iteration `29.265ms`, `29.272ms`, `29.293ms`, with all hashes restored to `6346f03343d75a65` - post-revert hand act_quant 5-run stability logs `$RESULT_ROOT/dsv4_stability_after_act_quant_revert_{1..5}.json`: aggregate steady TPOT avg `28.912ms`, `28.867ms`, `28.291ms`, `28.375ms`, and `28.715ms`; all 15 per-iteration hashes were `6346f03343d75a65`. Another CPU load was running during this sweep, so the result is a conservative sub-30 stability check rather than a clean machine best-band. -- reviewer 5090 5-run stability rerun `$RESULT_ROOT/pegainfer_dev_pr101_bench_{1..5}.json`: aggregate steady TPOT avg `28.505793ms`, `28.087102ms`, `29.755957ms`, `27.552965ms`, and `29.371630ms`; all 15 per-iteration hashes were `6346f03343d75a65`. One run wrote the complete JSON report and logged scheduler exit, then segfaulted in NCCL shutdown; treat that as the existing shutdown cleanup issue, not decode TPOT or token-correctness evidence. +- reviewer 5090 5-run stability rerun `$RESULT_ROOT/openinfer_dev_pr101_bench_{1..5}.json`: aggregate steady TPOT avg `28.505793ms`, `28.087102ms`, `29.755957ms`, `27.552965ms`, and `29.371630ms`; all 15 per-iteration hashes were `6346f03343d75a65`. 
One run wrote the complete JSON report and logged scheduler exit, then segfaulted in NCCL shutdown; treat that as the existing shutdown cleanup issue, not decode TPOT or token-correctness evidence. - fused Q/KV RoPE exact E2E log `$RESULT_ROOT/dsv4_qkv_rope_e2e.log`: `All 20 DeepSeek V4 exact cases passed` - fused Q/KV RoPE fixed bench logs `$RESULT_ROOT/dsv4_qkv_rope_bench.log` and `$RESULT_ROOT/dsv4_qkv_rope_bench_repeat.log`: per-iteration steady TPOT avg `28.215ms`, `28.256ms`, `28.236ms`, then `27.096ms`, `28.565ms`, `28.349ms`; all hash `6346f03343d75a65` - fused Q/KV RoPE short profile `$RESULT_ROOT/dsv4_qkv_rope_short.nsys-rep` and `$RESULT_ROOT/dsv4_qkv_rope_short_kernels_cuda_gpu_kern_sum.csv`: `deepseek_apply_rope_q_kv_kernel` appears in the kernel summary; residual hidden-RoPE kernels are from non-projection paths. @@ -1898,7 +1898,7 @@ Local: - rejected ratio-4 top-k concat removal exact E2E log `$RESULT_ROOT/dsv4_topk_no_concat_e2e.log`: `All 20 DeepSeek V4 exact cases passed` - rejected ratio-4 top-k concat removal fixed bench log `$RESULT_ROOT/dsv4_topk_no_concat_bench.log`: aggregate steady TPOT avg `29.541ms`, per-iteration `29.551ms`, `29.539ms`, `29.532ms`; all hash `6346f03343d75a65` - post-revert ratio-4 top-k fixed bench log `$RESULT_ROOT/dsv4_revert_topk_bench.log`: aggregate steady TPOT avg `28.333ms`, per-iteration `28.316ms`, `28.336ms`, `28.346ms`; all hash `6346f03343d75a65` -- old split MoE/SwiGLU cleanup: local `cargo fmt --check`, local `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4`, local `git diff --check`, 5090 `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4`, and 5090 release builds for `bench_serving` and `deepseek_v4_e2e` passed after removing stale public/FFI exports. 
+- old split MoE/SwiGLU cleanup: local `cargo fmt --check`, local `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4`, local `git diff --check`, 5090 `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4`, and 5090 release builds for `bench_serving` and `deepseek_v4_e2e` passed after removing stale public/FFI exports. - old split MoE/SwiGLU cleanup exact E2E log `$RESULT_ROOT/dsv4_fused_cleanup_e2e.log`: `All 20 DeepSeek V4 exact cases passed` - old split MoE/SwiGLU cleanup fixed bench log `$RESULT_ROOT/dsv4_fused_cleanup_bench.log`: aggregate steady TPOT avg `27.860ms`, per-iteration `27.863ms`, `27.845ms`, `27.872ms`; all hash `6346f03343d75a65` - MoE reduce-scatter/shared overlap exact E2E log `$RESULT_ROOT/dsv4_moe_rs_overlap_e2e.log`: `All 20 DeepSeek V4 exact cases passed` @@ -1939,7 +1939,7 @@ Local: - rejected naive grouped FP4 `block_M=16` logs `$RESULT_ROOT/dsv4_w13_block_m16_bench.log` and `$RESULT_ROOT/dsv4_w13_block_m16_large_rows_bench.log`: decode-like small rows sped up, but rows/active `32` failed fuzz because grouped transforms/wrappers still have hard-coded `32`-row assumptions; not retained. - rejected parameterized grouped FP4 `block_M=16` logs `$RESULT_ROOT/dsv4_w13_block_m16_param_fuzz.log`, `$RESULT_ROOT/dsv4_grouped_block_m16_e2e.log`, `$RESULT_ROOT/dsv4_grouped_block_m16_bench.log`, and `$RESULT_ROOT/dsv4_grouped_block_m16_bench_repeat.log`: broad fuzz and exact E2E passed, token hash stayed `6346f03343d75a65`, but fixed bench regressed to aggregate steady TPOT avg `28.971ms` then `29.797ms`; local and 5090 were restored to grouped FP4 `block_M=32`. - post-restore grouped FP4 fixed bench log `$RESULT_ROOT/dsv4_grouped_block_m16_restored_bench.log`: aggregate steady TPOT avg `28.736ms`, per-iteration `28.445ms`, `28.998ms`, `28.763ms`; all hash `6346f03343d75a65`. 
-- completion audit and cleanup: local `git diff --check`, `cargo fmt --check`, and `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` passed after documenting the sub-25 gap and deleting untracked rejected bench sources. The retained tool sources are `score_select_bench.cu`, `swiglu_quant_bench.cu`, and `w13_grouped_fp4_bench.cu`. +- completion audit and cleanup: local `git diff --check`, `cargo fmt --check`, and `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` passed after documenting the sub-25 gap and deleting untracked rejected bench sources. The retained tool sources are `score_select_bench.cu`, `swiglu_quant_bench.cu`, and `w13_grouped_fp4_bench.cu`. - vLLM/SGLang large-batch gap audit: source inspection confirmed the mature FP4 MoE throughput path combines static W13/W2 weight reorder, FP4 scale interleave, packed routed top-k, and problem-size-aware grouped backends. This supports keeping packed MoE layout as a separate bs>100 architecture project rather than mixing it into the current sub-25 latency patch. 
- `gcc -shared -fPIC -O2 -Wall -Wextra -o $RESULT_ROOT/cuda_api_counter.so tools/cuda_api_counter.c -ldl` - `nm -D $RESULT_ROOT/cuda_api_counter.so` confirmed base and `_ptsz` wrappers diff --git a/docs/models/deepseek-v4/http-serving-benchmark.md b/docs/models/deepseek-v4/http-serving-benchmark.md index 2a079667..49b3d609 100644 --- a/docs/models/deepseek-v4/http-serving-benchmark.md +++ b/docs/models/deepseek-v4/http-serving-benchmark.md @@ -34,21 +34,21 @@ requests: Build the server on the target GPU host: ```bash -cd /path/to/pegainfer +cd /path/to/openinfer export PATH=/usr/local/cuda-13.1/bin:$PATH export CUDA_HOME=/usr/local/cuda-13.1 -export PEGAINFER_TILELANG_PYTHON=/path/to/venv/bin/python -export PEGAINFER_TRITON_PYTHON=/path/to/venv/bin/python -export PEGAINFER_NVCC_JOBS=8 -export CARGO_TARGET_DIR=/path/to/pegainfer-target +export OPENINFER_TILELANG_PYTHON=/path/to/venv/bin/python +export OPENINFER_TRITON_PYTHON=/path/to/venv/bin/python +export OPENINFER_NVCC_JOBS=8 +export CARGO_TARGET_DIR=/path/to/openinfer-target -cargo build --release -p pegainfer-server --features deepseek-v4 --bin pegainfer +cargo build --release -p openinfer-server --features deepseek-v4 --bin openinfer ``` Start the OpenAI-compatible HTTP endpoint: ```bash -$CARGO_TARGET_DIR/release/pegainfer \ +$CARGO_TARGET_DIR/release/openinfer \ --model-path $MODEL_DIR \ --port 18118 2>&1 | tee $RESULT_ROOT/dsv4_http_server.log ``` @@ -56,7 +56,7 @@ $CARGO_TARGET_DIR/release/pegainfer \ For prefill phase attribution, start the endpoint with profiling enabled: ```bash -$CARGO_TARGET_DIR/release/pegainfer \ +$CARGO_TARGET_DIR/release/openinfer \ --model-path $MODEL_DIR \ --port 18118 \ --deepseek-prefill-profile 2>&1 | tee $RESULT_ROOT/dsv4_http_server_profile.log @@ -125,8 +125,8 @@ The script is intentionally model-server agnostic at the HTTP layer. It only requires an OpenAI-compatible `/v1/completions` endpoint that supports streaming responses. 
-The server trace columns are pegainfer-specific and require a pegainfer server -log containing `pegainfer_http_trace` lines. The sweep fails when any cell has +The server trace columns are openinfer-specific and require an openinfer server +log containing `openinfer_http_trace` lines. The sweep fails when any cell has request failures/timeouts or per-request output hashes that change across repeats. @@ -357,7 +357,7 @@ top-k as the largest remaining indexer-side bucket: The equivalence gate is a GPU test against the current selector semantics: ```bash -cargo test --release -p pegainfer-kernels \ +cargo test --release -p openinfer-kernels \ --features deepseek-v4 \ --test deepseek_indexer_topk -- --ignored --nocapture ``` diff --git a/docs/models/deepseek-v4/kernel-paths.md b/docs/models/deepseek-v4/kernel-paths.md index e581719c..fbae16d1 100644 --- a/docs/models/deepseek-v4/kernel-paths.md +++ b/docs/models/deepseek-v4/kernel-paths.md @@ -8,22 +8,22 @@ - **Read**: - `docs/index.md` - showed DeepSeek V4 support, kernel boundary, and Qwen3 kernel extraction as the relevant prior work. - `docs/models/deepseek-v4/support.md` - confirmed DeepSeek V4 currently has native MP8 runtime, TileLang build-time kernels, exact E2E coverage, and a documented CUDA split by subsystem. - - `docs/subsystems/kernels/pegainfer-kernels-boundary.md` - confirmed kernels belong in the shared kernels crate, while model DAG/runtime policy stays in the model crate. - - `docs/models/qwen3/kernels-crate.md` - established the existing crate-first split and the role of `pegainfer-kernels/KERNELS.md`. + - `docs/subsystems/kernels/openinfer-kernels-boundary.md` - confirmed kernels belong in the shared kernels crate, while model DAG/runtime policy stays in the model crate. + - `docs/models/qwen3/kernels-crate.md` - established the existing crate-first split and the role of `openinfer-kernels/KERNELS.md`. 
- `docs/conventions/coding-style.md` - reminded that GPU kernels deserve targeted tests, while broad behavior is better covered by integration/E2E. - - `pegainfer-kernels/build.rs` - showed DeepSeek kernels are feature-gated by filename prefix in a flat `csrc/` scan, and TileLang generation was hard-coded to the old flat `tools/tilelang/gen_deepseek_v4_tilelang.py` path. - - `pegainfer-kernels/KERNELS.md` - currently indexes Qwen3 and only mentions DeepSeek as compatibility symbols, so DSV4 has no routing table. - - `pegainfer-kernels/csrc/deepseek_*.cu` and `pegainfer-kernels/csrc/deepseek_common.cuh` - confirmed the CUDA side is already split by subsystem but still lives in the root kernel source directory. - - `pegainfer-deepseek-v4/src/runtime/*` - confirmed runtime calls reach DeepSeek symbols through `pegainfer_kernels::ffi`, so path cleanup should not require runtime API changes. + - `openinfer-kernels/build.rs` - showed DeepSeek kernels are feature-gated by filename prefix in a flat `csrc/` scan, and TileLang generation was hard-coded to the old flat `tools/tilelang/gen_deepseek_v4_tilelang.py` path. + - `openinfer-kernels/KERNELS.md` - currently indexes Qwen3 and only mentions DeepSeek as compatibility symbols, so DSV4 has no routing table. + - `openinfer-kernels/csrc/deepseek_*.cu` and `openinfer-kernels/csrc/deepseek_common.cuh` - confirmed the CUDA side is already split by subsystem but still lives in the root kernel source directory. + - `openinfer-deepseek-v4/src/runtime/*` - confirmed runtime calls reach DeepSeek symbols through `openinfer_kernels::ffi`, so path cleanup should not require runtime API changes. - **Relevant history**: - `docs/models/deepseek-v4/support.md` records that the current DeepSeek CUDA glue is intentionally split by subsystem; this cleanup should preserve that split instead of merging files. 
- - `docs/models/qwen3/kernels-crate.md` moved kernel ownership into `pegainfer-kernels`; the same pattern supports moving model-specific source into a clearer subdirectory without changing model runtime ownership. + - `docs/models/qwen3/kernels-crate.md` moved kernel ownership into `openinfer-kernels`; the same pattern supports moving model-specific source into a clearer subdirectory without changing model runtime ownership. - **Plan**: - 1. First slice: move DeepSeek V4 CUDA sources from `pegainfer-kernels/csrc/deepseek_*.cu` and `deepseek_common.cuh` into `pegainfer-kernels/csrc/deepseek_v4/`, then update `pegainfer-kernels/build.rs` to discover CUDA files recursively and feature-gate DeepSeek by path instead of flat filename prefix. + 1. First slice: move DeepSeek V4 CUDA sources from `openinfer-kernels/csrc/deepseek_*.cu` and `deepseek_common.cuh` into `openinfer-kernels/csrc/deepseek_v4/`, then update `openinfer-kernels/build.rs` to discover CUDA files recursively and feature-gate DeepSeek by path instead of flat filename prefix. 2. Keep object file names stable or explicitly namespace them so `ar` input names remain collision-free when sources live in subdirectories. 3. Update include/rerun handling so `.cu` and `.cuh` changes under nested kernel directories trigger rebuilds. - 4. Run low-cost verification for the first slice: `cargo fmt --all --check`, `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-kernels --features deepseek-v4`, and the non-DeepSeek default check if local CUDA/TileLang availability permits. - 5. Record the result in this doc, then decide the next slice: likely moving the TileLang generator into a DeepSeek-specific tools path and adding a DSV4 section to `pegainfer-kernels/KERNELS.md`. + 4. 
Run low-cost verification for the first slice: `cargo fmt --all --check`, `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-kernels --features deepseek-v4`, and the non-DeepSeek default check if local CUDA/TileLang availability permits. + 5. Record the result in this doc, then decide the next slice: likely moving the TileLang generator into a DeepSeek-specific tools path and adding a DSV4 section to `openinfer-kernels/KERNELS.md`. - **Risks / open questions**: - Recursive source discovery can accidentally compile generated or third-party CUDA if scoped too broadly. It should only recurse under owned `csrc/`. - DeepSeek TileLang requires a working TileLang Python; local verification may stop at environment setup rather than code correctness. @@ -32,48 +32,48 @@ ## Execution Log ### Step 1: Move DeepSeek V4 CUDA sources under a model-specific directory -- Moved DeepSeek V4 CUDA sources from `pegainfer-kernels/csrc/deepseek_*.cu` and `pegainfer-kernels/csrc/deepseek_common.cuh` into `pegainfer-kernels/csrc/deepseek_v4/`. -- Updated `pegainfer-kernels/build.rs` to collect owned `csrc/` files recursively, emit rebuild triggers for nested `.cu`/`.cuh` files, and generate object names from the relative source path so nested CUDA files do not collide with flat ones. -- Replaced the build-script feature probe with `cfg!(feature = "deepseek-v4")`. Cargo feature resolution was checked with `cargo tree -p pegainfer-server --features deepseek-v4 -i pegainfer-kernels -e features`, which confirmed `pegainfer-server/deepseek-v4` enables `pegainfer-deepseek-v4/deepseek-v4` and then `pegainfer-kernels/deepseek-v4`. -- Updated `pegainfer-kernels/KERNELS.md` and `docs/models/deepseek-v4/support.md` to point DeepSeek CUDA references at `csrc/deepseek_v4/`. +- Moved DeepSeek V4 CUDA sources from `openinfer-kernels/csrc/deepseek_*.cu` and `openinfer-kernels/csrc/deepseek_common.cuh` into `openinfer-kernels/csrc/deepseek_v4/`. 
+- Updated `openinfer-kernels/build.rs` to collect owned `csrc/` files recursively, emit rebuild triggers for nested `.cu`/`.cuh` files, and generate object names from the relative source path so nested CUDA files do not collide with flat ones. +- Replaced the build-script feature probe with `cfg!(feature = "deepseek-v4")`. Cargo feature resolution was checked with `cargo tree -p openinfer-server --features deepseek-v4 -i openinfer-kernels -e features`, which confirmed `openinfer-server/deepseek-v4` enables `openinfer-deepseek-v4/deepseek-v4` and then `openinfer-kernels/deepseek-v4`. +- Updated `openinfer-kernels/KERNELS.md` and `docs/models/deepseek-v4/support.md` to point DeepSeek CUDA references at `csrc/deepseek_v4/`. Result: path move and build-script feature gating are in place. Verification: - `cargo fmt --all --check` passed. -- `cargo tree -p pegainfer-server --features deepseek-v4 -i pegainfer-kernels -e features` confirmed feature forwarding from server to DeepSeek V4 model crate to kernels. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-kernels` passed. The build log confirmed DeepSeek V4 CUDA/TileLang kernels are disabled without the feature. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-kernels --features deepseek-v4` passed. The build log confirmed DeepSeek V4 TileLang CUDA generation under `target/.../tilelang/deepseek_v4/`. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-server --features deepseek-v4` passed, covering feature forwarding through the server, model crate, and kernels crate together. +- `cargo tree -p openinfer-server --features deepseek-v4 -i openinfer-kernels -e features` confirmed feature forwarding from server to DeepSeek V4 model crate to kernels. +- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-kernels` passed. The build log confirmed DeepSeek V4 CUDA/TileLang kernels are disabled without the feature. 
+- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-kernels --features deepseek-v4` passed. The build log confirmed DeepSeek V4 TileLang CUDA generation under `target/.../tilelang/deepseek_v4/`. +- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-server --features deepseek-v4` passed, covering feature forwarding through the server, model crate, and kernels crate together. ### Step 2: Move the DeepSeek V4 TileLang generator under the shared TileLang backend directory -- Moved `pegainfer-kernels/tools/tilelang/gen_deepseek_v4_tilelang.py` to `pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py`. -- Updated `pegainfer-kernels/build.rs` to run the generator from the new path. +- Moved `openinfer-kernels/tools/tilelang/gen_deepseek_v4_tilelang.py` to `openinfer-kernels/tools/tilelang/deepseek_v4/generate.py`. +- Updated `openinfer-kernels/build.rs` to run the generator from the new path. - Updated the generated CUDA banner comment to point at the new generator path. -- Added `pegainfer-kernels/tools/tilelang/README.md` to define `tools/tilelang/` as the shared TileLang backend directory, with model- or shape-family-specific generators in subdirectories. +- Added `openinfer-kernels/tools/tilelang/README.md` to define `tools/tilelang/` as the shared TileLang backend directory, with model- or shape-family-specific generators in subdirectories. - Updated `docs/models/deepseek-v4/support.md` to point at the new generator path. Verification: - `cargo fmt --all --check` passed. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-kernels --features deepseek-v4` passed. The build log showed DeepSeek V4 TileLang CUDA generation still succeeds after the path move. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-server --features deepseek-v4` passed after the generator move. +- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-kernels --features deepseek-v4` passed. 
The build log showed DeepSeek V4 TileLang CUDA generation still succeeds after the path move. +- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-server --features deepseek-v4` passed after the generator move. ### Step 3: Add the DeepSeek V4 kernel routing index -- Added a `DeepSeek V4 MP8 Path` section to `pegainfer-kernels/KERNELS.md`. -- Mapped runtime owners under `pegainfer-deepseek-v4/src/runtime/` to the public `pegainfer_kernels::ffi` symbols and their CUDA/TileLang source owners. +- Added a `DeepSeek V4 MP8 Path` section to `openinfer-kernels/KERNELS.md`. +- Mapped runtime owners under `openinfer-deepseek-v4/src/runtime/` to the public `openinfer_kernels::ffi` symbols and their CUDA/TileLang source owners. - Grouped rows by execution subsystem rather than every individual shape: quant, attention, collectives cast helpers, indexer, compressor, HC, logits, and MoE. - Kept TileLang shape details in source notes so the table remains a routing aid rather than a duplicate ABI declaration. Verification: -- `rg` over `pegainfer-kernels/src/ffi.rs`, `pegainfer-kernels/csrc/deepseek_v4/`, and `pegainfer-deepseek-v4/src/runtime/` was used to build the mapping. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-kernels` passed without `deepseek-v4`; the build log confirmed DeepSeek V4 CUDA/TileLang kernels are disabled on the default path. -- `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-server` passed without `deepseek-v4`; the build log again confirmed the default server path skips DeepSeek V4 CUDA/TileLang. +- `rg` over `openinfer-kernels/src/ffi.rs`, `openinfer-kernels/csrc/deepseek_v4/`, and `openinfer-deepseek-v4/src/runtime/` was used to build the mapping. +- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-kernels` passed without `deepseek-v4`; the build log confirmed DeepSeek V4 CUDA/TileLang kernels are disabled on the default path. 
+- `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-server` passed without `deepseek-v4`; the build log again confirmed the default server path skips DeepSeek V4 CUDA/TileLang. ## Debrief -- **Outcome**: DeepSeek V4 owned CUDA sources now live under `pegainfer-kernels/csrc/deepseek_v4/`. The DeepSeek V4 TileLang generator now lives under the shared TileLang backend directory at `pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py`. The kernels build script recursively scans owned CUDA sources, skips DSV4 by path when `deepseek-v4` is disabled, uses `cfg!(feature = "deepseek-v4")` for the feature decision, namespaces object names by relative source path, and runs the TileLang generator from its new path. `pegainfer-kernels/KERNELS.md` now includes a DeepSeek V4 MP8 routing table from runtime owners to FFI symbols and CUDA/TileLang source paths. +- **Outcome**: DeepSeek V4 owned CUDA sources now live under `openinfer-kernels/csrc/deepseek_v4/`. The DeepSeek V4 TileLang generator now lives under the shared TileLang backend directory at `openinfer-kernels/tools/tilelang/deepseek_v4/generate.py`. The kernels build script recursively scans owned CUDA sources, skips DSV4 by path when `deepseek-v4` is disabled, uses `cfg!(feature = "deepseek-v4")` for the feature decision, namespaces object names by relative source path, and runs the TileLang generator from its new path. `openinfer-kernels/KERNELS.md` now includes a DeepSeek V4 MP8 routing table from runtime owners to FFI symbols and CUDA/TileLang source paths. - **Pitfalls encountered**: - - The initial feature probe used `CARGO_FEATURE_DEEPSEEK_V4`; Cargo already forwards the feature into `pegainfer-kernels`, so `cfg!(feature = "deepseek-v4")` is clearer in `build.rs`. + - The initial feature probe used `CARGO_FEATURE_DEEPSEEK_V4`; Cargo already forwards the feature into `openinfer-kernels`, so `cfg!(feature = "deepseek-v4")` is clearer in `build.rs`. 
- `cargo tree -e features` needs the reverse dependency form to show the exact feature forwarding path clearly. - **Lessons learned**: - Moving model-owned kernel source into a subdirectory is low-risk once build discovery is path-based rather than filename-prefix based. diff --git a/docs/models/deepseek-v4/moe-ag-rs.md b/docs/models/deepseek-v4/moe-ag-rs.md index d2ef63b7..9b08a30c 100644 --- a/docs/models/deepseek-v4/moe-ag-rs.md +++ b/docs/models/deepseek-v4/moe-ag-rs.md @@ -20,17 +20,17 @@ Decode MoE now uses GPU-resident allgather/router/local-expert/reduce-scatter fl 1. Keep production prefill group helpers, because `prefill_logits_and_decode_cache_group_bf16_hidden` is still called by the direct runtime. 2. Remove decode-only group entry points that support single-thread multi-rank decode: `block_decode_group_bf16_hidden`, `block_decode_group_rank_threads_bf16_hidden`, and their now-unused attention/MoE group helpers. 3. Remove public re-exports and mp8 manifest tests that only exercise the deleted decode group path. - 4. Run `cargo fmt --check` and `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4`. + 4. Run `cargo fmt --check` and `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4`. - **Risks / open questions**: - Some tests currently use group decode as a small two-rank smoke path; deleting them narrows coverage to production rank-lane decode plus prefill group tests. ### Eliminate central single-thread direct runtime paths - **Read**: - - `pegainfer-deepseek-v4/src/direct.rs` - confirmed decode already uses persistent rank workers, but cache ownership is still moved through central runtime every token and prefill still uses central group helpers. - - `pegainfer-deepseek-v4/src/runtime/block.rs` - identified rank-local decode structure and central prefill group cache seeding path. 
- - `pegainfer-deepseek-v4/src/runtime/attention.rs` - identified ratio-4 prefill indexer all-reduce as the non-trivial collective that needs a rank-lane version. - - `pegainfer-deepseek-v4/src/runtime/moe.rs` - identified prefill MoE routed all-reduce as another group-only collective to move into rank lanes. + - `openinfer-deepseek-v4/src/direct.rs` - confirmed decode already uses persistent rank workers, but cache ownership is still moved through central runtime every token and prefill still uses central group helpers. + - `openinfer-deepseek-v4/src/runtime/block.rs` - identified rank-local decode structure and central prefill group cache seeding path. + - `openinfer-deepseek-v4/src/runtime/attention.rs` - identified ratio-4 prefill indexer all-reduce as the non-trivial collective that needs a rank-lane version. + - `openinfer-deepseek-v4/src/runtime/moe.rs` - identified prefill MoE routed all-reduce as another group-only collective to move into rank lanes. - **Relevant history**: - Step 6 removed decode group entry points, but prefill/cache ownership still left a central multi-rank path that future changes could accidentally follow. - **Plan**: @@ -49,9 +49,9 @@ Decode MoE now uses GPU-resident allgather/router/local-expert/reduce-scatter fl - **Relevant history**: - `docs/models/deepseek-v4/moe-tilelang-review.md` records that replacing local expert execution is a larger cutover; this task intentionally only adds the regular collective exchange primitives. - **Plan**: - 1. Add `all_gather_bf16_hidden_group` and `reduce_scatter_f32_hidden_group` in `pegainfer-deepseek-v4/src/runtime/collectives.rs` with explicit shape checks. - 2. Export the new collectives from `pegainfer-deepseek-v4/src/lib.rs`. - 3. Add a focused NCCL pair test in `pegainfer-deepseek-v4/tests/mp8_manifest.rs` that validates `[B_local,H] -> [world*B_local,H]` allgather and `[world*B_local,H] -> [B_local,H]` f32 reduce-scatter. + 1. 
Add `all_gather_bf16_hidden_group` and `reduce_scatter_f32_hidden_group` in `openinfer-deepseek-v4/src/runtime/collectives.rs` with explicit shape checks. + 2. Export the new collectives from `openinfer-deepseek-v4/src/lib.rs`. + 3. Add a focused NCCL pair test in `openinfer-deepseek-v4/tests/mp8_manifest.rs` that validates `[B_local,H] -> [world*B_local,H]` allgather and `[world*B_local,H] -> [B_local,H]` f32 reduce-scatter. 4. Run the targeted test or compile check with release settings. - **Risks / open questions**: - The pair test requires two GPUs and a loadable NCCL runtime; it should skip cleanly when NCCL is unavailable, matching the existing all-reduce test. @@ -59,30 +59,30 @@ Decode MoE now uses GPU-resident allgather/router/local-expert/reduce-scatter fl ## Execution Log ### Step 6: Remove legacy decode group path -- Removed decode-only single-thread/multi-rank entry points from `pegainfer-deepseek-v4/src/runtime/block.rs`: +- Removed decode-only single-thread/multi-rank entry points from `openinfer-deepseek-v4/src/runtime/block.rs`: - `block_decode_group_bf16_hidden` - `block_decode_group_rank_threads_bf16_hidden` - Removed decode group helpers that only existed for those entry points: - - attention decode group wrappers in `pegainfer-deepseek-v4/src/runtime/attention.rs` - - `decode_moe_ag_rs_group_bf16_hidden` in `pegainfer-deepseek-v4/src/runtime/moe.rs` + - attention decode group wrappers in `openinfer-deepseek-v4/src/runtime/attention.rs` + - `decode_moe_ag_rs_group_bf16_hidden` in `openinfer-deepseek-v4/src/runtime/moe.rs` - AG/RS group collectives `all_gather_bf16_hidden_group`, `all_gather_u32_group`, and `reduce_scatter_f32_hidden_group` - Removed public re-exports and mp8 manifest tests that referenced the deleted decode group path. - Kept production prefill group helpers, because direct prefill still calls `prefill_logits_and_decode_cache_group_bf16_hidden`. 
- Verification: - `cargo fmt` - `cargo fmt --check` passed - - `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` passed - - `rg -n "block_decode_group|group_rank_threads|attention_decode_group|decode_moe_ag_rs_group|all_gather_bf16_hidden_group|reduce_scatter_f32_hidden_group|all_gather_u32_group" pegainfer-deepseek-v4/src pegainfer-deepseek-v4/tests` returned no matches + - `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` passed + - `rg -n "block_decode_group|group_rank_threads|attention_decode_group|decode_moe_ag_rs_group|all_gather_bf16_hidden_group|reduce_scatter_f32_hidden_group|all_gather_u32_group" openinfer-deepseek-v4/src openinfer-deepseek-v4/tests` returned no matches ### Step 7: Remote exact E2E after cleanup -- Synced the cleanup files back to `5090:$PEGAINFER_DIR`. +- Synced the cleanup files back to `5090:$OPENINFER_DIR`. - Verified model path on 5090: `$MODEL_DIR`. - Ran on 5090: ```bash source ~/.cargo/env 2>/dev/null || true -cd $PEGAINFER_DIR -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- --model-path $MODEL_DIR +cd $OPENINFER_DIR +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- --model-path $MODEL_DIR ``` - Result: @@ -105,23 +105,23 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features de - mp8 tests that exercised the deleted group path - Verification: - `cargo fmt` passed locally - - `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` passed locally and on 5090 - - `cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest --no-run` passed locally - - `rg -n 
"group_start|group_end|all_reduce_hidden_group|all_gather_logits_group|embedding_vocab_parallel_group|final_logits_group_bf16_hidden|hash_routed_moe_group_bf16_hidden|moe_group_bf16_hidden|attention_prefill_.*group|block_prefill_group|prefill_logits_group|prefill_logits_and_decode_cache_group|deepseek_mp8_check|contexts: Vec" pegainfer-deepseek-v4/src pegainfer-deepseek-v4/tests pegainfer-deepseek-v4/Cargo.toml` returned no matches locally + - `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` passed locally and on 5090 + - `cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest --no-run` passed locally + - `rg -n "group_start|group_end|all_reduce_hidden_group|all_gather_logits_group|embedding_vocab_parallel_group|final_logits_group_bf16_hidden|hash_routed_moe_group_bf16_hidden|moe_group_bf16_hidden|attention_prefill_.*group|block_prefill_group|prefill_logits_group|prefill_logits_and_decode_cache_group|deepseek_mp8_check|contexts: Vec" openinfer-deepseek-v4/src openinfer-deepseek-v4/tests openinfer-deepseek-v4/Cargo.toml` returned no matches locally - 5090 exact E2E with `$MODEL_DIR` passed: `All 20 DeepSeek V4 exact cases passed` ### Step 9: Split direct scheduler and worker files -- Split the former monolithic `pegainfer-deepseek-v4/src/direct.rs` into: - - `pegainfer-deepseek-v4/src/direct.rs` as a thin module/re-export facade. - - `pegainfer-deepseek-v4/src/direct/scheduler.rs` for request validation, the single-request greedy scheduler loop, token event emission, and sampling. - - `pegainfer-deepseek-v4/src/direct/worker.rs` for rank worker commands, rank resource ownership, cache/RoPE management, per-rank prefill/decode execution, and rank-0 logits collection. +- Split the former monolithic `openinfer-deepseek-v4/src/direct.rs` into: + - `openinfer-deepseek-v4/src/direct.rs` as a thin module/re-export facade. 
+ - `openinfer-deepseek-v4/src/direct/scheduler.rs` for request validation, the single-request greedy scheduler loop, token event emission, and sampling. + - `openinfer-deepseek-v4/src/direct/worker.rs` for rank worker commands, rank resource ownership, cache/RoPE management, per-rank prefill/decode execution, and rank-0 logits collection. - Renamed the request/scheduler thread from `deepseek-v4-direct` to `deepseek-v4-scheduler`. Rank worker thread names remain `deepseek-v4-rank-{rank}`. - Kept behavior unchanged; this is only a responsibility-boundary cleanup. - Follow-up naming debt: - The module and public type names still use `direct`; that name is legacy and should eventually become `engine` or `executor` in a dedicated rename pass. - Verification: - `cargo fmt` passed - - `cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4` passed + - `cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4` passed ### Step 3: Expand scope to decode backend replacement - User goal changed from adding standalone AG/RS collectives to completing the MoE all-to-all backend replacement and passing DeepSeek V4 E2E. @@ -146,7 +146,7 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features de - Exact E2E command run: ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- --model-path $MODEL_DIR +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- --model-path $MODEL_DIR ``` - Result: `19 / 20` exact cases passed. 
@@ -155,7 +155,7 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features de - Performance sanity command run: ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- --model-path $MODEL_DIR --format json request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- --model-path $MODEL_DIR --format json request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 ``` - Result: @@ -180,9 +180,9 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_servin - Validation commands: ```bash -PEGAINFER_NVCC_JOBS=8 cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4 -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- --model-path $MODEL_DIR -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- --model-path $MODEL_DIR --format json request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 +OPENINFER_NVCC_JOBS=8 cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4 +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- --model-path $MODEL_DIR +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- --model-path $MODEL_DIR --format json request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 ``` - Results: @@ -192,22 +192,22 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_servin - Earlier row-routed scalar path measured around `223.30ms`, so grouped TileLang cuts the local expert bottleneck roughly in half. ### Step 1: Add AG/RS collectives -- Added `all_gather_bf16_hidden_group` in `pegainfer-deepseek-v4/src/runtime/collectives.rs`. 
+- Added `all_gather_bf16_hidden_group` in `openinfer-deepseek-v4/src/runtime/collectives.rs`. - Contract: every rank contributes `bf16 [B_local,H]`. - Output on each rank: `bf16 [world*B_local,H]`. - Uses NCCL `Comm::all_gather` on device buffers; no runtime D2H metadata. -- Added `reduce_scatter_f32_hidden_group` in `pegainfer-deepseek-v4/src/runtime/collectives.rs`. +- Added `reduce_scatter_f32_hidden_group` in `openinfer-deepseek-v4/src/runtime/collectives.rs`. - Contract: every rank contributes `f32 [world*B_local,H]`. - Output on each rank: `f32 [B_local,H]`. - Uses NCCL `Comm::reduce_scatter(..., ReduceOp::Sum)` on device buffers. -- Exported both helpers from `pegainfer-deepseek-v4/src/lib.rs`. +- Exported both helpers from `openinfer-deepseek-v4/src/lib.rs`. ### Step 2: Validate -- Added `nccl_hidden_all_gather_and_reduce_scatter_pair` in `pegainfer-deepseek-v4/tests/mp8_manifest.rs`. +- Added `nccl_hidden_all_gather_and_reduce_scatter_pair` in `openinfer-deepseek-v4/tests/mp8_manifest.rs`. - Ran: ```bash -PEGAINFER_NVCC_JOBS=8 cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest nccl_hidden_all_gather_and_reduce_scatter_pair -- --nocapture +OPENINFER_NVCC_JOBS=8 cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest nccl_hidden_all_gather_and_reduce_scatter_pair -- --nocapture ``` - Result: passed, `1 passed; 0 failed; 23 filtered out`. diff --git a/docs/models/deepseek-v4/moe-tilelang-review.md b/docs/models/deepseek-v4/moe-tilelang-review.md index eb324596..a86c5b22 100644 --- a/docs/models/deepseek-v4/moe-tilelang-review.md +++ b/docs/models/deepseek-v4/moe-tilelang-review.md @@ -20,13 +20,13 @@ - **Read**: - `docs/index.md` - identified DeepSeek V4 support, DeepSeek kernel paths, and kernel technology reference as the relevant routing docs. 
- `docs/models/deepseek-v4/support.md` - confirmed the current DeepSeek V4 path has native MP8 runtime, TileLang build-time kernels, and a handwritten CUDA MoE path; it also notes MoE route-index D2H synchronization as a higher-risk remaining target. - - `docs/models/deepseek-v4/kernel-paths.md` - confirmed DeepSeek CUDA sources now live under `pegainfer-kernels/csrc/deepseek_v4/` and TileLang generators live under `pegainfer-kernels/tools/tilelang/deepseek_v4/`. + - `docs/models/deepseek-v4/kernel-paths.md` - confirmed DeepSeek CUDA sources now live under `openinfer-kernels/csrc/deepseek_v4/` and TileLang generators live under `openinfer-kernels/tools/tilelang/deepseek_v4/`. - **Relevant history**: - `docs/models/deepseek-v4/support.md` records that the current local TileLang generator emits quantized linear, sparse attention, and HC kernels, while `deepseek_moe.cu` owns routing, SwiGLU, and expert accumulation. - `docs/models/deepseek-v4/kernel-paths.md` records that the DeepSeek kernel routing table was recently organized, so this review should compare against those paths instead of rediscovering ownership from scratch. - **Plan**: 1. Inspect the official DeepSeek TileKernels `tile_kernels/moe` directory from `https://github.com/deepseek-ai/TileKernels/tree/main/tile_kernels/moe`, including file names, exported kernels, and expected tensor layouts. - 2. Inspect local MoE code paths: `pegainfer-kernels/csrc/deepseek_v4/deepseek_moe.cu`, related FFI declarations, and `pegainfer-deepseek-v4/src/runtime/` callers. + 2. Inspect local MoE code paths: `openinfer-kernels/csrc/deepseek_v4/deepseek_moe.cu`, related FFI declarations, and `openinfer-deepseek-v4/src/runtime/` callers. 3. Compare official kernels against local behavior along routing layout, expert grouping, quantization format, activation, accumulation dtype, and dispatch/combine boundaries. 4. 
Summarize what official TileLang operators exist, what they appear to solve, and which local MoE issue they most likely explain or do not explain. 5. If the gap is clear and small, propose the first implementation slice; otherwise stop with a focused diagnostic checklist. @@ -68,13 +68,13 @@ Result: official MoE TileLang is primarily a routing, mapping, packing, and reduction toolkit. It does not appear to provide a single fused FP4 expert MLP kernel in `tile_kernels/moe/`; the fused expert GEMM path is implied by the expert-major layout and the quantized helpers used around it. ### Step 2: Compare local MoE implementation -- Local score routing in `pegainfer-kernels/csrc/deepseek_v4/deepseek_moe.cu` broadly matches the model config's simple scoring semantics: +- Local score routing in `openinfer-kernels/csrc/deepseek_v4/deepseek_moe.cu` broadly matches the model config's simple scoring semantics: - BF16 gate scores are converted to F32 and multiplied through cuBLAS; - selection score is `sqrt(softplus(dot)) + gate_bias`; - route weight is the original `sqrt(softplus(dot))`; - selected weights are normalized and multiplied by `routed_scaling_factor`. - Local execution differs substantially from the official fused-layout path: - - `pegainfer-deepseek-v4/src/runtime/moe.rs` copies `routed.indices` from device to host and synchronizes in both `routed_local_experts_forward_bf16_hidden` and `routed_local_experts_forward_f32_hidden`. + - `openinfer-deepseek-v4/src/runtime/moe.rs` copies `routed.indices` from device to host and synchronizes in both `routed_local_experts_forward_bf16_hidden` and `routed_local_experts_forward_f32_hidden`. - The CPU then builds `active_local` and loops over local experts. - For each active local expert, `local_expert_forward_*` runs W1, W3, SwiGLU, and W2 over the full input batch, then masks/weights the result by route index. 
- Official TileKernels instead keeps routing metadata on GPU, creates expert-major packed token ranges, expands inputs once, executes expert work over packed ranges, then reduces back with `token_topk_to_pos`. @@ -129,7 +129,7 @@ Result: the most likely MoE problem is not the `sqrtsoftplus` math itself for no - Non-nsys synthetic decode-heavy command: ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- \ --model-path $MODEL_DIR --format json \ request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 ``` @@ -221,13 +221,13 @@ nsys profile --stats=false --force-overwrite=true \ - Validation: ```bash -cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4 +cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4 ``` - Passed with the existing unreachable-pub warnings in `runtime/core.rs` and `runtime/state.rs`. 
```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- \ --model-path $MODEL_DIR --format json \ request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 ``` @@ -241,7 +241,7 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_servin - e2e: `3.69s` ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ --model-path $MODEL_DIR \ --ground-truth test_data/deepseek-v4-ground-truth.json \ --offset 0 --limit 1 --max-new-tokens 64 @@ -251,7 +251,7 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features de - Full exact validation required before commit: ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ --model-path $MODEL_DIR \ --ground-truth test_data/deepseek-v4-ground-truth.json \ --max-new-tokens 64 @@ -300,13 +300,13 @@ nsys profile --stats=false --force-overwrite=true \ - Validation: ```bash -cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4 +cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4 ``` - Passed with the existing unreachable-pub warnings in `runtime/core.rs` and `runtime/state.rs`. 
```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- \ --model-path $MODEL_DIR --format json \ request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 ``` @@ -320,7 +320,7 @@ PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_servin - e2e: `3.11s` ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ --model-path $MODEL_DIR \ --ground-truth test_data/deepseek-v4-ground-truth.json \ --max-new-tokens 64 @@ -406,13 +406,13 @@ nsys profile --stats=false --force-overwrite=true \ - Interpretation: MoE EP8 route imbalance creates some phase skew, but the 100ms-scale TPOT comes from many small phase skews being paid at every barrier. Attention and indexer collectives also pay large skew despite nearly equal active GPU work, which points at CPU runtime, allocation/free, launch gaps, and host-controlled loops as the amplification mechanism. - Next experiments should measure or reduce runtime churn and host-controlled MoE decode scheduling before assuming expert compute or raw NCCL bandwidth is the limiting factor. - Temporary NVTX proof trace: - - Added a temporary profiling-only NVTX loader using runtime `dlopen`/`dlsym` for `nvtxRangePushA`, `nvtxRangePop`, and `nvtxMarkA`, gated by `PEGAINFER_DSV4_NVTX=1`. The instrumentation marked rank worker decode stages (`attn_local`, `indexer_ar`, `attention_ar`, `moe_route`, `moe_plan`, `moe_experts`, `moe_reduce`, `shared_expert`, `moe_ar`) plus active local expert counts and per-local-expert ranges. The temporary code was removed after the trace, so it is not part of the hot path. 
+ - Added a temporary profiling-only NVTX loader using runtime `dlopen`/`dlsym` for `nvtxRangePushA`, `nvtxRangePop`, and `nvtxMarkA`, gated by `OPENINFER_DSV4_NVTX=1`. The instrumentation marked rank worker decode stages (`attn_local`, `indexer_ar`, `attention_ar`, `moe_route`, `moe_plan`, `moe_experts`, `moe_reduce`, `shared_expert`, `moe_ar`) plus active local expert counts and per-local-expert ranges. The temporary code was removed after the trace, so it is not part of the hot path. - Build and trace commands: ```bash -PEGAINFER_NVCC_JOBS=8 cargo build --release -p pegainfer-server --bin bench_serving --features deepseek-v4 +OPENINFER_NVCC_JOBS=8 cargo build --release -p openinfer-server --bin bench_serving --features deepseek-v4 -PEGAINFER_DSV4_NVTX=1 nsys profile --stats=false --force-overwrite=true \ +OPENINFER_DSV4_NVTX=1 nsys profile --stats=false --force-overwrite=true \ --trace=cuda,nvtx,osrt --cuda-graph-trace=node \ --delay=34 --duration=12 \ -o target/profiling/dsv4_rank_stage_proof \ @@ -460,13 +460,13 @@ nsys export --type sqlite --force-overwrite=true \ - Validation: ```bash -cargo check --release -p pegainfer-deepseek-v4 --features deepseek-v4 +cargo check --release -p openinfer-deepseek-v4 --features deepseek-v4 -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- \ --model-path $MODEL_DIR --format json \ request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-deepseek-v4 --features deepseek-v4 --bin deepseek_v4_e2e -- \ --model-path $MODEL_DIR \ --ground-truth test_data/deepseek-v4-ground-truth.json \ --max-new-tokens 64 diff --git a/docs/models/deepseek-v4/online-throughput.md 
b/docs/models/deepseek-v4/online-throughput.md index 9e7f712a..0ab3b8fa 100644 --- a/docs/models/deepseek-v4/online-throughput.md +++ b/docs/models/deepseek-v4/online-throughput.md @@ -136,12 +136,12 @@ Nsight Systems 10k direct, sorted by CUDA kernel total time: | Decode MoE | `dispatch_decode_moe_step` accepts `input.seq_len`; local routing and grouped GEMM operate over seq length. | not first candidate until active-set path proves MoE is a top online bucket. | | Prefill request batching | prefill starts one request into one KV slot; no multi-request prefill active set. | input throughput is mostly long-seq single-request kernel efficiency today; true batch prefill needs a larger scheduler/runtime shape change. | | Prefill attention/compressor | prefill kernels are sequence-parallel for one request; no native multi-request DSV4 prefill stack. | Pacer prefill replacements should target high-share single-request buckets first, especially non-overlap compressor, while preserving the chosen quality policy. | -| CUDA Graph | `pegainfer-server` passes `enable_cuda_graph=false` for DeepSeek V4; direct engine warns that DSV4 does not use CUDA Graph yet. | graph work starts after active-set shapes stabilize; blockers are dynamic seq/compressed lengths, collectives, stream/event ownership, allocator/scratch lifetimes, and batch capacity. | +| CUDA Graph | `openinfer-server` passes `enable_cuda_graph=false` for DeepSeek V4; direct engine warns that DSV4 does not use CUDA Graph yet. | graph work starts after active-set shapes stabilize; blockers are dynamic seq/compressed lengths, collectives, stream/event ownership, allocator/scratch lifetimes, and batch capacity. | ## Next Work Selection | Task | Owner | Entry | | --- | --- | --- | -| task #45 HTTP active-set batching + CUDA Graph serving gate | @PegaInfer-Dev | Make serving trace show active set size > 1 under c2/c4/c8, then measure output tok/s/TPOT against this baseline. 
| +| task #45 HTTP active-set batching + CUDA Graph serving gate | @OpenInfer-Dev | Make serving trace show active set size > 1 under c2/c4/c8, then measure output tok/s/TPOT against this baseline. | | task #46 decode operator replacement | @Pacer | Prefer decode compressor `_batch_` path from task #44 coverage; fallback is decode indexer top-k batch if compressor exactness blocks. | | task #46 prefill operator replacement | @Pacer | Prefer non-overlap compressor only when local microbench/correctness and precision review show meaningful input-throughput gain; skip low-yield patches. | diff --git a/docs/models/deepseek-v4/pplx-ep-integration.md b/docs/models/deepseek-v4/pplx-ep-integration.md index 13b66e2f..955ad10f 100644 --- a/docs/models/deepseek-v4/pplx-ep-integration.md +++ b/docs/models/deepseek-v4/pplx-ep-integration.md @@ -14,15 +14,15 @@ - **PPLX worker/protocol 证据**:worker-wait NVTX profile 把每层 `p2p_all_to_all` p50 拆到 **1.609 ms**,乘 43 层解释非 rank0 的 74ms 级;其中 `worker_wait_combine_recv_done` p50 **1.111 ms/layer**,`dispatch` p50 只有 **0.010 ms/layer**。per-token source sync、worker-derived active-source mask、early `tx_ready`、route processing overlap 等局部实验均失败或 wait 迁移。 - **direct routed 只作机制证据**:single-node peer-memory direct routed path 绕过 legacy PPLX 四阶段,H200 `output_len=64` p50 从 **144.00 ms** 降到 **83.94 ms**,rows512 后到 **78.68 / 77.33 ms**;clean profile 为 PPLX **79.08 ms** vs NCCL **63.17 ms**。该路径是绕过 upstream 四阶段语义的 hack,当前代码已回到 legacy four-stage,并清理 `a2a_direct_*` API/kernel、direct worker mode、debug counters 和高侵入 RDMA/fabric probes。 - **当前关键修复**:2026-05-17 `/proc` 采样坐实 CPU0 fabric worker 抢占:旧 CPU0 `tx_engine_domain` 在 **7.0s** decode 窗口只拿到 **3.60s** CPU 且 **2980** 次 nonvoluntary switch。把 rank0 TE worker 从 CPU0 挪到同 topology group 的 CPU10 后,两次 H200 `output_len=64` 复测降到 steady p50 **66.46 / 66.70 ms**、p95 **69.80 / 69.62 ms**,接近 NCCL **63 ms** 级。 -- **当前代码状态**:legacy 四阶段 kernel 已恢复 cooperative multi-block launch;保留 done-flag 最后发布 correctness 修复。dsv4 侧临时 NVTX 
ranges 已清理,pplx-garden 自带 NVTX 保留;CPU placement 迁到 `pegainfer_core::cpu_topology`:读取 CUDA device NUMA、当前 affinity mask、NUMA cpulist,把同一 NUMA 下的 rank 先均分连续 CPU slice,再从 slice 内分配 rank/a2a/TE/UVM。CPU0 保留不用,CPU1 给 scheduler;启动时每个 rank 打一行 `cpu_slice/rank_worker/TE/a2a/UVM`。direct routed hack、临时 profiler API capture 和高侵入诊断均已清理。 -- **验证状态**:本地 `cargo test --release -p pegainfer-core cpu_topology -- --nocapture`、`cargo fmt --check -p pegainfer-core -p pegainfer-comm-fabric-lib -p pegainfer-deepseek-v4`、`PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep-bench --bin deepseek_pplx_a2a_bench`、`PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` 均通过。H200 release build 通过;`output_len=2` smoke status 0;`output_len=64` 生成 64/64 token 后 teardown status 139,metrics 已打印:steady p50 **66.65 ms**、p95 **68.15 ms**、max **69.47 ms**。PPLX exact ground truth 19/20;case 13 输出 `2500` 而非 `2500 meters`,NCCL 同 case 同样失败,因此不归因到 PPLX placement/通信改动。 +- **当前代码状态**:legacy 四阶段 kernel 已恢复 cooperative multi-block launch;保留 done-flag 最后发布 correctness 修复。dsv4 侧临时 NVTX ranges 已清理,pplx-garden 自带 NVTX 保留;CPU placement 迁到 `openinfer_core::cpu_topology`:读取 CUDA device NUMA、当前 affinity mask、NUMA cpulist,把同一 NUMA 下的 rank 先均分连续 CPU slice,再从 slice 内分配 rank/a2a/TE/UVM。CPU0 保留不用,CPU1 给 scheduler;启动时每个 rank 打一行 `cpu_slice/rank_worker/TE/a2a/UVM`。direct routed hack、临时 profiler API capture 和高侵入诊断均已清理。 +- **验证状态**:本地 `cargo test --release -p openinfer-core cpu_topology -- --nocapture`、`cargo fmt --check -p openinfer-core -p openinfer-comm-fabric-lib -p openinfer-deepseek-v4`、`PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-deepseek-v4 --features pplx-ep-bench --bin deepseek_pplx_a2a_bench`、`PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` 均通过。H200 release build 通过;`output_len=2` smoke status 
0;`output_len=64` 生成 64/64 token 后 teardown status 139,metrics 已打印:steady p50 **66.65 ms**、p95 **68.15 ms**、max **69.47 ms**。PPLX exact ground truth 19/20;case 13 输出 `2500` 而非 `2500 meters`,NCCL 同 case 同样失败,因此不归因到 PPLX placement/通信改动。 - **Next**:在当前 per-NUMA slice placement 上做低侵入 profile,复核 legacy PPLX 相对 NCCL 剩余 **~3-4 ms** gap 和 rank0 drain 结构。 ## TL;DR -**2026-05-17 cleanup note**:direct routed single-node path 的 77-84ms 数据只作为机制证据保留;实现已回到 legacy four-stage PPLX。最新整理把 CPU placement 公共化到 `pegainfer_core::cpu_topology`,按 NUMA cpulist 给 rank 连续切片,CPU0 不用、CPU1 给 scheduler,其它 worker 从本 rank slice 内取。bench 的 pplx bootstrap 入口改成隐藏 wrapper,避免暴露 direct 内部 placement 类型;dsv4 临时 NVTX ranges 已清理,pplx 自带 NVTX 不动。本地 release check 和 `cpu_topology` 单测通过;H200 `output_len=64` 复测 steady p50 **66.65 ms**、p95 **68.15 ms**,退出码 **139** 发生在 metrics 之后的已知 teardown 阶段。 +**2026-05-17 cleanup note**:direct routed single-node path 的 77-84ms 数据只作为机制证据保留;实现已回到 legacy four-stage PPLX。最新整理把 CPU placement 公共化到 `openinfer_core::cpu_topology`,按 NUMA cpulist 给 rank 连续切片,CPU0 不用、CPU1 给 scheduler,其它 worker 从本 rank slice 内取。bench 的 pplx bootstrap 入口改成隐藏 wrapper,避免暴露 direct 内部 placement 类型;dsv4 临时 NVTX ranges 已清理,pplx 自带 NVTX 不动。本地 release check 和 `cpu_topology` 单测通过;H200 `output_len=64` 复测 steady p50 **66.65 ms**、p95 **68.15 ms**,退出码 **139** 发生在 metrics 之后的已知 teardown 阶段。 -把 `pegainfer-comm`(pplx-garden 派生)的 NVLink + RDMA MoE all-to-all 后端从 skeleton 接成可用实现,给 dsv4-flash **decode MoE** 提供另一条通信路径,运行时通过开关切换到它;走 pplx 时 decode CUDA Graph 全局关闭。范围只覆盖 routed expert 这一段 dispatch/combine,prefill、shared expert、attention、indexer 不动。**不**引入 trait/dyn 抽象——只有一个实现,直接用 concrete 类型。 +把 `openinfer-comm`(pplx-garden 派生)的 NVLink + RDMA MoE all-to-all 后端从 skeleton 接成可用实现,给 dsv4-flash **decode MoE** 提供另一条通信路径,运行时通过开关切换到它;走 pplx 时 decode CUDA Graph 全局关闭。范围只覆盖 routed expert 这一段 dispatch/combine,prefill、shared expert、attention、indexer 不动。**不**引入 trait/dyn 抽象——只有一个实现,直接用 concrete 类型。 ## 工作场景 @@ -32,7 +32,7 @@ ## 现状(读码确认过的事实) -### 
pegainfer-comm 公共表面(skeleton) +### openinfer-comm 公共表面(skeleton) - `EpAllToAll` trait:`dispatch / combine / poll / release` 四个 `&self` 方法,对象安全,`Send + Sync`。 - `EpBackendBuilder::build()`:**两种 feature 模式都返回 Err**——`hw-rdma` off 时 `BackendUnavailable`,`hw-rdma` on 时 `Unimplemented`。 @@ -41,7 +41,7 @@ - `SendBuf / RecvBuf`:裸 device pointer + elem_count + elem_size + 可选 scale pointer;调用方持有底层 allocation 的所有权。 - `RdmaBackend`(`src/backend/rdma.rs`):私有类型,四个 trait 方法全是 `todo!()`,构造函数当前只存了 `EpTopology`,没拿 `AllToAllContext`。 -### pplx wrapper(`crates/pegainfer-comm-p2p-all-to-all/`) +### pplx wrapper(`crates/openinfer-comm-p2p-all-to-all/`) - `AllToAllContext::new(...)`:21 个参数,需要外部传入 `TransferEngine`、`rank_handles`、预注册的 send/recv buffer + MR、host pointer arrays(sync/send/recv),构造时启动一个 `"p2p_all_to_all Worker"` 后台线程,固定 CPU 亲和性。 - 调用形态是 **四步**(不是 trait 现在写的两步): @@ -53,7 +53,7 @@ ### dsv4-flash 当前 MoE 通信路径 -- decode:`pegainfer-deepseek-v4/src/runtime/moe.rs:1323` 的 `decode_moe_ag_rs_bf16_hidden_with_scratch` +- decode:`openinfer-deepseek-v4/src/runtime/moe.rs:1323` 的 `decode_moe_ag_rs_bf16_hidden_with_scratch` - NCCL `all_gather_bf16_hidden_into`(拼全局 hidden) - 本地路由 + grouped FP4 GEMM(local experts) - NCCL `reduce_scatter_f32_hidden_into`(聚合到本地 token) @@ -61,7 +61,7 @@ - prefill:`moe.rs:1289` 的 `moe_rank_lane_bf16_hidden` - 路由 → expand → grouped GEMM → reduce → `all_reduce_f32_hidden_in_place` - 也是 all-reduce 形态,不是 A2A。 -- 通信抽象层:`pegainfer-deepseek-v4/src/runtime/collectives.rs` 包了一组 NCCL `Comm`-based helper,所有 MoE 通信都过它。 +- 通信抽象层:`openinfer-deepseek-v4/src/runtime/collectives.rs` 包了一组 NCCL `Comm`-based helper,所有 MoE 通信都过它。 - 没有任何 dispatch/combine 形态的接口,**需要新增**而不是替换。 ### 不做的事 @@ -81,14 +81,14 @@ dsv4-flash MoE (rank lane) │ ├── 走 NCCL AG/RS(已有)—— CUDA Graph 友好 │ - └── 走 pegainfer-comm(新增)—— eager only,graph 关闭 + └── 走 openinfer-comm(新增)—— eager only,graph 关闭 │ └── EpBackend → AllToAllContext → a2a-kernels + fabric-lib ``` - 切换粒度:**整 process 一致**,由 CLI/Config 决定,启动后不变;同一 layer 
不会跨后端。 -### pegainfer-comm 表面简化 +### openinfer-comm 表面简化 skeleton 里的 `EpAllToAll` trait + `Box` 删掉。`EpBackend` 改成 concrete 结构,inherent 方法直接暴露四步: @@ -107,7 +107,7 @@ impl EpBackend { ### dsv4 集成入口 -新增 `pegainfer-deepseek-v4/src/runtime/moe_pplx.rs`(flat layout,无 `mod.rs`): +新增 `openinfer-deepseek-v4/src/runtime/moe_pplx.rs`(flat layout,无 `mod.rs`): - `decode_moe_pplx_bf16_hidden_with_scratch(ctx, config, weights, ptr_cache, ep, layer, input, token_ids, scratches)` - 顺序大致:`dispatch_send` → 同流跑 shared expert → `dispatch_recv` → grouped FP4 GEMM → `combine_send` → 同流跑后续 layer 准备 → `combine_recv` 写回 hidden。 @@ -125,7 +125,7 @@ pplx 路径的 send/recv buffer 必须**预注册到 fabric-lib 的 MR**,不 ### 初始化位置 -`pegainfer-deepseek-v4/src/direct.rs` 里 `RankWorker::spawn` 阶段,跟 NCCL `Comm` 同级: +`openinfer-deepseek-v4/src/direct.rs` 里 `RankWorker::spawn` 阶段,跟 NCCL `Comm` 同级: ``` RankWorker::spawn @@ -142,7 +142,7 @@ RankWorker::spawn ### 运行时切换 -- 编译期:`pegainfer-comm` 的 `hw-rdma` feature 已经存在;dsv4 加一个 `pplx-ep` feature,关掉时 `moe_pplx.rs` 整个 `cfg`-out,不拉 fabric-lib 依赖。 +- 编译期:`openinfer-comm` 的 `hw-rdma` feature 已经存在;dsv4 加一个 `pplx-ep` feature,关掉时 `moe_pplx.rs` 整个 `cfg`-out,不拉 fabric-lib 依赖。 - 运行时:`Config` 加 `moe_backend: MoeBackend { Nccl, Pplx }`,CLI `--moe-backend nccl|pplx`,默认 `nccl`。选 `pplx` 时: - `pplx-ep` feature 必须开,否则启动报错。 - decode CUDA Graph 自动关闭(不需要用户单独传参)。 @@ -153,7 +153,7 @@ RankWorker::spawn 确认 scratch/buffer 形态、初始化位置、CLI 入口。 -### Step 1:pegainfer-comm 去 skeleton,砍 trait +### Step 1:openinfer-comm 去 skeleton,砍 trait - 删 `EpAllToAll` trait 与 `Box`,`EpBackend` 改 concrete + inherent 四步方法。 - 改 `EpBackendBuilder::build()`:`hw-rdma` 分支真正构造 `AllToAllContext`。 @@ -162,7 +162,7 @@ RankWorker::spawn ### Step 2:dsv4 加 pplx 路径 -- `pegainfer-deepseek-v4/src/runtime/moe_pplx.rs` 写 `decode_moe_pplx_bf16_hidden_with_scratch`,路由 / grouped GEMM / shared expert 复用现有 helper,只把 AG/RS 替换成 dispatch_send→recv + combine_send→recv。 +- `openinfer-deepseek-v4/src/runtime/moe_pplx.rs` 写 
`decode_moe_pplx_bf16_hidden_with_scratch`,路由 / grouped GEMM / shared expert 复用现有 helper,只把 AG/RS 替换成 dispatch_send→recv + combine_send→recv。 - 新增 `MoePplxScratch`,跟 `MoeAgRsScratch` 同级,按 `cfg(feature="pplx-ep")` 在 `RankWorker` 里二选一持有。 - `RankWorker` MoE 调用点加 `if let Some(ep) = &self.ep { ... } else { 现有 NCCL 路径 }`。 - `Config` / CLI 加 `--moe-backend`,pplx 时强制关 decode CUDA Graph。 @@ -201,14 +201,14 @@ RankWorker::spawn ## 当前进度(2026-05-16) **已落地** -- `pegainfer-comm` 去 skeleton + 砍 trait:`EpBackend` 是 concrete struct,四步 `dispatch_send / dispatch_recv / combine_send / combine_recv` 是 inherent method,构造走 `EpBackend::new(EpBackendParams)`,外加 `tokens_per_expert_ptr()` 让下游 grouped GEMM 拿 per-expert 计数。`unsafe impl Send` 让 EpBackend 可以从外部线程移交进 RankWorker。 +- `openinfer-comm` 去 skeleton + 砍 trait:`EpBackend` 是 concrete struct,四步 `dispatch_send / dispatch_recv / combine_send / combine_recv` 是 inherent method,构造走 `EpBackend::new(EpBackendParams)`,外加 `tokens_per_expert_ptr()` 让下游 grouped GEMM 拿 per-expert 计数。`unsafe impl Send` 让 EpBackend 可以从外部线程移交进 RankWorker。 - **砍掉 LibTorch 依赖**:a2a-kernels 自己定义 cxx `ScalarType` enum(namespace 改 `a2a_kernels::`),a2a-kernels 与 p2p-all-to-all 的 `torch-lib` dep 全部移除。`hw-rdma` feature 现在只需要 CUDA + RDMA Verbs + GDRCopy,不再拉 LibTorch / pyo3。 -- dsv4 加 `pplx-ep` feature,optional 依赖 `pegainfer-comm/hw-rdma`。 +- dsv4 加 `pplx-ep` feature,optional 依赖 `openinfer-comm/hw-rdma`。 - `runtime/moe_pplx.rs`:`decode_moe_pplx_bf16_hidden_with_scratch` body 完整——本地 route → dispatch_send → shared expert(overlap) → dispatch_recv → host 端 prefix-sum 出 expert_indptr → grouped FP4 expert → combine_send → combine_recv(`accumulate=true`,把 shared expert 折进 routed 输出)。 - `state.rs` 加 `MoePplxScratch`(MR-recv buffer 还是占位,要在 bootstrap 阶段注册)+ `MoeRunContext` / `MoePplxRunContext` 把两条 MoE 路径统一成一个参数。 - `block_decode_rank_lane_bf16_hidden_with_scratch`(含 batch 变体)签名改成 `moe: &mut MoeRunContext<'_>`,内部 `dispatch_decode_moe_step` 按 `moe.pplx.is_some()` 分发到两条路径。 - `RankWorker` 新增 
`RankCommand::EnablePplx { ep_backend }`;`DeepSeekV4DirectGenerator::enable_pplx(Vec)` 把 per-rank 后端塞进对应 worker。 -- `cargo check -p pegainfer-comm` 通过。dsv4 因为 pegainfer-kernels 在本机 CUDA/flashinfer SDK 缺失编译不了(pre-existing),结构性 review 看 diff。 +- `cargo check -p openinfer-comm` 通过。dsv4 因为 openinfer-kernels 在本机 CUDA/flashinfer SDK 缺失编译不了(pre-existing),结构性 review 看 diff。 - 修通 H200 decode 全链路的几次硬伤: - **per-rank TransferEngine**:每张卡绑自己的 CX-7 NIC,`AllToAllRankHandle` 才能带上 peer 自己的 NIC `main_address`。早期共享 TE 时所有 RankHandle 都指向 worker[0],触发 RDMA `LOC_PROT_ERR`。 - **`num_dp_groups = world_size / dp_size`**(纯 EP 下 = world_size):之前硬编码 1 让 `num_routed[N*num_experts]` 越界写。 @@ -220,7 +220,7 @@ RankWorker::spawn - `PplxRankResources.peer_mappings` 接管 peer CUMem `CUMemMapping` 的生命周期,不再 `Box::leak`。 **Functional baseline(commit `0abe8fa`)** -- 命令:`PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 4 --warmup 0 --iters 1` +- 命令:`OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 4 --warmup 0 --iters 1` - 结果:prefill 521 ms,first decode step 7331 ms,steady TPOT **6900 ms / tok**(0.14 tok/s)。 - NCCL 对照(同机 H200):steady TPOT **63.77 ms / tok**(15.69 tok/s)。 - 退出时 a2a_context worker shutdown 路径有 segfault,不影响前向;后续清理。 @@ -229,15 +229,15 @@ RankWorker::spawn - Per-rank TransferEngine / NIC 绑定修复后,Verbs `LOCAL_PROTECTION_ERROR` 消失,说明 MR/lkey/NIC ownership 的大方向已经对齐。 - `dispatch_recv` / `combine_recv` 的 host-visible done flag 改成最后发布:先 reset kernel/worker 共享状态,再 `fence_release_system()`,最后 `st_mmio_b8(*_done, 1)`。这修的是 worker 看到 done 后推进下一步、而上一轮 kernel 尾部又清 flag 的 timing race。 - `dispatch_recv` 额外修了 single-node 常见的 `num_fabric_tokens == 0` completion path:只允许 block 0 发布完成;有 fabric tokens 时只让 `num_local_tokens > 0` 的 block 参与 `grid_counter`。否则空 block 也可能满足 completion 条件。 -- H200 短跑命令:`PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 RUST_BACKTRACE=1 
./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 2 --warmup 0 --iters 1` +- H200 短跑命令:`OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 RUST_BACKTRACE=1 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 2 --warmup 0 --iters 1` - H200 短跑结果:完成,`prefill_ms=534.96`,`first_decode_step_ms=1487.85`,`e2e_ms=2023.10`,`decode_tok_s=0.67`;日志 `$RESULT_ROOT/pplx_after_flag_order.log`。这是 correctness signal,不作为新的 TPOT baseline。 **GPU expert-indptr update** - `deepseek_pplx_padded_expert_indptr_cuda` 新增为 1-block helper kernel:读取 `dispatch_recv` 写出的 `recv_tokens_per_expert[local_experts]`,按 pplx `expert_padding` 生成 padded `expert_indptr[local_experts + 1]`。 - `moe_pplx.rs` 删除每层 `moe_stream.synchronize()`、D2H `recv_tokens_per_expert`、CPU prefix sum、H2D `expert_indptr`,改为 `dispatch_recv -> device prefix -> event -> grouped GEMM`。 - 第一版为了不让 host 读动态 padded count,grouped FP4 GEMM 的 host `rows` 使用 `expanded_input.seq_capacity()`;真实 expert 范围仍由 device `expert_indptr` 控制,`combine_send` 仍从 pplx worker 的 device `num_recv_tokens` 读真实 token 数。这个版本优先消掉同步闭环,后续可再把 dynamic rows 也留在 GPU 侧。 -- Local validation: `rustfmt --edition 2024 --check pegainfer-kernels/src/ffi.rs pegainfer-deepseek-v4/src/runtime/moe_pplx.rs pegainfer-deepseek-v4/src/runtime/state.rs` passed; `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed. -- H200 validation: `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed; `cargo build --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. +- Local validation: `rustfmt --edition 2024 --check openinfer-kernels/src/ffi.rs openinfer-deepseek-v4/src/runtime/moe_pplx.rs openinfer-deepseek-v4/src/runtime/state.rs` passed; `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed. 
+- H200 validation: `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed; `cargo build --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. - H200 `output_len=2`: `prefill_ms=519.38`, `first_decode_ms=191.55`, `e2e_ms=711.36`, `decode_tok_s=5.22`; log `$RESULT_ROOT/pplx_gpu_indptr_olen2.log`. - H200 `output_len=4`: `prefill_ms=488.55`, `first_decode_ms=199.41`, steady TPOT **123.91 ms/tok**, `e2e_ms=936.22`, `decode_tok_s=6.71`; log `$RESULT_ROOT/pplx_gpu_indptr_olen4.log`. - Negative experiment: after GPU indptr, reading back only `expert_indptr[local_experts]` per MoE layer to pass exact dynamic `rows` into grouped FP4 regressed H200 `output_len=4` to `first_decode_ms=1451.94`, steady TPOT **1544.45 ms/tok**, `e2e_ms=5040.78`; log `$RESULT_ROOT/pplx_dynamic_rows_olen4.log`. The one-scalar host wait is still far more expensive than running grouped FP4 over scratch capacity, so this change was reverted. Dynamic rows only make sense through a GPU-only launch strategy or a custom kernel wrapper. 
@@ -283,9 +283,9 @@ RankWorker::spawn - 结论:当前“MoE 抖”更像 MoE 前置算子/launch/rank 到达方差被 pplx worker range 放大显示,而不是 pplx 四段 kernel 平均时间单独决定。 - 2026-05-16 临时 HC mix bypass 实验(未保留代码)把 `seq_len=1` 且无 raw/rms side output 的 `deepseek_hc_mixes_cuda` 从 BF16->F32 + cuBLAS `Sgemv` + scale kernel 改成已有 `deepseek_hc_mixes_kernel`。H200 `output_len=8` 短测 steady TPOT **149.96 ms/tok**(p50 **127.93 ms**,max **232.07 ms**),nsys event profile steady TPOT **133.31 ms/tok**。profile 证实 cuBLAS GEMV 基本消失,`deepseek_hc_mixes_kernel` 66 次 total **1.77 ms**,但整体仍由 NCCL all-reduce、FP4 grouped GEMM 和 CUDA API/launch 长尾主导;因此该改法不作为主线保留。 - 2026-05-16 加入一次性 NVTX probe(只随 `pplx-ep` feature 编译):request 主线程标出 `dsv4.request.prefill / step / sample / emit_token / advance_decode`;runtime 标出 `dsv4.runtime.dispatch_rank_decode / wait_rank_decode / rank0_logits`;rank worker 标出 `dsv4.rank.decode / token_upload / embedding / embedding_all_reduce / hc_expand / decode_layer / final_logits / gather_logits / logits_all_gather / logits_dtoh`;layer 内标出 `dsv4.layer.attn_hc_pre_norm / attention / attention_full / attention_ratio4 / attention_compressed / ffn_hc_pre_norm / moe / ffn_hc_post`。现有 pplx worker range(`p2p_all_to_all / dispatch / combine / process_routing_info / barrier`)保留。这样同一条 nsys timeline 能直接判断 steady 60ms gap 是 sampling/logits、rank response wait、MoE 前 operator 到达、还是 pplx worker protocol。 -- Validation for the probe: local `cargo fmt --check -p pegainfer-deepseek-v4` passed; local `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed; H200 `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed after syncing the instrumented files and restoring local `deepseek_hc.cu` on the remote tree. The remaining warnings are the pre-existing pplx visibility/unused warnings. 
-- 2026-05-16 ratio128 compressed decode scratch 化:单 token non-overlap compressed attention 改为复用 `AttentionProjectionScratch / AttentionIndexScratch / AttentionAuxScratch / AttentionOutputScratch`,新增 `compressor_nonoverlap_decode_bf16_hidden_at_scratch` 与 `compress_topk_indices_decode_into`,删除旧 owned-return 单 token入口。Local validation: `cargo fmt --check -p pegainfer-deepseek-v4` passed, `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed, `git diff --check` passed. H200 validation: `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed, `cargo build --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 `output_len=8` smoke completed with steady TPOT **167.95 ms/tok** (`$RESULT_ROOT/pplx_ratio128_scratch_olen8.log`), and node NVTX profile completed with steady TPOT **147.31 ms/tok** (`$RESULT_ROOT/pplx_ratio128_scratch_nvtx_olen8.{log,nsys-rep,sqlite}`). The new sqlite confirms decode-window `cuMemAllocAsync` and `cuMemFreeAsync` are both **0**; previous probe had `cuMemAllocAsync=11200` all attributed to `dsv4.layer.attention_compressed`. TPOT did not materially improve, so allocator spikes were real but not the current wall-clock root cause. -- 2026-05-16 ratio4 topk refactor correction:第一次实现打到 single-token ratio4 helper,但 H200 profile 和源码路径确认 decode 走的是 `attention_decode_compressed_overlap_rank_local_collective_bf16_hidden_batch_with_scratch`,即 `bs=1` 也走 batch helper。随后删除 single-token fused helper、删除 dead batch `indexer_topk_indices_decode_batch_into` wrapper,在 batch path 中把 `window_topk + indexer_topk + concat` 合为 `deepseek_ratio4_decode_topk_indices_batch_kernel`;`max_compressed_len == 0` 仍走 window-only path。Local validation: `cargo fmt --check -p pegainfer-deepseek-v4 -p pegainfer-kernels` passed, `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep` passed, `git diff --check` passed. 
H200 validation: `cargo build --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; bench binary fatbin contains `_Z48deepseek_ratio4_decode_topk_indices_batch_kernel...`, and remote source calls `ratio4_decode_topk_indices_batch_into` from `runtime/block.rs`. +- Validation for the probe: local `cargo fmt --check -p openinfer-deepseek-v4` passed; local `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed; H200 `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed after syncing the instrumented files and restoring local `deepseek_hc.cu` on the remote tree. The remaining warnings are the pre-existing pplx visibility/unused warnings. +- 2026-05-16 ratio128 compressed decode scratch 化:单 token non-overlap compressed attention 改为复用 `AttentionProjectionScratch / AttentionIndexScratch / AttentionAuxScratch / AttentionOutputScratch`,新增 `compressor_nonoverlap_decode_bf16_hidden_at_scratch` 与 `compress_topk_indices_decode_into`,删除旧 owned-return 单 token入口。Local validation: `cargo fmt --check -p openinfer-deepseek-v4` passed, `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed, `git diff --check` passed. H200 validation: `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed, `cargo build --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 `output_len=8` smoke completed with steady TPOT **167.95 ms/tok** (`$RESULT_ROOT/pplx_ratio128_scratch_olen8.log`), and node NVTX profile completed with steady TPOT **147.31 ms/tok** (`$RESULT_ROOT/pplx_ratio128_scratch_nvtx_olen8.{log,nsys-rep,sqlite}`). The new sqlite confirms decode-window `cuMemAllocAsync` and `cuMemFreeAsync` are both **0**; previous probe had `cuMemAllocAsync=11200` all attributed to `dsv4.layer.attention_compressed`. TPOT did not materially improve, so allocator spikes were real but not the current wall-clock root cause. 
+- 2026-05-16 ratio4 topk refactor correction:第一次实现打到 single-token ratio4 helper,但 H200 profile 和源码路径确认 decode 走的是 `attention_decode_compressed_overlap_rank_local_collective_bf16_hidden_batch_with_scratch`,即 `bs=1` 也走 batch helper。随后删除 single-token fused helper、删除 dead batch `indexer_topk_indices_decode_batch_into` wrapper,在 batch path 中把 `window_topk + indexer_topk + concat` 合为 `deepseek_ratio4_decode_topk_indices_batch_kernel`;`max_compressed_len == 0` 仍走 window-only path。Local validation: `cargo fmt --check -p openinfer-deepseek-v4 -p openinfer-kernels` passed, `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep` passed, `git diff --check` passed. H200 validation: `cargo build --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; bench binary fatbin contains `_Z48deepseek_ratio4_decode_topk_indices_batch_kernel...`, and remote source calls `ratio4_decode_topk_indices_batch_into` from `runtime/block.rs`. - 2026-05-16 ratio4 batch topk profile:H200 `output_len=16` nsys report `$RESULT_ROOT/pplx_ratio4_batch_topk_nvtx_olen16.{log,nsys-rep,sqlite}` completed with benchmark steady TPOT avg **152.85 ms**, p50 **144.03 ms**, p95 **188.01 ms**, max **196.06 ms**. Compared with `$RESULT_ROOT/pplx_ratio4_refactor_nvtx_olen16.sqlite`, NVTX distributions improved in the specific attention range: `dsv4.layer.attention_ratio4` avg **2.178 -> 0.827 ms**, p95 **15.746 -> 1.593 ms**, max **42.748 -> 34.788 ms**; `dsv4.rank.decode` p50 **114.779 -> 79.695 ms**. Request-level `dsv4.runtime.wait_rank_decode` p50 only moved **175.804 -> 143.788 ms** and still dominates, so this change removes real ratio4 launch fanout but does not complete the TPOT target by itself. Nsight kernel table captured only one device, so kernel-name absence/presence in the sqlite is not a sufficient proof source; source path + fatbin symbol + NVTX movement are the useful evidence. 
- 2026-05-17 decode-only driver-contention profile:temporary `cudaProfilerStart/Stop` was hard-coded around decode and nsys was run with `--capture-range=cudaProfilerApi --sample=process-tree --sampling-period=1000000 --cpuctxsw=process-tree --cudabacktrace=all:1000 --cuda-flush-interval=100 --osrt-threshold=1000 --stats=true`; remote report `$RESULT_ROOT/pplx_driver_contention_olen8.{log,nsys-rep,sqlite}`. That profiler API patch was removed during cleanup; this entry is a historical capture record, not the current reusable profiling command. The capture fixed the previous truncated-kernel profile: every device has **17787** kernels and D2H device memcpy time is only **93 us** total across 7 copies. CUDA API summary shows long host-side tails instead: - `cudaLaunchCooperativeKernel`: 9632 calls, total **1287.95 ms**, max **168.34 ms**; stack is `cudaLaunchCooperativeKernel -> a2a_dispatch_send -> EpBackend::dispatch_send -> decode_moe_pplx...`. @@ -295,10 +295,10 @@ RankWorker::spawn - `cuKernelSetAttribute` has three **20.3-20.7 ms** tails inside `attention_ratio4 step=3 layer=2`, matching the driver-state hypothesis. Since the captured callchains for these rows are absent in Nsight, the current attribution is by NVTX containment and correlation timing rather than Rust stack. - CPU sampling marks rank threads as `Running` during samples, so the rank workers are not simply sleeping in application code during the measured decode; the expensive sleeps in OSRT are mostly channel waits outside hot work or libcuda internal mutex/futex behavior during launch. This profile changes the interpretation of ratio4/HC spikes: tune kernel arithmetic only after checking API-vs-GPU duration. The immediate optimization axis is launch count / module lookup / attribute setup churn in the decode path, while MoE/NCCL communication remains excluded from the current optimization target. 
-- 2026-05-17 upstream `pegainfer-comm/benchmarks/bench_all_to_all.py` was run on H200 with dsv4-shaped payloads to get the pplx-side theoretical MoE A2A floor. The script now exposes `--expert-padding` so it can match Rust `PplxBootstrapParams::default().expert_padding = 16`; default upstream padding remains 1. Common command shape: +- 2026-05-17 upstream `openinfer-comm/benchmarks/bench_all_to_all.py` was run on H200 with dsv4-shaped payloads to get the pplx-side theoretical MoE A2A floor. The script now exposes `--expert-padding` so it can match Rust `PplxBootstrapParams::default().expert_padding = 16`; default upstream padding remains 1. Common command shape: ```bash -cd $PEGAINFER_DIR-comm +cd $OPENINFER_DIR-comm NCCL_NVLS_ENABLE=0 PYTHONPATH=. ../.venv/bin/python benchmarks/bench_all_to_all.py \ --world-size 8 --dp-size 1 --nets-per-gpu 1 \ --max-private-tokens 64 \ @@ -316,15 +316,15 @@ The two payload points: | Real bs=1 decode: `max_num_tokens=1`, total EP routes = `8 * 1 * 6 = 48` | `$RESULT_ROOT/dsv4_pplx_a2a_max1_pad16.{log,json}` | **63.87 us** | **21.54 us** | **85.41 us/layer** | **52.64 us/layer** | | Rust bootstrap capacity: `max_num_tokens=8`, total EP routes = `8 * 8 * 6 = 384` | `$RESULT_ROOT/dsv4_pplx_a2a_max8_pad16.{log,json}` | **63.71 us** | **21.79 us** | **85.50 us/layer** | **53.82 us/layer** | -Interpretation: the upstream multi-process pplx benchmark reports the GPU/protocol A2A floor for our BF16/topk6/EP8 payload as roughly **0.085 ms per MoE layer**, or about **3.7 ms/token** across 43 layers if dispatch+combine are serialized. This is orders below the 140-160 ms request TPOT class, so the current gap is not explained by raw payload movement. This benchmark does not include pegainfer's full model runtime, explicit stream handoffs, model-side operator launch fanout, grouped GEMM/attention/NCCL, or request wait-rank effects; use it as a theoretical lower bound, not as an end-to-end replacement profile. 
+Interpretation: the upstream multi-process pplx benchmark reports the GPU/protocol A2A floor for our BF16/topk6/EP8 payload as roughly **0.085 ms per MoE layer**, or about **3.7 ms/token** across 43 layers if dispatch+combine are serialized. This is more than an order of magnitude below the 140-160 ms request TPOT class, so the current gap is not explained by raw payload movement. This benchmark does not include openinfer's full model runtime, explicit stream handoffs, model-side operator launch fanout, grouped GEMM/attention/NCCL, or request wait-rank effects; use it as a theoretical lower bound, not as an end-to-end replacement profile. - 2026-05-17 added `deepseek_pplx_a2a_bench`, a Rust-side microbench that reuses the same dsv4 `build_intra_node_backends_for_devices` wrapper and `EpBackend` methods but excludes all model operators. It allocates BF16 hidden/out buffers on each rank, uses synthetic balanced routes to the next 6 ranks (`topk=6`), and reports both flattened rank×iteration stage times and per-iteration max across the 8 ranks. This isolates the single-process Rust wrapper / bootstrap / stream handoff layer between the upstream Python benchmark and the full dsv4 runtime.
Command shape: ```bash -cd $PEGAINFER_DIR +cd $OPENINFER_DIR PATH=$CARGO_BIN_DIR:/usr/local/cuda/bin:/usr/local/bin:/usr/bin:/bin:$PATH \ - cargo build --release -p pegainfer-deepseek-v4 --features pplx-ep-bench \ + cargo build --release -p openinfer-deepseek-v4 --features pplx-ep-bench \ --bin deepseek_pplx_a2a_bench ./target/release/deepseek_pplx_a2a_bench \ --model-path $MODEL_DIR \ @@ -346,13 +346,13 @@ Interpretation: the Rust single-process wrapper still keeps pplx A2A at roughly ### Per-NUMA Slice Placement Validation -H200 validation after moving CPU topology helpers to `pegainfer_core::cpu_topology`: +H200 validation after moving CPU topology helpers to `openinfer_core::cpu_topology`: - Local build gates passed: - - `cargo test --release -p pegainfer-core cpu_topology -- --nocapture` - - `cargo fmt --check -p pegainfer-core -p pegainfer-comm-fabric-lib -p pegainfer-deepseek-v4` - - `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep-bench --bin deepseek_pplx_a2a_bench` - - `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` + - `cargo test --release -p openinfer-core cpu_topology -- --nocapture` + - `cargo fmt --check -p openinfer-core -p openinfer-comm-fabric-lib -p openinfer-deepseek-v4` + - `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-deepseek-v4 --features pplx-ep-bench --bin deepseek_pplx_a2a_bench` + - `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` - H200 release builds passed for `bench_serving` and `deepseek_pplx_a2a_bench`. - Startup placement now reserves CPU0 and CPU1, then slices each NUMA node by rank: - NUMA0 ranks 0-3 use even CPU slices: rank0 `2..46`, rank1 `48..94`, rank2 `96..142`, rank3 `144..190`. @@ -414,7 +414,7 @@ Profile file: `$RESULT_ROOT/pplx_moe_stage_nvtx_olen16.sqlite` on `jzh200-11`. 
Command shape: ```bash -PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 nsys profile \ +OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 nsys profile \ --force-overwrite=true --trace=nvtx --sample=none --stats=false \ -o $RESULT_ROOT/pplx_moe_stage_nvtx_olen16 \ ./target/release/bench_serving --model-path $MODEL_DIR \ @@ -465,7 +465,7 @@ Profile file: `$RESULT_ROOT/pplx_moe_stage_cuda_capture_olen16.sqlite` on `jzh20 Historical command shape while the temporary profiler API patch was present: ```bash -PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 nsys profile \ +OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 nsys profile \ --force-overwrite=true --trace=cuda,nvtx --sample=none \ --capture-range=cudaProfilerApi --capture-range-end=stop-shutdown \ --cuda-flush-interval=100 --stats=false \ @@ -534,7 +534,7 @@ Profile file: `$RESULT_ROOT/pplx_driver_contention_olen8.sqlite` on `jzh200-11`. Historical command shape while the temporary profiler API patch was present: ```bash -PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 nsys profile \ +OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 nsys profile \ --trace=cuda,nvtx,osrt \ --sample=process-tree --sampling-period=1000000 \ --cpuctxsw=process-tree \ @@ -704,21 +704,21 @@ Current pplx worker CPU-pool separation experiment: - **Hypothesis**: if part of the steady tail comes from host progress jitter, then avoiding exact CPU overlap between DeepSeek rank workers and pplx a2a workers will reduce p95/max while leaving p50 roughly unchanged. - **Ceiling estimate**: the new clean log shows an exact conflict: DeepSeek rank worker 3 is pinned to CPU **6**, and pplx a2a worker for cuda:0 is also pinned to CPU **6**. Earlier profiles showed p95/max dominated by host progress/driver wait, so the plausible benefit is tail reduction rather than a 60-80 ms p50 win. 
- **Keep/revert criterion**: keep only if local/H200 build passes, H200 logs show no rank-worker/a2a-worker exact CPU overlap, `output_len=64` smoke generates all 64 tokens, and p95 improves by >=20 ms with p50 <= baseline + 5 ms. Revert if p50 regresses beyond 5 ms, p95/max worsens, or affinity selection fails on a constrained CPU mask. -- **Result**: kept. Local `cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, H200 `cargo build --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, and two H200 `output_len=64` serving runs completed all 64 tokens. The new logs show rank workers on CPUs **0/2/4/6/9/11/13/15** and pplx a2a workers on **8/26/50/74/3/27/51/75**, with no exact overlap. Results: `$RESULT_ROOT/pplx_cpu_pool_olen64.log` measured p50 **144.00 ms**, p95 **159.96 ms**, max **164.00 ms**, avg **144.06 ms**; `$RESULT_ROOT/pplx_cpu_pool_olen64_r2.log` measured p50 **144.00 ms**, p95 **162.86 ms**, max **168.01 ms**, avg **145.03 ms**. This passes the tail gate and reduces average TPOT by ~15-16 ms versus `$RESULT_ROOT/pplx_current_olen64.log`; it does not move p50, so the remaining gap is not CPU-overlap tail. Residual risk: the second run printed a teardown-time NCCL abort panic after metrics were emitted; this matches the known shutdown-path instability and is not forward-path evidence. +- **Result**: kept. Local `cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, H200 `cargo build --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, and two H200 `output_len=64` serving runs completed all 64 tokens. The new logs show rank workers on CPUs **0/2/4/6/9/11/13/15** and pplx a2a workers on **8/26/50/74/3/27/51/75**, with no exact overlap. 
Results: `$RESULT_ROOT/pplx_cpu_pool_olen64.log` measured p50 **144.00 ms**, p95 **159.96 ms**, max **164.00 ms**, avg **144.06 ms**; `$RESULT_ROOT/pplx_cpu_pool_olen64_r2.log` measured p50 **144.00 ms**, p95 **162.86 ms**, max **168.01 ms**, avg **145.03 ms**. This passes the tail gate and reduces average TPOT by ~15-16 ms versus `$RESULT_ROOT/pplx_current_olen64.log`; it does not move p50, so the remaining gap is not CPU-overlap tail. Residual risk: the second run printed a teardown-time NCCL abort panic after metrics were emitted; this matches the known shutdown-path instability and is not forward-path evidence. Current intra-process route exchange experiment: - **Target metric**: H200 EP8 `output_len=64` serving p50 should improve by at least **10 ms** over the CPU-pool baseline p50 **144.00 ms**, with p95 staying <= **165 ms** and all 64 tokens generated. - **Hypothesis**: if p50 floor still includes per-layer fabric route all-gather overhead, then in the single-process single-node case replacing `route_write_op + route_counter.wait` with a process-local barrier plus direct reads of peer `num_routed` mapped host pointers will reduce p50. This should not change dispatch/combine payload semantics. - **Ceiling estimate**: every MoE layer currently performs route exchange before `process_routing_info`, even though all 8 rank workers live in one process and each rank's `num_routed_host` pointer is directly addressable. The ceiling is one worker transfer submission + immediate wait per layer, so a plausible win is **5-15 ms** p50; it will not close the full 80 ms gap alone. - **Keep/revert criterion**: keep only if local/H200 builds pass, H200 `output_len=64` completes with all tokens, p50 improves by >=10 ms, and p95 stays <=165 ms. Revert on hang, correctness error, p50 regression, or teardown/stop deadlock. -- **Result**: reverted. 
Local `cargo fmt -p pegainfer-comm -p pegainfer-deepseek-v4` and `cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, H200 build passed, and the short H200 smoke `$RESULT_ROOT/pplx_direct_route_olen8.log` completed with steady avg **141.96 ms**, p50 **143.98 ms**, p95/max **159.08 ms**. The full gate run `$RESULT_ROOT/pplx_direct_route_olen64.log` completed all 64 tokens with first decode **223.66 ms**, steady avg **142.64 ms**, p50 **144.00 ms**, p95 **155.95 ms**, max **160.04 ms**. p95 stayed good but p50 did not move by the required 10 ms, so the code was removed. Post-revert H200 smoke `$RESULT_ROOT/pplx_cpu_pool_restored_olen8.log` completed all 8 tokens with p50 **151.81 ms** and p95/max **164.01 ms**; this is a short correctness smoke, not a new baseline. Mechanism lesson: removing only the route all-gather submission/wait is too small or already overlapped; the p50 floor is in larger per-layer a2a state-machine/device wait work, not this specific route exchange. +- **Result**: reverted. Local `cargo fmt -p openinfer-comm -p openinfer-deepseek-v4` and `cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, H200 build passed, and the short H200 smoke `$RESULT_ROOT/pplx_direct_route_olen8.log` completed with steady avg **141.96 ms**, p50 **143.98 ms**, p95/max **159.08 ms**. The full gate run `$RESULT_ROOT/pplx_direct_route_olen64.log` completed all 64 tokens with first decode **223.66 ms**, steady avg **142.64 ms**, p50 **144.00 ms**, p95 **155.95 ms**, max **160.04 ms**. p95 stayed good but p50 did not move by the required 10 ms, so the code was removed. Post-revert H200 smoke `$RESULT_ROOT/pplx_cpu_pool_restored_olen8.log` completed all 8 tokens with p50 **151.81 ms** and p95/max **164.01 ms**; this is a short correctness smoke, not a new baseline. 
Mechanism lesson: removing only the route all-gather submission/wait is too small or already overlapped; the p50 floor is in larger per-layer a2a state-machine/device wait work, not this specific route exchange. Current bs=1 pplx capacity clamp experiment: - **Target metric**: H200 EP8 bs=1 `output_len=64` serving p50 should improve by at least **15 ms** over CPU-pool baseline p50 **144.00 ms**, with p95 <= **165 ms** and all 64 tokens generated. - **Hypothesis**: if a meaningful part of the 144 ms p50 floor is grouped FP4 work over unused pplx scratch capacity, then clamping pplx decode buffers to the actual bs=1 validation envelope will lower `expanded_input.seq_capacity()` and the grouped FP4 `rows` launch bound enough to move p50. This targets the current GPU-only rows issue without reintroducing per-layer host readback. - **Ceiling estimate**: current default `max_num_tokens=8` plus upstream private-token formula gives `max_recv_tokens=1376` rows for H200 EP8 (`topk=6`, local experts=32, padding=16). For bs=1, setting `max_num_tokens=1` and `max_private_tokens=topk` gives `max_recv_tokens=560`, a **59%** reduction in the W1/W3 and W2 grouped FP4 row bound. If grouped capacity work is a large part of p50, expected win is **15-30 ms**; if p50 is dominated by a2a state-machine waits, p50 will stay near 144 ms. - **Keep/revert criterion**: keep only if local/H200 builds pass, H200 `output_len=64` generates all 64 tokens, p50 improves by >=15 ms, and p95 stays <=165 ms. Revert on capacity error, illegal address, correctness-looking output failure, p50 regression, or p95 regression. This experiment is explicitly bs=1; it does not claim batch-serving support. -- **Result**: reverted. Local `cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed and H200 build passed. 
H200 short smoke `$RESULT_ROOT/pplx_bs1_capacity_olen8.log` completed all 8 tokens with first decode **198.61 ms**, steady avg **143.97 ms**, p50 **144.00 ms**, p95/max **156.02 ms**. Full gate `$RESULT_ROOT/pplx_bs1_capacity_olen64.log` completed all 64 tokens with first decode **199.95 ms**, steady avg **143.24 ms**, p50 **144.00 ms**, p95 **155.74 ms**, max **192.25 ms**. The p50 did not move, so the 59% row-bound reduction is not the missing 80 ms. Mechanism lesson: the grouped FP4 capacity overrun may affect small averages, but the p50 floor is dominated by a fixed per-token/per-layer synchronization or worker-state cost outside grouped rows. +- **Result**: reverted. Local `cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed and H200 build passed. H200 short smoke `$RESULT_ROOT/pplx_bs1_capacity_olen8.log` completed all 8 tokens with first decode **198.61 ms**, steady avg **143.97 ms**, p50 **144.00 ms**, p95/max **156.02 ms**. Full gate `$RESULT_ROOT/pplx_bs1_capacity_olen64.log` completed all 64 tokens with first decode **199.95 ms**, steady avg **143.24 ms**, p50 **144.00 ms**, p95 **155.74 ms**, max **192.25 ms**. The p50 did not move, so the 59% row-bound reduction is not the missing 80 ms. Mechanism lesson: the grouped FP4 capacity overrun may affect small averages, but the p50 floor is dominated by a fixed per-token/per-layer synchronization or worker-state cost outside grouped rows. Current local CUDA Graph island experiment: - **Target metric**: H200 EP8 bs=1 pplx decode steady TPOT p50 reduced by at least **10 ms** over at least **16 decode steps**; no correctness regression or CUDA graph capture failure. 
@@ -727,13 +727,13 @@ Current local CUDA Graph island experiment: - **Keep/revert criterion**: keep only if local build passes, H200 build passes, bs=1 correctness smoke completes, and either steady p50 improves by >=10 ms or decode-only profile proves a large API-call reduction without new instability. Revert if graph capture fails, graph replay produces stale-token behavior, or p50 moves less than 5 ms with no clear API reduction. - **Result**: local build passed, H200 build passed, and bs=1 smoke completed. H200 `output_len=24` non-profile run completed with `prefill_ms=576.78`, `first_decode_step_ms=491.85`, steady TPOT avg **159.81 ms**, p50 **154.78 ms**, p95 **214.96 ms**, max **292.08 ms**, samples **22** (`$RESULT_ROOT/pplx_local_graph_olen24.log`). This is worse than the prior ratio4 batch topk profile class (`output_len=16`, avg **152.85 ms**, p50 **144.03 ms**, p95 **188.01 ms**), so the wall-clock gate failed. - **Profile result**: decode-only nsys profile `$RESULT_ROOT/pplx_local_graph_profile_olen12.{log,nsys-rep,sqlite}` captured 11 request steps / 88 rank decode ranges. It recorded **1056** `cuGraphInstantiateWithFlags` calls totaling **944.08 ms**, exactly matching 132 graph islands × 8 ranks. Replay added **23232** `cuGraphLaunch` calls totaling **421.25 ms**. Normalized by request step, `cudaLaunchKernel` fell from **17312** calls/step in `$RESULT_ROOT/pplx_driver_contention_olen8.sqlite` to **13827** calls/step, but adding `cuGraphLaunch` gives **15939** launch-class calls/step, only about **8%** below the old profile. `cuEventRecord` stayed **2752** calls/step, and `cuStreamWaitEvent` stayed in the same range (**2378 -> 2418** calls/step). Mechanism lesson: graph islands this small do remove some kernel launches, but they do not remove the explicit stream handoffs and they replace many launches with graph launches; the effective unit must be a larger operator island or a generated static decode block, not per-helper graphlets. 
-- **Cleanup result**: fine-grained graph island state/wrappers were removed from `state.rs`, `worker.rs`, and `block.rs`; NVTX instrumentation stayed. Local validation passed: `cargo fmt -p pegainfer-deepseek-v4`, `cargo check --release -p pegainfer-deepseek-v4 --features pplx-ep --bin deepseek_pplx_a2a_bench`, and `cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving`. H200 validation passed: `cargo build --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, then `PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 16 --warmup 0 --iters 1` completed with `prefill_ms=617.40`, `first_decode_ms=239.31`, steady TPOT avg **158.84 ms**, p50 **143.99 ms**, p95 **184.00 ms**, max **255.91 ms**, samples **14**; log `$RESULT_ROOT/pplx_no_graph_islands_olen16.log`. +- **Cleanup result**: fine-grained graph island state/wrappers were removed from `state.rs`, `worker.rs`, and `block.rs`; NVTX instrumentation stayed. Local validation passed: `cargo fmt -p openinfer-deepseek-v4`, `cargo check --release -p openinfer-deepseek-v4 --features pplx-ep --bin deepseek_pplx_a2a_bench`, and `cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving`. H200 validation passed: `cargo build --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, then `OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 16 --warmup 0 --iters 1` completed with `prefill_ms=617.40`, `first_decode_ms=239.31`, steady TPOT avg **158.84 ms**, p50 **143.99 ms**, p95 **184.00 ms**, max **255.91 ms**, samples **14**; log `$RESULT_ROOT/pplx_no_graph_islands_olen16.log`. 
Current pplx worker-wait decomposition profile: - **Target metric**: H200 EP8 `output_len=64` NVTX-only profile should explain the PPLX non-rank0 p50 lane (**~74 ms**) by named worker waits, with at least **62 steady samples**. This is diagnostic; it does not decide keep/revert for a perf code path. - **Hypothesis**: if the 74ms non-rank0 lane is the per-layer PPLX worker state machine rather than model compute or raw payload transfer, then `p2p_all_to_all` p50 should be near `74ms / 43 layers ~= 1.7ms`, and one or two named waits should account for most of that per-layer p50. - **Ceiling estimate**: eliminating 1ms/layer of worker wait has a direct ceiling of **~43 ms/token** on non-rank0 lanes, and rank0/wait-rank should follow because `logits_dtoh` is the final drain. -- **Result**: confirmed. Instrumented only `WorkerState::step()` NVTX waits in `pegainfer-comm-p2p-all-to-all/src/a2a_worker.rs`; local `cargo fmt -p pegainfer-comm` passed, local `cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, H200 release build passed. H200 profile `$RESULT_ROOT/pplx_worker_wait_nvtx_olen64.{log,sqlite,nsys-rep}` completed all 64 tokens: first decode **211.51 ms**, steady TPOT p50 **144.00 ms**, p95 **159.92 ms**, max **164.12 ms**, samples **62**. +- **Result**: confirmed. Instrumented only `WorkerState::step()` NVTX waits in `openinfer-comm-p2p-all-to-all/src/a2a_worker.rs`; local `cargo fmt -p openinfer-comm` passed, local `cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed, H200 release build passed. H200 profile `$RESULT_ROOT/pplx_worker_wait_nvtx_olen64.{log,sqlite,nsys-rep}` completed all 64 tokens: first decode **211.51 ms**, steady TPOT p50 **144.00 ms**, p95 **159.92 ms**, max **164.12 ms**, samples **62**. 
- **Worker evidence**: - `p2p_all_to_all`: count **21680**, p50 **1.609 ms**, p95 **16.720 ms**, avg **3.951 ms**; count matches roughly `63 decode steps * 8 ranks * 43 MoE layers`. - `worker_wait_combine_recv_done`: count **21672**, p50 **1.111 ms**, p95 **1.175 ms**, p99 **1.191 ms**. This is the stable per-layer floor. @@ -749,14 +749,14 @@ Current single-node combine-recv grid clamp experiment: - **Hypothesis**: if the stable 1.111 ms/layer combine floor comes from launching `a2a_combine_recv_kernel` as an SM-count cooperative grid even when `num_tokens=1`, then single-node `combine_recv` can launch only `min(num_tokens, num_sms)` blocks without changing output semantics, reducing the worker's `combine_recv_done` wait and the non-rank0 lane. - **Ceiling estimate**: `worker_wait_combine_recv_done` p50 **1.111 ms/layer * 43 layers ~= 47.8 ms/token**. Even a 50% reduction is enough to meet the **20 ms** p50 gate. - **Keep/revert criterion**: keep only if local/H200 builds pass, H200 `output_len=64` generates all tokens, p50 improves by >=20 ms, p95 <=165 ms, and a follow-up worker-wait profile confirms `worker_wait_combine_recv_done` p50 <=0.5 ms/layer. Revert on hang, CUDA illegal address, wrong-looking output failure, p50 regression, or p95 regression. -- **Result**: reverted. Local `cargo fmt -p pegainfer-comm` passed and local `cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 build passed. H200 gate `$RESULT_ROOT/pplx_combine_recv_grid_olen64.log` generated all 64 tokens but measured first decode **195.83 ms**, steady TPOT avg **146.26 ms**, p50 **144.00 ms**, p95 **164.00 ms**, max **187.52 ms**, samples **62**. The process then hit the known teardown segfault, but metrics were already emitted; the forward gate failed because p50 did not improve. 
Mechanism lesson: the 1.111 ms/layer `worker_wait_combine_recv_done` floor is not fixed by reducing `a2a_combine_recv_kernel` from SM-count blocks to `num_tokens` blocks. The cost is more likely in the flag/worker completion protocol or combine-send/recv dependency chain than in empty cooperative-grid block count alone. +- **Result**: reverted. Local `cargo fmt -p openinfer-comm` passed and local `cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 build passed. H200 gate `$RESULT_ROOT/pplx_combine_recv_grid_olen64.log` generated all 64 tokens but measured first decode **195.83 ms**, steady TPOT avg **146.26 ms**, p50 **144.00 ms**, p95 **164.00 ms**, max **187.52 ms**, samples **62**. The process then hit the known teardown segfault, but metrics were already emitted; the forward gate failed because p50 did not improve. Mechanism lesson: the 1.111 ms/layer `worker_wait_combine_recv_done` floor is not fixed by reducing `a2a_combine_recv_kernel` from SM-count blocks to `num_tokens` blocks. The cost is more likely in the flag/worker completion protocol or combine-send/recv dependency chain than in empty cooperative-grid block count alone. Current single-node combine-recv host-flag skip experiment: - **Target metric**: H200 EP8 `output_len=64` serving completes all 64 tokens; steady TPOT p50 improves by at least **20 ms** versus **144.00 ms**, p95 stays <= **165 ms**. Follow-up worker-wait profile should show `worker_wait_combine_recv_done` p50 below **0.5 ms/layer**. - **Hypothesis**: if the stable `worker_wait_combine_recv_done` p50 comes from `a2a_combine_recv_kernel` polling a host-set GDR flag, then single-node (`world_size == node_size`) can skip `combine_recv_flag` because there are no fabric combine payloads; same-stream ordering plus `sync_ptrs` already protect local NVLink combine copies. This should remove the per-layer host→GPU flag latency without changing cross-node behavior. 
- **Ceiling estimate**: CUDA+NVTX profile `$RESULT_ROOT/pplx_cuda_nvtx_olen8.sqlite` captured `a2a_combine_recv_kernel` p50 **179 us** but worker `combine_recv_done` wait p50 **1.113 ms**, leaving roughly **0.9 ms/layer** unexplained by kernel compute. That is **~38 ms/token** across 43 MoE layers. - **Keep/revert criterion**: keep only if local/H200 builds pass, H200 `output_len=64` generates all tokens, p50 improves by >=20 ms, p95 <=165 ms, and worker-wait profile confirms the `combine_recv_done` floor moved. Revert on hang, CUDA illegal address, wrong-looking output failure, p50 regression, or p95 regression. -- **Result**: reverted. Local build with `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 gate `$RESULT_ROOT/pplx_combine_recv_skip_host_flag_olen64.log` completed with exit status 0 and generated all 64 tokens, but measured first decode **199.75 ms**, steady TPOT avg **144.71 ms**, p50 **144.00 ms**, p95 **160.00 ms**, max **164.00 ms**, samples **62**. The p50 did not move, so skipping the host flag is not enough. Mechanism lesson: the `combine_recv_done` wait range is not explained by either empty cooperative-grid blocks or the `combine_recv_flag` MMIO poll in isolation; the remaining floor is likely the broader same-stream combine-send/recv dependency plus worker state-machine cadence. +- **Result**: reverted. Local build with `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 gate `$RESULT_ROOT/pplx_combine_recv_skip_host_flag_olen64.log` completed with exit status 0 and generated all 64 tokens, but measured first decode **199.75 ms**, steady TPOT avg **144.71 ms**, p50 **144.00 ms**, p95 **160.00 ms**, max **164.00 ms**, samples **62**. The p50 did not move, so skipping the host flag is not enough. 
Mechanism lesson: the `combine_recv_done` wait range is not explained by either empty cooperative-grid blocks or the `combine_recv_flag` MMIO poll in isolation; the remaining floor is likely the broader same-stream combine-send/recv dependency plus worker state-machine cadence. Current a2a device wait-counter profile: - **Target metric**: H200 short run should emit device-side wait counters for all four a2a kernels at shutdown, especially `combine_recv recv_flag_avg_cycles` and `combine_recv nvlink_sum_cycles`. This is diagnostic only. @@ -777,7 +777,7 @@ Current single-node active-source combine mask experiment: - **Hypothesis**: if the previous source-specific hang came from kernel-side source inference rather than the protocol itself, then a worker-derived exact active-source mask should avoid the hang and reduce the `sync_ptrs[local_rank][peer + NODE_SIZE]` all-peer wait. - **Ceiling estimate**: same as the previous source-specific attempt: the all-peer `combine_recv nvlink_sum_cycles` counter is large enough that removing non-source peers could plausibly exceed the **20 ms/token** gate if the protocol allowed it. - **Keep/revert criterion**: keep only if local/H200 builds pass, H200 `output_len=64` generates all tokens, p50 improves by >=20 ms, and p95 <=165 ms. Revert on hang, illegal address, wrong-looking output failure, p50 regression, or p95 regression. -- **Result**: reverted. The implementation used `num_recv_tokens[2]` as a GDR-visible active-source mask, updated the C++/Rust FFI signature, and made warp1 in `a2a_combine_recv_kernel` wait only mask lanes. Local `cargo fmt -p pegainfer-comm` passed and local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 release build passed, but `$RESULT_ROOT/pplx_active_source_mask_olen64.log` timed out with status **124** before any forward metric. 
The experiment was removed locally and remotely; restored H200 build passed and `$RESULT_ROOT/pplx_restored_after_mask_olen8.log` completed with 8 tokens, steady p50 **143.96 ms**. Mechanism lesson: even an exact active-source wait set is not a safe local change. The all-peer combine sync is part of a larger bidirectional buffer-reuse/state-machine protocol, not a pure data-dependency wait. +- **Result**: reverted. The implementation used `num_recv_tokens[2]` as a GDR-visible active-source mask, updated the C++/Rust FFI signature, and made warp1 in `a2a_combine_recv_kernel` wait only mask lanes. Local `cargo fmt -p openinfer-comm` passed and local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 release build passed, but `$RESULT_ROOT/pplx_active_source_mask_olen64.log` timed out with status **124** before any forward metric. The experiment was removed locally and remotely; restored H200 build passed and `$RESULT_ROOT/pplx_restored_after_mask_olen8.log` completed with 8 tokens, steady p50 **143.96 ms**. Mechanism lesson: even an exact active-source wait set is not a safe local change. The all-peer combine sync is part of a larger bidirectional buffer-reuse/state-machine protocol, not a pure data-dependency wait. Direct-combine feasibility probe: - **Target metric**: determine whether a future direct-combine prototype can reuse ordinary `CudaSlice` pointers (`expert_out.data`) through CUDA peer access, or whether `expert_out` must be reallocated as bootstrap-managed CUMem. @@ -789,14 +789,14 @@ Current direct-combine prototype: - **Target metric**: H200 EP8 `output_len=64` serving completes all 64 tokens; steady TPOT p50 improves by at least **20 ms** versus **144.00 ms**, p95 stays <= **165 ms**. Short `output_len=8` smoke must generate all 8 tokens before the long gate. 
- **Hypothesis**: if the 1.111 ms/layer combine-completion floor comes from copying routed expert output through legacy `combine_send -> recv_buffer -> combine_recv`, then a single-node direct-combine kernel can publish local `expert_out` readiness, wait for peer readiness, compute the same padded source index from `indices/token_offset/num_routed`, and reduce directly from peer `expert_out` pointers. This should remove one legacy combine-send payload/copy stage and materially lower the non-rank0 lane. - **Ceiling estimate**: `worker_wait_combine_recv_done` p50 is **1.111 ms/layer**, or **~47.8 ms/token** across 43 layers. A direct-combine path only needs to recover ~40% of that floor to pass the **20 ms** p50 gate. -- **Implementation result**: local `cargo fmt -p pegainfer-comm -p pegainfer-deepseek-v4` passed. Local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 release build passed after syncing the direct kernel, cxx FFI, `AllToAllContext::direct_combine_recv`, `EpBackend::direct_combine_recv`, the peer `expert_out` pointer table, and the `moe_pplx.rs` call site. +- **Implementation result**: local `cargo fmt -p openinfer-comm -p openinfer-deepseek-v4` passed. Local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 release build passed after syncing the direct kernel, cxx FFI, `AllToAllContext::direct_combine_recv`, `EpBackend::direct_combine_recv`, the peer `expert_out` pointer table, and the `moe_pplx.rs` call site. - **Gate result**: failed and disabled. Initial H200 `$RESULT_ROOT/pplx_direct_combine_olen8.log` timed out with status **124** after benchmark start. 
Metadata-only direct-combine also timed out until the direct kernel waited for all peer first-half flags before publishing its second-half ready flag; after that protocol fix, `$RESULT_ROOT/pplx_direct_metadata_waitfix_olen8.log` generated all 8 tokens with steady p50 **152.00 ms**. Full direct then localized CUDA 700 to `direct_combine_recv` under `CUDA_LAUNCH_BLOCKING=1` (`$RESULT_ROOT/pplx_direct_full_waitfix_lblock_olen2.log`). Enabling both CUDA peer access and default-mempool peer access fixed the illegal address: `$RESULT_ROOT/pplx_direct_full_mempool_lblock_olen2.log` completed 2 tokens, and `$RESULT_ROOT/pplx_direct_full_mempool_olen8.log` completed 8 tokens with steady avg **141.30 ms**, p50 **143.99 ms**, p95/max **144.04 ms**. The required long gate `$RESULT_ROOT/pplx_direct_full_mempool_olen64.log` did not emit request metrics before the process ended, so the gate failed. The hot path is now hard-disabled with `USE_SINGLE_NODE_DIRECT_COMBINE=false` while the compiled prototype remains dormant. Restored H200 release build passed, and `$RESULT_ROOT/pplx_direct_false_mempool_olen64.log` generated all 64 tokens with first decode **178.04 ms**, steady avg **146.13 ms**, p50 **144.00 ms**, p95 **160.00 ms**, max **164.02 ms**, samples **62**; the process then hit the known teardown-time NCCL abort with status **134** after metrics were printed. - **Mechanism lesson**: direct peer pointer addressability is solvable, and the first protocol deadlock was specifically an early overwrite of the second-half sync slots before lagging peers consumed the previous value. But replacing only `combine_send + combine_recv` inside the legacy worker step does not prove a p50 win: short full-direct p50 stayed **143.99 ms**, and the long run produced no gate metric. The next version needs a distinct single-node worker mode that removes the legacy combine stage/barrier from the state machine, instead of dropping a GPU-only data path behind the same worker cadence. 
Single-node direct worker mode experiment: - **Target metric**: H200 EP8 `output_len=64` serving completes all 64 tokens; steady TPOT p50 improves by at least **20 ms** versus **144.00 ms**, p95 stays <= **165 ms**. Worker-wait NVTX should show `worker_wait_combine_recv_done` p50 below **0.5 ms/layer** without simply moving the same wait into another range. - **Change**: added an explicit `single_node_direct_combine_enabled` mode on `WorkerState`, exposed through `AllToAllContext` and `EpBackend`. `moe_pplx.rs` sets the mode before `dispatch_send` when the direct-combine branch is active. In that mode the worker keeps route/dispatch processing but skips the dispatch and combine fabric barriers, waits for the direct kernel's `combine_send_done`/`combine_recv_done`, then releases `tx_ready` directly. This isolates the single-node direct path from the legacy barrier cadence without changing the legacy combine path. -- **Result**: failed and disabled. Local `cargo fmt -p pegainfer-comm -p pegainfer-deepseek-v4` passed and local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 release build passed. `$RESULT_ROOT/pplx_direct_worker_mode_olen8.log` completed with status 0, steady avg **145.30 ms**, p50 **144.01 ms**, p95/max **151.61 ms**. `$RESULT_ROOT/pplx_direct_worker_mode_olen64.log` generated all 64 tokens, first decode **239.77 ms**, steady avg **147.61 ms**, p50 **144.00 ms**, p95 **164.00 ms**, max **176.21 ms**, samples **62**, then hit known teardown status **134** after metrics. The p50 gate failed, so `USE_SINGLE_NODE_DIRECT_COMBINE` is back to false. +- **Result**: failed and disabled. Local `cargo fmt -p openinfer-comm -p openinfer-deepseek-v4` passed and local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed. H200 release build passed. 
`$RESULT_ROOT/pplx_direct_worker_mode_olen8.log` completed with status 0, steady avg **145.30 ms**, p50 **144.01 ms**, p95/max **151.61 ms**. `$RESULT_ROOT/pplx_direct_worker_mode_olen64.log` generated all 64 tokens, first decode **239.77 ms**, steady avg **147.61 ms**, p50 **144.00 ms**, p95 **164.00 ms**, max **176.21 ms**, samples **62**, then hit known teardown status **134** after metrics. The p50 gate failed, so `USE_SINGLE_NODE_DIRECT_COMBINE` is back to false. - **Profile result**: `$RESULT_ROOT/pplx_direct_worker_mode_nvtx_olen16.{log,sqlite,nsys-rep}` confirms the new mode really skipped the hot-path barriers: `barrier` ranges dropped from **43344** in `$RESULT_ROOT/pplx_worker_wait_nvtx_olen64.sqlite` to **16** in the direct-worker profile. But the wait moved, not disappeared. Baseline worker p50s were `worker_wait_combine_send_done` **0.003 ms** and `worker_wait_combine_recv_done` **1.111 ms**; direct-worker mode changed them to **0.970 ms** and **0.224 ms** respectively, while `p2p_all_to_all` p50 stayed **1.669 ms** vs baseline **1.609 ms**. This means the worker now waits earlier for grouped-GEMM/direct-kernel readiness rather than later for combine-recv completion. - **Restore validation**: after disabling direct again, H200 release build passed and `$RESULT_ROOT/pplx_direct_mode_restored_olen64.log` generated all 64 tokens with first decode **199.80 ms**, steady avg **144.84 ms**, p50 **144.00 ms**, p95 **156.01 ms**, max **168.22 ms**, samples **62**. It then hit the known teardown segfault after metrics. - **Mechanism lesson**: removing legacy barriers around direct-combine is insufficient because the per-layer p50 budget is not only barrier/worker completion overhead. In the direct path, the worker reaches combine stage before local expert output is ready and waits for `combine_send_done`, so the same per-layer lane time remains visible. 
The next p50 attempt should stop treating `p2p_all_to_all` duration as pure communication overhead and instead correlate `worker_wait_combine_send_done` with grouped FP4/local MoE compute and stream/event handoff. A correct optimization now needs either a cheaper local expert path / fewer grouped rows, or a schedule where the worker is not the serialized lane owner for local expert readiness. @@ -806,7 +806,7 @@ Single-node direct worker early-release experiment: - **Hypothesis**: if direct-worker mode failed because the worker still waited for local expert output readiness (`worker_wait_combine_send_done` p50 **0.970 ms/layer**), then in the direct-combine path the worker can release `tx_ready` immediately after `dispatch_recv_done` and return to the next step. The direct combine kernel remains ordered on `moe_stream` before the next layer's `dispatch_send`, and it owns `sync_counter`/`sync_ptrs` completion without the worker spinning on expert readiness. - **Ceiling estimate**: direct-worker-mode `p2p_all_to_all` p50 was **1.669 ms/layer** and `worker_wait_combine_send_done` p50 was **0.970 ms/layer**. Removing that worker wait has a theoretical ceiling of **~41.7 ms/token** across 43 MoE layers; even half of that passes the **20 ms** p50 gate. - **Keep/revert criterion**: keep only if local/H200 builds pass, H200 `output_len=8` smoke generates all tokens, H200 `output_len=64` generates all tokens with p50 <= **124 ms** and p95 <= **165 ms**, and a follow-up worker-wait profile confirms `p2p_all_to_all` p50 <= **1.1 ms/layer**. Revert on hang, CUDA illegal address, stale sync/counter behavior, p50 regression, p95 regression, or teardown/stop deadlock before metrics. -- **Result**: reverted. The change set `USE_SINGLE_NODE_DIRECT_COMBINE=true` and made direct mode release `tx_ready` immediately after `dispatch_recv_done`, before returning from `WorkerState::step()`. 
Local `cargo fmt -p pegainfer-comm -p pegainfer-deepseek-v4` passed; local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 short smoke `$RESULT_ROOT/pplx_direct_worker_early_release_olen8.log` generated 8 tokens with steady avg **146.73 ms**, p50 **144.00 ms**, p95/max **164.07 ms**, then hit the known teardown segfault after metrics. The full gate `$RESULT_ROOT/pplx_direct_worker_early_release_olen64.log` timed out with status **124** before any forward metric. The experiment was removed and `USE_SINGLE_NODE_DIRECT_COMBINE=false` restored; H200 release build passed and `$RESULT_ROOT/pplx_after_early_release_revert_olen8.log` generated 8 tokens with steady avg **144.30 ms**, p50 **144.03 ms**, p95/max **151.98 ms**. +- **Result**: reverted. The change set `USE_SINGLE_NODE_DIRECT_COMBINE=true` and made direct mode release `tx_ready` immediately after `dispatch_recv_done`, before returning from `WorkerState::step()`. Local `cargo fmt -p openinfer-comm -p openinfer-deepseek-v4` passed; local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 short smoke `$RESULT_ROOT/pplx_direct_worker_early_release_olen8.log` generated 8 tokens with steady avg **146.73 ms**, p50 **144.00 ms**, p95/max **164.07 ms**, then hit the known teardown segfault after metrics. The full gate `$RESULT_ROOT/pplx_direct_worker_early_release_olen64.log` timed out with status **124** before any forward metric. The experiment was removed and `USE_SINGLE_NODE_DIRECT_COMBINE=false` restored; H200 release build passed and `$RESULT_ROOT/pplx_after_early_release_revert_olen8.log` generated 8 tokens with steady avg **144.30 ms**, p50 **144.03 ms**, p95/max **151.98 ms**. 
- **Mechanism lesson**: direct-combine completion cannot be detached from the worker simply by releasing `tx_ready` after dispatch. Short runs can survive, but longer decode eventually wedges, which means the worker still owns part of the per-layer lifetime beyond send-buffer reuse. The next state-machine change needs an explicit completion acknowledgment or a persistent GPU progress design; early host release alone is not a safe scheduling model. PPLX routed-MoE ceiling experiment: @@ -814,18 +814,18 @@ PPLX routed-MoE ceiling experiment: - **Hypothesis**: if the 144 ms p50 is dominated by PPLX routed-MoE composition, this fake shared-only run should drop near the NCCL 60 ms class. If it stays far above 100 ms, the next bottleneck is outside routed MoE and PPLX state-machine work has a lower ceiling. - **Ceiling estimate**: current PPLX p50 **144 ms** vs NCCL p50 **63 ms** leaves **~81 ms/token**. Removing all routed-MoE PPLX work is the maximum possible PPLX-side win; any real implementation has a lower ceiling. - **Keep/revert criterion**: never keep the code path. Record the number, then restore the real routed-MoE path and verify local build health. -- **Result**: reverted. The first remote run accidentally synced `moe_pplx.rs` to the repository root and reproduced the baseline p50 **144.00 ms**; the real source was then synced to `pegainfer-deepseek-v4/src/runtime/moe_pplx.rs` and grep verified `PPLX_SHARED_ONLY_CEILING=true`. H200 `$RESULT_ROOT/pplx_shared_only_ceiling_real_olen64.log` generated all 64 tokens, then hit the known teardown segfault after metrics. Metrics: first decode **30.31 ms**, steady TPOT avg **21.69 ms**, p50 **21.84 ms**, p95 **24.27 ms**, max **25.74 ms**, samples **62**. The output is intentionally invalid, but the performance bound is decisive: removing routed MoE/PPLX work drops far below the NCCL p50 target, so the remaining 144 ms p50 is overwhelmingly in routed-MoE/PPLX composition. 
The code was restored locally and remotely; local release check passed, remote grep confirmed `PPLX_SHARED_ONLY_CEILING` is absent, remote release build passed, and `$RESULT_ROOT/pplx_restored_after_shared_ceiling_olen8.log` returned to the real path with steady p50 **144.03 ms**. The useful optimization direction is a real single-node routed path that avoids the current four PPLX cooperative kernels plus worker state-machine cadence, not attention/sampling/logits work. +- **Result**: reverted. The first remote run accidentally synced `moe_pplx.rs` to the repository root and reproduced the baseline p50 **144.00 ms**; the real source was then synced to `openinfer-deepseek-v4/src/runtime/moe_pplx.rs` and grep verified `PPLX_SHARED_ONLY_CEILING=true`. H200 `$RESULT_ROOT/pplx_shared_only_ceiling_real_olen64.log` generated all 64 tokens, then hit the known teardown segfault after metrics. Metrics: first decode **30.31 ms**, steady TPOT avg **21.69 ms**, p50 **21.84 ms**, p95 **24.27 ms**, max **25.74 ms**, samples **62**. The output is intentionally invalid, but the performance bound is decisive: removing routed MoE/PPLX work drops far below the NCCL p50 target, so the remaining 144 ms p50 is overwhelmingly in routed-MoE/PPLX composition. The code was restored locally and remotely; local release check passed, remote grep confirmed `PPLX_SHARED_ONLY_CEILING` is absent, remote release build passed, and `$RESULT_ROOT/pplx_restored_after_shared_ceiling_olen8.log` returned to the real path with steady p50 **144.03 ms**. The useful optimization direction is a real single-node routed path that avoids the current four PPLX cooperative kernels plus worker state-machine cadence, not attention/sampling/logits work. Single-node peer-memory routed path groundwork: - **Target metric**: behavior-preserving setup change only. Existing PPLX `output_len=8` smoke should still generate all tokens and stay in the real-path p50 **144 ms** class. This patch does not claim a TPOT win. 
- **Change**: `EnablePplx` now returns a `PplxPeerScratchPtrs` bundle instead of only `expert_out`. Each rank installs peer pointer tables for `expert_out`, `expanded_input`, `recv_tokens_per_expert`, `expert_indptr`, and the EP backend's full `num_routed` table into `MoePplxScratch`. Existing direct-combine keeps consuming `peer_expert_out_ptrs`; the new pointer tables are dormant until a direct-dispatch kernel lands. - **Why**: a correct single-node direct dispatch should run on sender ranks, read local `input + route_indices`, and write directly into peer `expanded_input` plus peer per-expert counters. That requires persistent peer destination pointers; trying to ask for peer input pointers per layer would fight the rank-worker ownership model. -- **Validation**: local `cargo fmt -p pegainfer-comm -p pegainfer-deepseek-v4`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, and `git diff --check` passed. H200 release build passed. H200 `$RESULT_ROOT/pplx_peer_ptr_tables_olen8.log` generated all 8 tokens on the real path: first decode **255.71 ms**, steady avg **143.96 ms**, p50 **144.01 ms**, p95/max **144.02 ms**, then hit the known teardown segfault after metrics. After adding the `num_routed` table, H200 `$RESULT_ROOT/pplx_peer_num_routed_tables_olen8.log` generated all 8 tokens with first decode **219.85 ms**, steady avg **142.31 ms**, p50 **144.00 ms**, p95/max **146.10 ms**, then hit the same known teardown segfault after metrics. This validates the peer pointer table expansion as behavior-preserving groundwork. +- **Validation**: local `cargo fmt -p openinfer-comm -p openinfer-deepseek-v4`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, and `git diff --check` passed. H200 release build passed. 
H200 `$RESULT_ROOT/pplx_peer_ptr_tables_olen8.log` generated all 8 tokens on the real path: first decode **255.71 ms**, steady avg **143.96 ms**, p50 **144.01 ms**, p95/max **144.02 ms**, then hit the known teardown segfault after metrics. After adding the `num_routed` table, H200 `$RESULT_ROOT/pplx_peer_num_routed_tables_olen8.log` generated all 8 tokens with first decode **219.85 ms**, steady avg **142.31 ms**, p50 **144.00 ms**, p95/max **146.10 ms**, then hit the same known teardown segfault after metrics. This validates the peer pointer table expansion as behavior-preserving groundwork. Single-node peer-memory direct routed experiment: - **Target metric**: H200 `output_len=64` must generate all 64 tokens and beat the pre-written gate: steady p50 <= **124 ms** and p95 <= **165 ms**. This is a correctness-path experiment, unlike the fake shared-only ceiling run. -- **Change**: added `a2a_direct_dispatch` to `pegainfer-comm-a2a-kernels`, exposed it through `AllToAllContext::direct_dispatch` and `EpBackend::direct_dispatch`, and hard-coded `USE_SINGLE_NODE_DIRECT_ROUTED=true` in `moe_pplx.rs`. The kernel runs on sender ranks, counts local routes, writes each source row into every peer's `num_routed` table, builds the destination rank's `recv_tokens_per_expert` and padded `expert_indptr`, writes routed BF16 activations directly into peer `expanded_input`, and advances the existing `sync_counter/sync_ptrs` protocol so the existing direct-combine kernel can read peer `expert_out` by the same `base + source-prefix + token_offset` formula. -- **Validation**: local `cargo fmt -p pegainfer-comm -p pegainfer-deepseek-v4`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, and `git diff --check` passed. H200 release build passed. 
H200 `$RESULT_ROOT/pplx_direct_routed_olen8.log` generated all 8 tokens: first decode **140.19 ms**, steady avg **87.89 ms**, p50 **89.32 ms**, p95/max **92.17 ms**, then hit the known teardown segfault after metrics. H200 `$RESULT_ROOT/pplx_direct_routed_olen64.log` generated all 64 tokens: first decode **152.45 ms**, steady avg **86.05 ms**, p50 **83.94 ms**, p95 **94.12 ms**, p99 **103.54 ms**, max **107.80 ms**, then hit the same teardown segfault after metrics. +- **Change**: added `a2a_direct_dispatch` to `openinfer-comm-a2a-kernels`, exposed it through `AllToAllContext::direct_dispatch` and `EpBackend::direct_dispatch`, and hard-coded `USE_SINGLE_NODE_DIRECT_ROUTED=true` in `moe_pplx.rs`. The kernel runs on sender ranks, counts local routes, writes each source row into every peer's `num_routed` table, builds the destination rank's `recv_tokens_per_expert` and padded `expert_indptr`, writes routed BF16 activations directly into peer `expanded_input`, and advances the existing `sync_counter/sync_ptrs` protocol so the existing direct-combine kernel can read peer `expert_out` by the same `base + source-prefix + token_offset` formula. +- **Validation**: local `cargo fmt -p openinfer-comm -p openinfer-deepseek-v4`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, and `git diff --check` passed. H200 release build passed. H200 `$RESULT_ROOT/pplx_direct_routed_olen8.log` generated all 8 tokens: first decode **140.19 ms**, steady avg **87.89 ms**, p50 **89.32 ms**, p95/max **92.17 ms**, then hit the known teardown segfault after metrics. H200 `$RESULT_ROOT/pplx_direct_routed_olen64.log` generated all 64 tokens: first decode **152.45 ms**, steady avg **86.05 ms**, p50 **83.94 ms**, p95 **94.12 ms**, p99 **103.54 ms**, max **107.80 ms**, then hit the same teardown segfault after metrics. - **Result**: keep for the next profiling pass. 
The p50 gate passed by **~60 ms/token** versus the old **144.00 ms** PPLX p50, and p95 moved from the previous **~160 ms** class to **94.12 ms**. The remaining gap to NCCL p50 **~63 ms** is now about **21 ms/token**; the next evidence should come from a direct-path CUDA+NVTX profile, not the old worker-wait profile. Direct routed follow-up tightening: @@ -848,17 +848,17 @@ Direct active-peer sync attempts: GPU-only compact grouped attempt: - **Change**: added compact scratch buffers and two CUDA wrappers to compact padded `expanded_input` into an unpadded layout, run grouped FP4 with host rows `world_size * num_tokens * topk` (**48** for bs=1/EP8/topk6), then scatter `compact_out` back to padded `expert_out` so direct combine could keep its address formula. -- **Validation**: local `cargo fmt`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, `git diff --check`, H200 release build, and H200 `$RESULT_ROOT/pplx_compact_grouped_olen8.log` all passed. The smoke generated 8/8 tokens with p50 **86.51 ms**. +- **Validation**: local `cargo fmt`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, `git diff --check`, H200 release build, and H200 `$RESULT_ROOT/pplx_compact_grouped_olen8.log` all passed. The smoke generated 8/8 tokens with p50 **86.51 ms**. - **Gate result**: H200 `$RESULT_ROOT/pplx_compact_grouped_olen64.log` generated all 64 tokens but steady p50 regressed to **84.00 ms**, p95 **97.38 ms**, max **104.21 ms**. This missed the **<=70 ms** p50 / **<=92 ms** p95 gate and was reverted. The result says the extra compact/scatter launches and fixed API/scheduling cost exceed the benefit of reducing grouped rows from 512 to 48 in this shape. 
Direct combine on compute stream attempt: - **Change**: left direct dispatch on `moe_stream` so it could still overlap with shared expert, but moved `direct_combine_recv` to `ctx.stream`, removing the direct-path `expert_handoff` and `combine_handoff` event pair. -- **Validation**: local `cargo fmt`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, `git diff --check`, and H200 release build passed. +- **Validation**: local `cargo fmt`, local `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving`, `git diff --check`, and H200 release build passed. - **Gate result**: H200 `$RESULT_ROOT/pplx_direct_combine_ctx_stream_olen64.log` generated all 64 tokens with p50 **77.11 ms**, p95 **90.41 ms**, p99 **92.13 ms**, max **103.39 ms**. This is within rows512 noise and missed the prewritten **<=74 ms** p50 gate, so it was reverted. The result says the two event handoffs after grouped FP4 are not a large p50 owner by themselves. Rows512 direct clean PPLX vs NCCL profile: -- **PPLX command/profile**: `/usr/local/cuda-12.9/bin/nsys profile --trace=cuda,nvtx,osrt --sample=none --cuda-event-trace=false --cuda-flush-interval=100 --force-overwrite=true --stats=false -o $RESULT_ROOT/pplx_rows512_narrow_olen16 env PEGAINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 16 --warmup 0 --iters 1`. Artifacts: `$RESULT_ROOT/pplx_rows512_narrow_olen16.log`, `$RESULT_ROOT/pplx_rows512_narrow_olen16.nsys-rep`, `$RESULT_ROOT/pplx_rows512_narrow_olen16.sqlite`. It generated 16/16 tokens with steady avg **80.76 ms**, p50 **79.08 ms**, p95 **86.74 ms**, max **91.02 ms**. Kernel rows survived on all 8 devices: **38,871** per GPU overall and **34,146** per GPU in the steady window. 
-- **NCCL comparison**: same command without `PEGAINFER_DSV4_PPLX=1`, artifacts `$RESULT_ROOT/nccl_clean_compare_olen16.log`, `$RESULT_ROOT/nccl_clean_compare_olen16.nsys-rep`, `$RESULT_ROOT/nccl_clean_compare_olen16.sqlite`. It generated 16/16 tokens and hit the known teardown segfault after metrics, with steady avg **63.84 ms**, p50 **63.17 ms**, p95 **66.01 ms**, max **69.82 ms**. The NCCL sqlite lost some per-device kernel rows during teardown, so use its runtime/NVTX rank data for comparison rather than full 8-GPU kernel accounting. +- **PPLX command/profile**: `/usr/local/cuda-12.9/bin/nsys profile --trace=cuda,nvtx,osrt --sample=none --cuda-event-trace=false --cuda-flush-interval=100 --force-overwrite=true --stats=false -o $RESULT_ROOT/pplx_rows512_narrow_olen16 env OPENINFER_DSV4_PPLX=1 NCCL_NVLS_ENABLE=0 ./target/release/bench_serving --model-path $MODEL_DIR request --prompt-len 1 --output-len 16 --warmup 0 --iters 1`. Artifacts: `$RESULT_ROOT/pplx_rows512_narrow_olen16.log`, `$RESULT_ROOT/pplx_rows512_narrow_olen16.nsys-rep`, `$RESULT_ROOT/pplx_rows512_narrow_olen16.sqlite`. It generated 16/16 tokens with steady avg **80.76 ms**, p50 **79.08 ms**, p95 **86.74 ms**, max **91.02 ms**. Kernel rows survived on all 8 devices: **38,871** per GPU overall and **34,146** per GPU in the steady window. +- **NCCL comparison**: same command without `OPENINFER_DSV4_PPLX=1`, artifacts `$RESULT_ROOT/nccl_clean_compare_olen16.log`, `$RESULT_ROOT/nccl_clean_compare_olen16.nsys-rep`, `$RESULT_ROOT/nccl_clean_compare_olen16.sqlite`. It generated 16/16 tokens and hit the known teardown segfault after metrics, with steady avg **63.84 ms**, p50 **63.17 ms**, p95 **66.01 ms**, max **69.82 ms**. The NCCL sqlite lost some per-device kernel rows during teardown, so use its runtime/NVTX rank data for comparison rather than full 8-GPU kernel accounting. - **Rank-lane accounting**: PPLX rank0-like decode p50 **78.334 ms** versus NCCL rank0-like p50 **62.857 ms**, gap **15.477 ms**. 
On the same rank0-like lane, PPLX launch API p50 **36.307 ms** versus NCCL **27.521 ms** (gap **8.786 ms**), and PPLX final D2H/drain p50 **32.651 ms** versus NCCL **25.964 ms** (gap **6.687 ms**). These two gaps sum to **15.473 ms**, matching the rank0 decode p50 gap. Non-rank0 PPLX decode p50 median is **43.656 ms**; NCCL non-rank0 p50 median is **36.495 ms**. - **PPLX steady runtime API**: `cudaLaunchKernel_v7000` dominates by total time with **239,568** calls totaling **3913.162 ms**; `cuMemcpyDtoHAsync_v2` has **14** calls totaling **469.559 ms** and is the final queue drain; `cuLaunchKernelEx` totals **76.816 ms** and `cudaLaunchCooperativeKernel_v9000` totals **59.689 ms**. Event waits/records are small at this profile granularity compared with launch and drain. - **Launch owners**: PPLX launch API time is mostly in HC / GEMV / TileLang / grouped wrappers, not the direct kernels. Top correlated launch totals include `deepseek_hc_bf16_to_f32_kernel` **574.6 ms**, cuBLAS `gemvx::kernel...` **444.3 ms**, `deepseek_hc_scale_mixes_block_kernel` **343.7 ms**, `deepseek_tilelang_fp8_gemm_n4096_k1024_kernel` **272.9 ms**, and `deepseek_hc_pre_norm_from_mixes_kernel` **256.0 ms**. Direct dispatch launch total is **37.0 ms** and direct combine launch total is **22.6 ms** across all rank threads in the steady window. @@ -905,14 +905,14 @@ Current direct route-position reuse experiment: - **Hypothesis**: `a2a_direct_dispatch_kernel` already computes the exact `(source_rank, padded position)` for each local token route when writing peer `expanded_input`. `a2a_direct_combine_recv_kernel` recomputes the same base/prefix position from `num_routed` and `token_offset` before reading peer `expert_out`. Persisting dispatch's per-route `position/source_rank` in direct workspace and feeding it to direct combine removes the duplicated metadata pass and shared-memory position staging. 
- **Ceiling estimate**: clean rows512 rank0 steady profile shows direct combine kernel body **320.282 ms** and direct dispatch kernel body **158.331 ms** across 14 rank0 steady steps. This change targets direct combine body work only; expected p50 gain is keepable only if the removed position calculation shifts wall-clock by **>=5 ms/token** or materially reduces final drain. - **Keep/revert criterion**: keep only if local/H200 release builds pass, PPLX `output_len=64` generates all 64 tokens with p50 improved by **>=5 ms/token** and p95 <= **92 ms**. Revert on build failure, CUDA error, hang/timeout, wrong-looking output, p50 movement under the gate, or p95 regression. -- **Result**: reverted. Local `git diff --check` and `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 smoke `$RESULT_ROOT/pplx_routepos_reuse_olen8.log` generated 8/8 tokens with steady p50 **74.41 ms**. Full gates generated all 64 tokens twice: `$RESULT_ROOT/pplx_routepos_reuse_olen64.log` measured first decode **140.74 ms**, steady avg **76.95 ms**, p50 **74.52 ms**, p95 **87.90 ms**, p99 **89.30 ms**, max **98.87 ms**; `$RESULT_ROOT/pplx_routepos_reuse_olen64_r2.log` measured first decode **129.11 ms**, steady avg **78.23 ms**, p50 **74.54 ms**, p95 **89.30 ms**, p99 **89.74 ms**, max **89.82 ms**. Teardown hit the known status 139 segfault after metrics. The result is a repeatable 2.8-4.6 ms p50 improvement and healthier p95, but it misses the prewritten **>=5 ms/token** p50 gate, so the code was removed and H200 was rebuilt after revert. Lesson: duplicated direct-combine position calculation is real but too small alone; the next retained change has to merge a larger direct-side stage or reduce launch/queue depth. +- **Result**: reverted. 
Local `git diff --check` and `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 smoke `$RESULT_ROOT/pplx_routepos_reuse_olen8.log` generated 8/8 tokens with steady p50 **74.41 ms**. Full gates generated all 64 tokens twice: `$RESULT_ROOT/pplx_routepos_reuse_olen64.log` measured first decode **140.74 ms**, steady avg **76.95 ms**, p50 **74.52 ms**, p95 **87.90 ms**, p99 **89.30 ms**, max **98.87 ms**; `$RESULT_ROOT/pplx_routepos_reuse_olen64_r2.log` measured first decode **129.11 ms**, steady avg **78.23 ms**, p50 **74.54 ms**, p95 **89.30 ms**, p99 **89.74 ms**, max **89.82 ms**. Teardown hit the known status 139 segfault after metrics. The result is a repeatable 2.8-4.6 ms p50 improvement and healthier p95, but it misses the prewritten **>=5 ms/token** p50 gate, so the code was removed and H200 was rebuilt after revert. Lesson: duplicated direct-combine position calculation is real but too small alone; the next retained change has to merge a larger direct-side stage or reduce launch/queue depth. Current reusable PPLX handoff events experiment: - **Target metric**: H200 rows512 PPLX `output_len=64` steady p50 should improve by at least **5 ms/token** from the retained **77-79 ms** baseline, p95 should stay <= **92 ms**, and all 64 tokens must be generated. NCCL path is untouched; rebuild validation is enough unless shared code moves. - **Hypothesis**: `moe_pplx.rs` creates a fresh CUDA event for each explicit stream handoff (`route`, `direct_dispatch`, `indptr`, `expert`, `combine`) via `CudaStream::record_event(Some(DISABLE_TIMING))`. The clean rows512 decode window shows `cuEventCreate` **20640** calls / **48.418 ms** and `cuEventDestroy` **20640** calls / **10.168 ms**, while `cudaEventRecord/cuEventRecord/cuStreamWaitEvent` remain separate. 
Preallocating the handoff events in `MoePplxScratch` and re-recording them each layer should remove event create/destroy fanout without changing stream ordering. - **Ceiling estimate**: The measured create/destroy total is about **58.6 ms** over the profile's decode window across all rank threads. The keep gate still requires **>=5 ms/token** p50 because API totals across ranks do not directly translate to request p50; this is only worthwhile if event allocation contributes to the launch/queue tail. - **Keep/revert criterion**: keep only if local/H200 release builds pass, PPLX `output_len=64` generates all 64 tokens with p50 improved by **>=5 ms/token** and p95 <= **92 ms**. Revert on build failure, CUDA event/stream error, hang/timeout, wrong-looking output, insufficient p50 movement, or p95 regression. -- **Result**: reverted. Local `git diff --check` and `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p pegainfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 smoke `$RESULT_ROOT/pplx_reuse_events_olen8.log` generated 8/8 tokens with steady p50 **77.55 ms**. Full gate `$RESULT_ROOT/pplx_reuse_events_olen64.log` generated 64/64 tokens with first decode **149.52 ms**, steady avg **80.20 ms**, p50 **77.09 ms**, p95 **90.45 ms**, p99 **91.70 ms**, max **91.84 ms**; teardown hit the known status 139 segfault after metrics. The p50 stayed in the retained rows512 baseline band, so removing event create/destroy alone is not enough. Lesson: the event allocation calls show up in API totals, but the request p50 is governed by queued work and waits that remain after event reuse. +- **Result**: reverted. Local `git diff --check` and `PATH=/usr/local/cuda/bin:$PATH cargo check --release -p openinfer-server --features deepseek-v4,pplx-ep --bin bench_serving` passed; H200 release build passed. H200 smoke `$RESULT_ROOT/pplx_reuse_events_olen8.log` generated 8/8 tokens with steady p50 **77.55 ms**. 
Full gate `$RESULT_ROOT/pplx_reuse_events_olen64.log` generated 64/64 tokens with first decode **149.52 ms**, steady avg **80.20 ms**, p50 **77.09 ms**, p95 **90.45 ms**, p99 **91.70 ms**, max **91.84 ms**; teardown hit the known status 139 segfault after metrics. The p50 stayed in the retained rows512 baseline band, so removing event create/destroy alone is not enough. Lesson: the event allocation calls show up in API totals, but the request p50 is governed by queued work and waits that remain after event reuse. Current direct combine ctx-stream plus route-position experiment: - **Target metric**: H200 rows512 PPLX `output_len=64` steady p50 should improve by at least **5 ms/token** from the retained **77-79 ms** baseline, p95 should stay <= **92 ms**, and all 64 tokens must be generated. NCCL path is untouched; rebuild validation is enough unless shared runtime code moves. @@ -931,7 +931,7 @@ Current direct combine ctx-stream plus route-position experiment: | Hadamard+FP4 single-pair fusion closes the ratio4 tail | killed | Fused serial kernel built and completed H200 smoke, but serving p95 regressed **184.00 -> 216.01 ms** while p50 stayed **144.03 ms**. | Re-open only for a larger fused ratio4 boundary operator that removes several launches and shows request-level p50/p95 improvement over >=16 decode steps. | | Fine-grained local CUDA Graph islands can close the non-communication gap | killed as implemented | H200 `output_len=24` graph-island run completed but steady p50 regressed to **154.78 ms**. Decode-only profile shows 1056 graph instantiates totaling **944.08 ms**, **23232** graph launches totaling **421.25 ms**, launch-class calls/step down only about **8%**, and event wait/record counts essentially unchanged. | Re-open only for a larger static island that removes explicit stream handoff boundaries and graph-launch fanout, with a pre-written gate of at least 15 ms p50 improvement over >=16 decode steps. 
| | Rank/a2a worker CPU overlap drives p95/max tail | alive, mitigation kept | H200 clean log had exact CPU overlap on CPU **6**; separating rank worker and pplx a2a worker pools cut `output_len=64` p95 **216.01 -> 159.96/162.86 ms** and max **300.01 -> 164.00/168.01 ms** across two runs, with p50 unchanged at **144.00 ms**. | Keep the pool split. Do not claim p50 progress from it; next p50 work needs a different mechanism. | -| Rank0 TE/fabric worker on CPU0 drives the legacy p50 floor | alive, mitigation kept | `/proc` decode-window sampling before the fix showed CPU0 `tx_engine_domain` pinned correctly but only got **3602 ms** runtime over a **7012 ms** sample and took **2980** nonvoluntary switches, while fabric workers on CPU24/48/72 got ~**7008-7012 ms** runtime and **9-17** nonvoluntary switches. Moving rank0 TE off CPU0 produced two H200 `output_len=64` runs with steady p50 **66.46 / 66.70 ms**, p95 **69.80 / 69.62 ms**, max **71.48 / 71.89 ms**. The corrected `/proc` sample saw CPU10 `tx_engine_domain` runtime delta **3452 ms** and only **4** nonvoluntary switches. Logs: `$RESULT_ROOT/pplx_te_repin_olen64.log`, `$RESULT_ROOT/pplx_te_repin_olen64_r2.log`, `$RESULT_ROOT/pplx_te_repin_olen64_r2_proc_summary.txt`. Cleanup first introduced a per-rank placement plan, then a later review found topology-group role selection could collide with rank workers (`rank0 a2a/TE/UVM` on CPUs already used by rank1/2/3 workers). Current code moves the generic pieces to `pegainfer_core::cpu_topology`: read CUDA device NUMA, current affinity, and NUMA cpulist; split each NUMA pool into contiguous rank slices; reserve CPU0 for the system and CPU1 for scheduler; assign rank/a2a/TE/UVM roles from that rank's own slice; log `cpu_slice/rank_worker/TE/a2a/UVM` per rank at startup. H200 per-NUMA slice validation showed no CPU collision and measured `output_len=64` steady p50 **66.65 ms**, p95 **68.15 ms**, max **69.47 ms** before the known teardown segfault. 
| Keep CPU0/CPU1 reservation and per-NUMA rank slices. Validate future placement changes with startup logs, TPOT, and `/proc/$PID/sched` deltas; do not rely on topology-group CPU order alone. | +| Rank0 TE/fabric worker on CPU0 drives the legacy p50 floor | alive, mitigation kept | `/proc` decode-window sampling before the fix showed CPU0 `tx_engine_domain` pinned correctly but only got **3602 ms** runtime over a **7012 ms** sample and took **2980** nonvoluntary switches, while fabric workers on CPU24/48/72 got ~**7008-7012 ms** runtime and **9-17** nonvoluntary switches. Moving rank0 TE off CPU0 produced two H200 `output_len=64` runs with steady p50 **66.46 / 66.70 ms**, p95 **69.80 / 69.62 ms**, max **71.48 / 71.89 ms**. The corrected `/proc` sample saw CPU10 `tx_engine_domain` runtime delta **3452 ms** and only **4** nonvoluntary switches. Logs: `$RESULT_ROOT/pplx_te_repin_olen64.log`, `$RESULT_ROOT/pplx_te_repin_olen64_r2.log`, `$RESULT_ROOT/pplx_te_repin_olen64_r2_proc_summary.txt`. Cleanup first introduced a per-rank placement plan, then a later review found topology-group role selection could collide with rank workers (`rank0 a2a/TE/UVM` on CPUs already used by rank1/2/3 workers). Current code moves the generic pieces to `openinfer_core::cpu_topology`: read CUDA device NUMA, current affinity, and NUMA cpulist; split each NUMA pool into contiguous rank slices; reserve CPU0 for the system and CPU1 for scheduler; assign rank/a2a/TE/UVM roles from that rank's own slice; log `cpu_slice/rank_worker/TE/a2a/UVM` per rank at startup. H200 per-NUMA slice validation showed no CPU collision and measured `output_len=64` steady p50 **66.65 ms**, p95 **68.15 ms**, max **69.47 ms** before the known teardown segfault. | Keep CPU0/CPU1 reservation and per-NUMA rank slices. Validate future placement changes with startup logs, TPOT, and `/proc/$PID/sched` deltas; do not rely on topology-group CPU order alone.
| | Route all-gather alone explains the 144 ms p50 floor | killed | Intra-process route exchange skipped `route_write_op + route_counter.wait` through a process-local barrier and direct peer `num_routed` pointer reads. H200 `output_len=64` completed all tokens but measured p50 **144.00 ms**, p95 **155.95 ms**, matching the CPU-pool p50 baseline. | Do not spend more patches on isolated route exchange. Re-open only as part of a full single-node state-machine replacement that reduces per-layer a2a union p50 by at least 20 ms. | | Grouped FP4 unused capacity explains the 144 ms p50 floor | killed | bs=1 capacity clamp reduced theoretical pplx grouped row bound from **1376** to **560** rows and completed H200 `output_len=64`, but p50 remained **144.00 ms** with p95 **155.74 ms**. | Do not pursue host-side capacity clamps as a p50 fix. Re-open only with a profile proving grouped FP4 kernels, not a2a/driver synchronization, own at least 15 ms p50. | | libcuda driver lock/contention explains non-communication operator spikes | alive as tail source, not p50 owner | Decode-only `--cudabacktrace=all:1000` + OSRT profile shows rank-thread `pthread_mutex_lock` total **782.16 ms** and max **108.95 ms**. The largest mutex stacks are `libcuda -> cuModuleGetFunction/cuLibraryGetModule -> cudaLaunchCooperativeKernel -> a2a_dispatch_send`, while non-communication `cudaLaunchKernel` tails show 16-30 ms API time for microsecond-scale kernels inside ratio4/HC ranges. NVTX-only `output_len=64` shows attention-local medians are sub-ms and do not explain the **144 ms** p50 floor. | Keep using API-vs-kernel profiles for tails. Do not spend more p50 experiments on single tiny operator launches unless a step-correlated profile shows at least 15 ms p50 ownership. 
| @@ -978,9 +978,9 @@ Current direct combine ctx-stream plus route-position experiment: - 同一窗口里其它设备经常还在跑 `a2a_dispatch_recv`、`a2a_combine_send`、`a2a_combine_recv`。这解释了为什么 direct-worker-mode 只把 wait 从 `combine_recv_done` 移到 `combine_send_done`:worker 仍然是每层本地 MoE readiness 的序列化观察者。 - single-node direct routed path 把 p50 从 **144.00 ms** 降到 **83.94 ms**,rows512 再降到 **77-79 ms**。这证明旧 floor 大部分来自 legacy worker/cooperative-kernel cadence,但该 path 绕过 upstream 四阶段语义,已从代码移除。 4. **当前保留实现回到 legacy four-stage** - - direct routed path 的实验价值是定位机制,不是要把 bypass 留在 `pegainfer-comm`。现在保留的是 legacy `dispatch_send -> dispatch_recv -> combine_send -> combine_recv` 路径,以及 per-NUMA rank-slice placement 修复。 + - direct routed path 的实验价值是定位机制,不是要把 bypass 留在 `openinfer-comm`。现在保留的是 legacy `dispatch_send -> dispatch_recv -> combine_send -> combine_recv` 路径,以及 per-NUMA rank-slice placement 修复。 - CPU placement 修正后,H200 `output_len=64` 两次复测 steady p50 **66.46 / 66.70 ms**、p95 **69.80 / 69.62 ms**,已接近 NCCL **63 ms** 级。剩余方向应先低侵入 profile 新 baseline,而不是继续维护 direct hack。 - - 新 placement 使用 `pegainfer_core::cpu_topology`,把 common CPU list parsing、affinity mask、thread pinning 和 CUDA-device NUMA lookup 从 dsv4 私有 helper 里抽出。本地 gate 覆盖 `cpu_topology` 单测、Rust/CUDA bridge 编译、格式和 diagnostic bench feature 编译;下一次 H200 profile 应基于这个 cleaned legacy path。 + - 新 placement 使用 `openinfer_core::cpu_topology`,把 common CPU list parsing、affinity mask、thread pinning 和 CUDA-device NUMA lookup 从 dsv4 私有 helper 里抽出。本地 gate 覆盖 `cpu_topology` 单测、Rust/CUDA bridge 编译、格式和 diagnostic bench feature 编译;下一次 H200 profile 应基于这个 cleaned legacy path。 5. 
**把 grouped GEMM 的 host rows 上界收回来,但只能 GPU-only** - 旧 GPU indptr 版本用 `expanded_input.seq_capacity()` 作为 grouped GEMM `rows`,会多跑空行的 act-quant / epilogue work。direct routed 后这个浪费重新变得可见,rows512 已实测带来 **5-7 ms/token** p50 收益。 - 每层 D2H 一个 padded-total 标量已实测会把 TPOT 拉坏到 1.54s/token;单独 compact/scatter 也已实测退化到 p50 **84.00 ms**。后续更进一步只能让 grouped path 原生接受 sparse/padded indptr,或把 compact 融进已有 kernel,不能新增两次 per-layer launch。 diff --git a/docs/models/deepseek-v4/prefix-paged-kv-pd-handoff.md b/docs/models/deepseek-v4/prefix-paged-kv-pd-handoff.md index 09fbd73e..d5cb87ac 100644 --- a/docs/models/deepseek-v4/prefix-paged-kv-pd-handoff.md +++ b/docs/models/deepseek-v4/prefix-paged-kv-pd-handoff.md @@ -31,7 +31,7 @@ them. ### Direct KV Ownership -`code-fact`: `pegainfer-deepseek-v4/src/direct/scheduler.rs` owns +`code-fact`: `openinfer-deepseek-v4/src/direct/scheduler.rs` owns `DirectKvCacheManager` and `DirectKvCacheLease`. Current lifecycle: @@ -78,7 +78,7 @@ P-D handoff, or transport-level handles. ### Communication Boundary -`code-fact`: `pegainfer-comm` currently provides EP all-to-all public surface and +`code-fact`: `openinfer-comm` currently provides EP all-to-all public surface and opaque operation handles. It does not yet provide KV transfer or ownership handoff primitives. @@ -375,7 +375,7 @@ field names. `derivation`: P-D handoff requires ownership handles, not transport objects. The handle describes who owns cleanup and which observable signal proves transfer completion or cancellation. It does not choose RDMA, IPC, serialization, or a -specific `pegainfer-comm` operation. +specific `openinfer-comm` operation. ### Export Side @@ -520,7 +520,7 @@ Out of scope: - prefix eviction performance tuning; - production prefix cache policy; - changing HTTP benchmark semantics; -- replacing `pegainfer-comm` or adding KV transfer to it. +- replacing `openinfer-comm` or adding KV transfer to it. 
Merge criteria: @@ -536,5 +536,5 @@ Merge criteria: deliberately does not commit to a value. - Decide whether prefix entries are rank-local only in v1 or require a multi-rank consistency object. -- Define a future `pegainfer-comm` KV-transfer extension only after allocator +- Define a future `openinfer-comm` KV-transfer extension only after allocator handles and cleanup semantics are proven locally. diff --git a/docs/models/deepseek-v4/serving-baseline.md b/docs/models/deepseek-v4/serving-baseline.md index 3255d70c..511af13c 100644 --- a/docs/models/deepseek-v4/serving-baseline.md +++ b/docs/models/deepseek-v4/serving-baseline.md @@ -19,7 +19,7 @@ Use this document as the baseline contract before changing the DeepSeek V4 sched | Capability | Status | Evidence | | --- | --- | --- | -| DeepSeek V4 engine load behind the OpenAI HTTP facade | Available for smoke testing | `pegainfer-server --features deepseek-v4 --bin pegainfer` starts an OpenAI server for `$MODEL_DIR` on 8x RTX 5090 | +| DeepSeek V4 engine load behind the OpenAI HTTP facade | Available for smoke testing | `openinfer-server --features deepseek-v4 --bin openinfer` starts an OpenAI server for `$MODEL_DIR` on 8x RTX 5090 | | `/v1/models` | Available | The returned model id is the full model path: `$MODEL_DIR` | | `/v1/completions` single-request greedy smoke | Available | Prompt `hello`, `max_tokens=4`, `temperature=0` returned a text completion and usage accounting | | Direct single-request TPOT/hash regression | Available | `bench_serving request --prompt-len 1 --output-len 160 --warmup 2 --iters 3 --seed 42` is the retained DeepSeek V4 decode gate | @@ -34,21 +34,21 @@ Run these commands from any checkout at or after PR #101's merge commit `d6d2cee Build the HTTP server on the 5090 host: ```bash -cd /path/to/pegainfer +cd /path/to/openinfer export PATH=/usr/local/cuda-13.1/bin:$PWD/.venv/bin:$PATH export CUDA_HOME=/usr/local/cuda-13.1 -export PEGAINFER_TILELANG_PYTHON=$PWD/.venv/bin/python -export 
PEGAINFER_TRITON_PYTHON=$PWD/.venv/bin/python -export PEGAINFER_NVCC_JOBS=8 -export CARGO_TARGET_DIR=/path/to/pegainfer-target +export OPENINFER_TILELANG_PYTHON=$PWD/.venv/bin/python +export OPENINFER_TRITON_PYTHON=$PWD/.venv/bin/python +export OPENINFER_NVCC_JOBS=8 +export CARGO_TARGET_DIR=/path/to/openinfer-target -cargo build --release -p pegainfer-server --features deepseek-v4 --bin pegainfer +cargo build --release -p openinfer-server --features deepseek-v4 --bin openinfer ``` Start the HTTP endpoint: ```bash -$CARGO_TARGET_DIR/release/pegainfer \ +$CARGO_TARGET_DIR/release/openinfer \ --model-path $MODEL_DIR \ --port 18103 ``` @@ -102,7 +102,7 @@ Observed smoke result: Run the direct single-request decode regression gate: ```bash -cargo run --release -p pegainfer-server \ +cargo run --release -p openinfer-server \ --bin bench_serving \ --features deepseek-v4 \ -- \ diff --git a/docs/models/deepseek-v4/support.md b/docs/models/deepseek-v4/support.md index e924ec99..b490bbb7 100644 --- a/docs/models/deepseek-v4/support.md +++ b/docs/models/deepseek-v4/support.md @@ -9,8 +9,8 @@ This document is the single project record for the initial DeepSeek V4 PR. 
It re The PR scope is: -- add `pegainfer-deepseek-v4` as the model crate for the DeepSeek V4 Flash MP8 checkpoint; -- wire DeepSeek V4 into `pegainfer-server` model detection and `bench_serving`; +- add `openinfer-deepseek-v4` as the model crate for the DeepSeek V4 Flash MP8 checkpoint; +- wire DeepSeek V4 into `openinfer-server` model detection and `bench_serving`; - build official-style DeepSeek V4 TileLang kernels at compile time; - keep runtime Python-free; - provide exact text, operator, and HTTP service validation; @@ -23,7 +23,7 @@ DeepSeek V4 currently requires the `deepseek-v4` Cargo feature and TileLang at b The kernels build script probes: -- `PEGAINFER_TILELANG_PYTHON`, if set; +- `OPENINFER_TILELANG_PYTHON`, if set; - `../.venv/bin/python`; - `.venv/bin/python`; - `python3`; @@ -40,28 +40,28 @@ Minimal setup: uv venv && source .venv/bin/activate uv pip install torch --index-url https://download.pytorch.org/whl/cu128 uv pip install "tilelang==0.1.9" -export PEGAINFER_TILELANG_PYTHON=.venv/bin/python +export OPENINFER_TILELANG_PYTHON=.venv/bin/python ``` -The generated CUDA is linked into `pegainfer-kernels` when the feature is enabled; Python is not needed at runtime. +The generated CUDA is linked into `openinfer-kernels` when the feature is enabled; Python is not needed at runtime. ## Implementation Summary ### Model Crate -`pegainfer-deepseek-v4` owns: +`openinfer-deepseek-v4` owns: - config parsing for DeepSeek V4 MP8; - per-rank weight manifests and GPU loading; - runtime ops for block prefill/decode, HC, sparse attention, routing, compressor state, and final logits; - direct `EngineHandle` integration used by server and tests; -- exact E2E tests driven by `test_data/deepseek-v4-ground-truth.json`, with `PEGAINFER_DEEPSEEK_GT_PATH` available for regenerations. +- exact E2E tests driven by `test_data/deepseek-v4-ground-truth.json`, with `OPENINFER_DEEPSEEK_GT_PATH` available for regenerations. 
The direct engine seeds decode cache from prompt prefill instead of replaying prompt tokens through decode. This made exact validation practical enough for PR use. ### TileLang Kernels -`pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py` generates CUDA sources for official-style DeepSeek V4 kernels: +`openinfer-kernels/tools/tilelang/deepseek_v4/generate.py` generates CUDA sources for official-style DeepSeek V4 kernels: - `act_quant_kernel` - `fp8_gemm_kernel` @@ -104,11 +104,11 @@ All 20 ground-truth cases pass exact text validation as four 5-case slices with Command shape: ```bash -PEGAINFER_DEEPSEEK_GT_OFFSET= \ -PEGAINFER_DEEPSEEK_GT_LIMIT=5 \ -PEGAINFER_DEEPSEEK_GT_MAX_NEW_TOKENS=64 \ -PEGAINFER_TEST_MODEL_PATH=models/DeepSeek-V4-Flash \ -cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test e2e -- --nocapture --exact test_e2e_deepseek_v4_generation +OPENINFER_DEEPSEEK_GT_OFFSET= \ +OPENINFER_DEEPSEEK_GT_LIMIT=5 \ +OPENINFER_DEEPSEEK_GT_MAX_NEW_TOKENS=64 \ +OPENINFER_TEST_MODEL_PATH=models/DeepSeek-V4-Flash \ +cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test e2e -- --nocapture --exact test_e2e_deepseek_v4_generation ``` ### Operator Guards @@ -116,9 +116,9 @@ cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test e2e The full DeepSeek V4 `mp8_manifest` release test passes: ```bash -PEGAINFER_TEST_MODEL_PATH=$MODEL_DIR \ -PEGAINFER_NVCC_JOBS=8 \ -cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest -- --nocapture +OPENINFER_TEST_MODEL_PATH=$MODEL_DIR \ +OPENINFER_NVCC_JOBS=8 \ +cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest -- --nocapture ``` Result: `23 passed`, `0 failed`. 
@@ -127,14 +127,14 @@ Coverage includes MP8 layout accessors, RoPE formula checks, TileLang FP8/FP4 li ### HTTP Service -With `--features deepseek-v4`, `pegainfer-server` detects `model_type="deepseek_v4"` and starts DeepSeek V4 with eight devices and CUDA graph disabled. +With `--features deepseek-v4`, `openinfer-server` detects `model_type="deepseek_v4"` and starts DeepSeek V4 with eight devices and CUDA graph disabled. The initial service path is intentionally greedy-only. Requests that ask for sampling or logprobs are rejected before generation and surfaced through `stop_reason` instead of being silently coerced to greedy. This is a temporary compatibility choice in the vLLM frontend path; a later API cleanup should reject unsupported DeepSeek V4 request parameters during request validation instead of representing them as a completed generation. Server command used for HTTP validation: ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --features deepseek-v4 -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --features deepseek-v4 -- \ --model-path $MODEL_DIR --port 18080 ``` @@ -197,7 +197,7 @@ Earlier exact-request profiling removed the large synchronous `cudaMalloc/cudaFr The current synthetic decode-heavy baseline on 5090-dev is: ```bash -PEGAINFER_NVCC_JOBS=8 cargo run --release -p pegainfer-server --bin bench_serving --features deepseek-v4 -- \ +OPENINFER_NVCC_JOBS=8 cargo run --release -p openinfer-server --bin bench_serving --features deepseek-v4 -- \ --model-path $MODEL_DIR --format json \ request --prompt-len 1 --output-len 32 --warmup 1 --iters 1 ``` @@ -219,12 +219,12 @@ The remaining TPOT problem is structural: decode still launches hundreds of thou ## Workspace Isolation -DeepSeek V4 is a workspace member, but its DeepSeek-specific bins, integration tests, and `pegainfer-kernels/deepseek-v4` dependency are gated behind the `deepseek-v4` feature. 
This keeps default Qwen-oriented workspace checks from requiring TileLang. +DeepSeek V4 is a workspace member, but its DeepSeek-specific bins, integration tests, and `openinfer-kernels/deepseek-v4` dependency are gated behind the `deepseek-v4` feature. This keeps default Qwen-oriented workspace checks from requiring TileLang. Verified: -- `PEGAINFER_NVCC_JOBS=8 cargo check --release --workspace` passed with DeepSeek TileLang disabled in `pegainfer-kernels`. -- `cargo test --release --workspace --lib` passed with DeepSeek TileLang disabled in `pegainfer-kernels`. Qwen model-loading lib tests now skip only when `PEGAINFER_TEST_MODEL_PATH` is unset and the default local model directory is absent; explicitly provided model paths still run normally and fail normally if invalid. +- `OPENINFER_NVCC_JOBS=8 cargo check --release --workspace` passed with DeepSeek TileLang disabled in `openinfer-kernels`. +- `cargo test --release --workspace --lib` passed with DeepSeek TileLang disabled in `openinfer-kernels`. Qwen model-loading lib tests now skip only when `OPENINFER_TEST_MODEL_PATH` is unset and the default local model directory is absent; explicitly provided model paths still run normally and fail normally if invalid. 
## Known Follow-ups @@ -235,7 +235,7 @@ These are intentionally out of the initial PR scope: - add arbitrary-value per-shape TileLang FP8/FP4 parity tests beyond the current power-of-two guards; - profile final logits only if nsys shows it matters; - profile NCCL all-reduce and TileLang FP4 GEMM after the initial PR lands; -- narrow the current public diagnostic surface in `pegainfer-deepseek-v4` after bring-up bins/tests are either retired or moved behind a dedicated test-helper boundary; +- narrow the current public diagnostic surface in `openinfer-deepseek-v4` after bring-up bins/tests are either retired or moved behind a dedicated test-helper boundary; - move unsupported DeepSeek V4 request handling from generation-time `stop_reason` compatibility into frontend request validation; - add an explicit non-panicking shutdown path for NCCL communicator teardown. @@ -243,21 +243,21 @@ These are intentionally out of the initial PR scope: Before opening the PR, keep the required gate focused: -- `cargo fmt --check -p pegainfer-deepseek-v4` -- `cargo check --release -p pegainfer-server` -- `PEGAINFER_NVCC_JOBS=8 cargo check --release -p pegainfer-server --features deepseek-v4` -- `PEGAINFER_TEST_MODEL_PATH=$MODEL_DIR PEGAINFER_NVCC_JOBS=8 cargo test --release -p pegainfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest -- --nocapture` +- `cargo fmt --check -p openinfer-deepseek-v4` +- `cargo check --release -p openinfer-server` +- `OPENINFER_NVCC_JOBS=8 cargo check --release -p openinfer-server --features deepseek-v4` +- `OPENINFER_TEST_MODEL_PATH=$MODEL_DIR OPENINFER_NVCC_JOBS=8 cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test mp8_manifest -- --nocapture` - four exact E2E slices over `test_data/deepseek-v4-ground-truth.json`, using: ```bash -PEGAINFER_TEST_MODEL_PATH=$MODEL_DIR \ -PEGAINFER_DEEPSEEK_GT_OFFSET=<0|5|10|15> \ -PEGAINFER_DEEPSEEK_GT_LIMIT=5 \ -PEGAINFER_DEEPSEEK_GT_MAX_NEW_TOKENS=64 \ -PEGAINFER_NVCC_JOBS=8 \ -cargo test 
--release -p pegainfer-deepseek-v4 --features deepseek-v4 --test e2e -- --nocapture --exact test_e2e_deepseek_v4_generation +OPENINFER_TEST_MODEL_PATH=$MODEL_DIR \ +OPENINFER_DEEPSEEK_GT_OFFSET=<0|5|10|15> \ +OPENINFER_DEEPSEEK_GT_LIMIT=5 \ +OPENINFER_DEEPSEEK_GT_MAX_NEW_TOKENS=64 \ +OPENINFER_NVCC_JOBS=8 \ +cargo test --release -p openinfer-deepseek-v4 --features deepseek-v4 --test e2e -- --nocapture --exact test_e2e_deepseek_v4_generation ``` -- one `/v1/completions` or `/v1/chat/completions` validation through `pegainfer-server`, plus one unsupported-parameter request checking `stop_reason`. +- one `/v1/completions` or `/v1/chat/completions` validation through `openinfer-server`, plus one unsupported-parameter request checking `stop_reason`. Broader workspace checks are valuable, but failures outside the DeepSeek V4 diff should be separated from this initial support PR. diff --git a/docs/models/kimi-k2/accuracy-gate.md b/docs/models/kimi-k2/accuracy-gate.md index ec25fdf4..9fd63164 100644 --- a/docs/models/kimi-k2/accuracy-gate.md +++ b/docs/models/kimi-k2/accuracy-gate.md @@ -1,6 +1,6 @@ # Kimi-K2 accuracy gate (vLLM-golden) -**TL;DR**: `pegainfer-kimi-k2/tests/vllm_golden_gate.rs` + `test_data/kimi-k2.6-vllm-golden.safetensors` give Kimi-K2 its first accuracy gate reproducible from a fresh clone (#223). Reference is vLLM (same INT4 quantized model, marlin kernels), not HF. Two passes through the public serving path: teacher-forced argmax sweep (prefill numerics, regret rule + two-sided |Δlogprob| bound) and free-greedy decode parity (decode kernels, divergence-classified). The TP1/DP8 path emits exact per-token logprobs (#236), so the gate measures both engines' logprobs of the same token, like the Qwen gates. Needs 8 GPUs + K2.6 weights; fails loudly when prerequisites are missing. 
+**TL;DR**: `openinfer-kimi-k2/tests/vllm_golden_gate.rs` + `test_data/kimi-k2.6-vllm-golden.safetensors` give Kimi-K2 its first accuracy gate reproducible from a fresh clone (#223). Reference is vLLM (same INT4 quantized model, marlin kernels), not HF. Two passes through the public serving path: teacher-forced argmax sweep (prefill numerics, regret rule + two-sided |Δlogprob| bound) and free-greedy decode parity (decode kernels, divergence-classified). The TP1/DP8 path emits exact per-token logprobs (#236), so the gate measures both engines' logprobs of the same token, like the Qwen gates. Needs 8 GPUs + K2.6 weights; fails loudly when prerequisites are missing. Last touched: 2026-06 @@ -9,7 +9,7 @@ Last touched: 2026-06 Kimi-K2.6 is INT4 (compressed-tensors, pack-quantized). The general methodology (`docs/subsystems/correctness/logits-golden-gate.md`) uses HF bf16 as golden — for Kimi that is the wrong precision regime: HF decompresses INT4 to bf16 and -runs dense GEMMs, while both vLLM and pegainfer execute the quantized model +runs dense GEMMs, while both vLLM and openinfer execute the quantized model through marlin-style INT4 kernels. vLLM is the closest equal-precision reference, and the same box that runs the gate can regenerate the fixture (vLLM 0.22.0 serves K2.6 out of the box). @@ -27,7 +27,7 @@ asserts through the *real serving path* (DP coordinator → PPLX EP → MLA kernels, TP1/DP8/EP8): 1. **Teacher-forced argmax sweep** (prefill numerics): for every tail position - `i`, prefill `prompt + vllm_tail[..i]` with `max_tokens=1`. pegainfer's + `i`, prefill `prompt + vllm_tail[..i]` with `max_tokens=1`. openinfer's pick must satisfy the flatness-scaled regret rule (see Tolerances): the allowed distance below vLLM's argmax *in vLLM's own logprobs* grows with vLLM's own uncertainty at that position. An aggregate exact-match floor @@ -43,14 +43,14 @@ kernels, TP1/DP8/EP8): bit-identical). 
Both passes additionally bound the **two-sided |Δlogprob|** at exact-match -positions — pegainfer's own logprob of the agreed token against vLLM's +positions — openinfer's own logprob of the agreed token against vLLM's stored one (mean + p99 per pass). Flip positions are excluded from that population on purpose: their Δ is structurally larger (the engines disagree about a flat distribution, which the regret rule already governs), and mixing the populations parks the p99 on the boundary between them — the same run-to-run straddling that killed fixed regret thresholds. Flip-pick Δ is printed for observability. A per-position internal-consistency check -(the pick's logprob must equal the head of pegainfer's own top-K) catches +(the pick's logprob must equal the head of openinfer's own top-K) catches GPU-argmax-vs-host-log-softmax disagreement on the same logits. ## Running it @@ -62,16 +62,16 @@ GPU-argmax-vs-host-log-softmax disagreement on the same logits. --out test_data/kimi-k2.6-vllm-golden.safetensors # Run the gate (8 GPUs; vLLM must be stopped first — both need the full node): -PEGAINFER_TEST_MODEL_PATH=/data/models/Kimi-K2.6 \ -cargo test -p pegainfer-kimi-k2 --features kimi-k2 --release \ +OPENINFER_TEST_MODEL_PATH=/data/models/Kimi-K2.6 \ +cargo test -p openinfer-kimi-k2 --features kimi-k2 --release \ --test vllm_golden_gate -- --nocapture ``` Build env on an H200/H20 node: `PATH` must include `/root/.cargo/bin` and -`/usr/local/cuda/bin`, plus `PEGAINFER_CUDA_SM=90a` and -`PEGAINFER_TRITON_PYTHON` (see `docs/models/kimi-k2/tp1-dp8-ep8-performance.md`). +`/usr/local/cuda/bin`, plus `OPENINFER_CUDA_SM=90a` and +`OPENINFER_TRITON_PYTHON` (see `docs/models/kimi-k2/tp1-dp8-ep8-performance.md`). -There is no silent skip: missing `PEGAINFER_TEST_MODEL_PATH` or a missing +There is no silent skip: missing `OPENINFER_TEST_MODEL_PATH` or a missing fixture panics. (The qwen35 gate's env-gated skip silently reported "ok 0.00s" — this gate deliberately does not.) 
@@ -92,12 +92,12 @@ regret ≤ REGRET_BASE + REGRET_FLATNESS_SLOPE × (−vllm_top1_logprob) = 0.30 + 0.35 × (−vllm_top1_logprob) ``` -where regret = how far pegainfer's pick sits below vLLM's argmax in vLLM's +where regret = how far openinfer's pick sits below vLLM's argmax in vLLM's own logprobs. At a confident position (top-1 ≈ 90%) the bound is ≈ 0.34 nat — near-exact agreement; at a flat multi-modal position (top-1 ≈ 11%) it reaches ≈ 1.07, because there is no single correct token for cross-engine noise to deviate from. The bound depends only on the committed vLLM fixture, -so pegainfer cannot influence its own tolerance. +so openinfer cannot influence its own tolerance. Calibration (three 8×H200 runs, 2026-06-05/06, vLLM 0.22.0 fixture): ~98% of positions match exactly in every pass; every cross-engine diff --git a/docs/models/kimi-k2/bringup-history.md b/docs/models/kimi-k2/bringup-history.md index ee28d80b..598d5e7e 100644 --- a/docs/models/kimi-k2/bringup-history.md +++ b/docs/models/kimi-k2/bringup-history.md @@ -38,11 +38,11 @@ K2.5 与 K2.6 文本架构相同,K2.6 是继续训练版;shape/TP8/EP8 规 - Marlin weight repack(`gptq_marlin_moe_repack`,no-actorder):checkpoint offset-binary `[expert,out,K/8] int32` → Marlin `uint4b8` bias=8 `[expert,K/16,N*2] int32`,总字节不变,**不做 `xor 0x88`**(保留 unsigned nibble)。 - Marlin scale permute(`marlin_moe_permute_scales`):checkpoint `[expert,out,in_group]` → group-major + 64-block `scale_perm` 的 `[expert,in_group,out]`。 - W13 必须在 load/package 阶段 fuse 成 `gate_then_up`(vLLM runtime ABI 不吃独立 gate/up):fused W13 int32 view `[48,448,8192]`,scale `[48,224,4096]`;W2 packed `[48,128,14336]`,scale `[48,64,7168]`。常驻 package 是 fused W13 + W2,gate/up 只是 load-time 临时 buffer。 -- 关键修复(Marlin atomic split-K):vLLM `fused_marlin_moe` 对 W13/W2 都用 `use_atomic_add=False, use_fp32_reduce=True`,走 global F32 `c_tmp` 累加。PegaInfer 早期固定 `use_atomic_add=true` 且不传 `c_tmp`,split-K>1 时 BF16 `atomicAdd` 写 C,累加顺序非确定 → row-state 发散。修复为预分配 `c_tmp` F32 + 关 atomic。H20 单层 
W13/route_output/final 对 vLLM reference `max_diff=0 / mean_diff=0`(real K2.5 rank0 layer1)。 +- 关键修复(Marlin atomic split-K):vLLM `fused_marlin_moe` 对 W13/W2 都用 `use_atomic_add=False, use_fp32_reduce=True`,走 global F32 `c_tmp` 累加。OpenInfer 早期固定 `use_atomic_add=true` 且不传 `c_tmp`,split-K>1 时 BF16 `atomicAdd` 写 C,累加顺序非确定 → row-state 发散。修复为预分配 `c_tmp` F32 + 关 atomic。H20 单层 W13/route_output/final 对 vLLM reference `max_diff=0 / mean_diff=0`(real K2.5 rank0 layer1)。 ### Router scale placement -vLLM `grouped_topk` 返回 **未乘** `routed_scaling_factor` 的 normalized topk weights;`DeepseekV2MoE.forward` 在 routed expert 总输出后整体乘 `2.827`。PegaInfer 早期把 `2.827` 提前乘进 router topk weight 再喂 W2,rounding boundary 与 vLLM 不一致 → 已改为 router 输出 unscaled weights,routed F32 sum/reduce 后整体乘 scale。 +vLLM `grouped_topk` 返回 **未乘** `routed_scaling_factor` 的 normalized topk weights;`DeepseekV2MoE.forward` 在 routed expert 总输出后整体乘 `2.827`。OpenInfer 早期把 `2.827` 提前乘进 router topk weight 再喂 W2,rounding boundary 与 vLLM 不一致 → 已改为 router 输出 unscaled weights,routed F32 sum/reduce 后整体乘 scale。 ### Tokenizer / prompt contract @@ -52,11 +52,11 @@ vLLM `grouped_topk` 返回 **未乘** `routed_scaling_factor` 的 normalized top ## Removed / superseded (tombstones) -- **Expert-major INT4 / CUTLASS example69 path — removed in #234.** Bring-up first built a CUTLASS example69 (Hopper INT4×BF16 grouped GEMM) probe as the routed-expert backend. A focused H20 probe proved it could not express Kimi's `group_size=32` per-K-group scale: example69 reloads scale on a 64-wide K tile (`TileShapeK=64`), so col `32/33` reused group0 scale and col `64` used group1 scale; `TileShapeK=32` hits CUTLASS static assert `K_BLOCK_MAX >= 4`. The path was demoted to a limitation probe and then deleted in #234 — the CUTLASS-era projection kernels/probe (`weight_packed_cutlass_example69`, `weight_shape` tensor loading, the example69 launcher and FFI) are gone. Marlin WNA16 is the only runtime INT4 path. 
`KimiExpertMajorProjectionPlan` (`pegainfer-kimi-k2/src/weights/package.rs`) remains **live** — it describes the EP weight layout, not the dead CUTLASS kernel. `KimiExpertMajorRoute` outlived its callers (DeepEP routing replaced it) and was deleted in the post-#298 dead-code sweep. -- **`weight_shape` GPU load — removed in #234.** It was loaded for 60 MoE layers × 384 experts × 3 projections, validated to `[2]`, then never consumed by any kernel (dims come from manifest constants). Dropping it removes **8,640 tensors** from the load set (`pegainfer-kimi-k2/src/weights/tests.rs` asserts the count). The checkpoint still carries `weight_shape` on disk; the runtime simply no longer reads it. -- **`KIMI_RUNNER_MAX_BATCH = 4` hard-cap — superseded.** Bring-up locked decode at a fixed bs4 wave. The const is now `64` (`pegainfer-kimi-k2/src/runner/scheduler.rs`), with worker decode arenas bucketed `[1, 2, 4, 8, 16, 32, 64]` (`KIMI_DECODE_BATCH_BUCKETS`, `worker.rs`) and per-request cap `KIMI_MAX_REQUEST_TOKENS = 8192` (DP prompt cap `PPLX_MAX_DISPATCH_TOKENS = 2048`). Changing the cap is not a one-const edit: it ties arena count, every `decode_batch_size`-shaped scratch/router/Marlin shape, and per-bucket CUDA-graph capture. -- **`kimi-k2-pplx-ep` cargo feature + `PEGAINFER_KIMI_PARALLEL` env — removed.** Parallel shape and EP backend are now CLI flags: `--tp-size/--dp-size/--ep-backend`. The feature is just `kimi-k2`. Active line: `--tp-size 1 --dp-size 8 --ep-backend pplx`. -- **Internal H20 smoke/candidate/debug test entries — removed.** Direct worker/scheduler no longer carries `forward_prompt_smoke`, `ForwardOneTokenSmoke`, full-decode smoke, row-diff D2H instrumentation, or candidate-dump tests; only CPU unit tests (placement, page metadata) remain. Progress is gated end-to-end through `pegainfer-server` / `bench_serving` / OpenAI `/v1/completions`. 
+- **Expert-major INT4 / CUTLASS example69 path — removed in #234.** Bring-up first built a CUTLASS example69 (Hopper INT4×BF16 grouped GEMM) probe as the routed-expert backend. A focused H20 probe proved it could not express Kimi's `group_size=32` per-K-group scale: example69 reloads scale on a 64-wide K tile (`TileShapeK=64`), so col `32/33` reused group0 scale and col `64` used group1 scale; `TileShapeK=32` hits CUTLASS static assert `K_BLOCK_MAX >= 4`. The path was demoted to a limitation probe and then deleted in #234 — the CUTLASS-era projection kernels/probe (`weight_packed_cutlass_example69`, `weight_shape` tensor loading, the example69 launcher and FFI) are gone. Marlin WNA16 is the only runtime INT4 path. `KimiExpertMajorProjectionPlan` (`openinfer-kimi-k2/src/weights/package.rs`) remains **live** — it describes the EP weight layout, not the dead CUTLASS kernel. `KimiExpertMajorRoute` outlived its callers (DeepEP routing replaced it) and was deleted in the post-#298 dead-code sweep. +- **`weight_shape` GPU load — removed in #234.** It was loaded for 60 MoE layers × 384 experts × 3 projections, validated to `[2]`, then never consumed by any kernel (dims come from manifest constants). Dropping it removes **8,640 tensors** from the load set (`openinfer-kimi-k2/src/weights/tests.rs` asserts the count). The checkpoint still carries `weight_shape` on disk; the runtime simply no longer reads it. +- **`KIMI_RUNNER_MAX_BATCH = 4` hard-cap — superseded.** Bring-up locked decode at a fixed bs4 wave. The const is now `64` (`openinfer-kimi-k2/src/runner/scheduler.rs`), with worker decode arenas bucketed `[1, 2, 4, 8, 16, 32, 64]` (`KIMI_DECODE_BATCH_BUCKETS`, `worker.rs`) and per-request cap `KIMI_MAX_REQUEST_TOKENS = 8192` (DP prompt cap `PPLX_MAX_DISPATCH_TOKENS = 2048`). Changing the cap is not a one-const edit: it ties arena count, every `decode_batch_size`-shaped scratch/router/Marlin shape, and per-bucket CUDA-graph capture. 
+- **`kimi-k2-pplx-ep` cargo feature + `OPENINFER_KIMI_PARALLEL` env — removed.** Parallel shape and EP backend are now CLI flags: `--tp-size/--dp-size/--ep-backend`. The feature is just `kimi-k2`. Active line: `--tp-size 1 --dp-size 8 --ep-backend pplx`. +- **Internal H20 smoke/candidate/debug test entries — removed.** Direct worker/scheduler no longer carries `forward_prompt_smoke`, `ForwardOneTokenSmoke`, full-decode smoke, row-diff D2H instrumentation, or candidate-dump tests; only CPU unit tests (placement, page metadata) remain. Progress is gated end-to-end through `openinfer-server` / `bench_serving` / OpenAI `/v1/completions`. ## Chronology (decision records) @@ -82,10 +82,10 @@ The bring-up ran ~2026-05-20 to 2026-05-22 on an 8×H200 node against a vLLM `0. ## Reference tooling (off-repo fixtures) -- `pegainfer-kernels/tools/kimi_k2/hf_logits_reference.py` — HF raw full-logits reference (trust_remote_code + vision-tower stub; INT4-only reference, slow run_compressed load). -- `pegainfer-kernels/tools/kimi_k2/vllm_logits_reference.py` — vLLM serving top-logprob fixture (vLLM 0.19.0 caps sample `logprobs` at 20, so the bring-up gate used top-20). Supports `--prompt-set-json` for batched multi-prompt cases. -- `pegainfer-kernels/tools/kimi_k2/vllm_marlin_wna16_reference.py` — vLLM Marlin W13 / W2 / final BF16 reference; `--model-path ... --layer-idx 1 --rank 0` reads the real checkpoint's rank-local experts. -- `pegainfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py` / `compare_logits_fixture.py` — candidate comparison (argmax / top-k overlap / logits diff). -- `pegainfer-kernels/tools/kimi_k2/torch_reference.py` — compressed-tensors official pack/dequant, bit-exact INT4 single-expert fixture (self-check `0-diff`). +- `openinfer-kernels/tools/kimi_k2/hf_logits_reference.py` — HF raw full-logits reference (trust_remote_code + vision-tower stub; INT4-only reference, slow run_compressed load). 
+- `openinfer-kernels/tools/kimi_k2/vllm_logits_reference.py` — vLLM serving top-logprob fixture (vLLM 0.19.0 caps sample `logprobs` at 20, so the bring-up gate used top-20). Supports `--prompt-set-json` for batched multi-prompt cases. +- `openinfer-kernels/tools/kimi_k2/vllm_marlin_wna16_reference.py` — vLLM Marlin W13 / W2 / final BF16 reference; `--model-path ... --layer-idx 1 --rank 0` reads the real checkpoint's rank-local experts. +- `openinfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py` / `compare_logits_fixture.py` — candidate comparison (argmax / top-k overlap / logits diff). +- `openinfer-kernels/tools/kimi_k2/torch_reference.py` — compressed-tensors official pack/dequant, bit-exact INT4 single-expert fixture (self-check `0-diff`). A strict bit-level `h20_kimi_marlin_wna16_single_layer_matches_vllm_reference` gate is kept `#[ignore]` (red by design): vLLM Marlin's W2 atomic split-K accumulation order gives `route_output max_diff=96 / mean_diff=1.86` at BF16 magnitude ~7000 (< 0.03% relative, ~1.5 ULP) — not an algorithm bug. Turning it green requires either a `use_fp32_reduce` fixture or a ULP-relative tolerance. diff --git a/docs/models/kimi-k2/deepep-migration.md b/docs/models/kimi-k2/deepep-migration.md index da4fe9e6..e75b23d3 100644 --- a/docs/models/kimi-k2/deepep-migration.md +++ b/docs/models/kimi-k2/deepep-migration.md @@ -1,13 +1,13 @@ # Kimi-K2 MoE EP: PPLX → DeepEP migration -> **TL;DR:** Implemented and 8×H200-verified — Kimi-K2's MoE EP backend is now DeepEP (elastic API, AOT-instantiated, no torch/NVRTC/NVSHMEM); PPLX is fully deleted from the kimi path (`moe_pplx.rs` gone, kimi crate no longer depends on `pegainfer-comm`). Decode = `do_expand=true` + `do_cpu_sync=false`: fixed worst-case buffers, zero host syncs/allocs per step → **CUDA graph capture enabled (#227): bs64 steady TPOT p50 26.03 ms vs 29.61 eager (−12%), replay only at full bucket occupancy**. Prefill = `do_cpu_sync=true` with host spin on pinned counters. 
Marlin consumes the DeepEP recv buffer **in place** (expert_alignment 8 == Marlin block size; identity routing + sentinels). Same-node A/B vs PPLX on hth200-29: **eager bs64 TPOT p50 29.61 vs 29.79 ms (parity), comm itself 7µs/layer faster**; golden gate equivalent to main (free-greedy near-tie reds on both backends, teacher-forced 0 violations both). The initial port was +14% TPOT slower until two capacity-proportional adapter kernels were fixed (see the lesson below). +> **TL;DR:** Implemented and 8×H200-verified — Kimi-K2's MoE EP backend is now DeepEP (elastic API, AOT-instantiated, no torch/NVRTC/NVSHMEM); PPLX is fully deleted from the kimi path (`moe_pplx.rs` gone, kimi crate no longer depends on `openinfer-comm`). Decode = `do_expand=true` + `do_cpu_sync=false`: fixed worst-case buffers, zero host syncs/allocs per step → **CUDA graph capture enabled (#227): bs64 steady TPOT p50 26.03 ms vs 29.61 eager (−12%), replay only at full bucket occupancy**. Prefill = `do_cpu_sync=true` with host spin on pinned counters. Marlin consumes the DeepEP recv buffer **in place** (expert_alignment 8 == Marlin block size; identity routing + sentinels). Same-node A/B vs PPLX on hth200-29: **eager bs64 TPOT p50 29.61 vs 29.79 ms (parity), comm itself 7µs/layer faster**; golden gate equivalent to main (free-greedy near-tie reds on both backends, teacher-forced 0 violations both). The initial port was +14% TPOT slower until two capacity-proportional adapter kernels were fixed (see the lesson below). 
> > **Last touched:** 2026-06 ## Architecture as built ``` -pegainfer-kernels/ +openinfer-kernels/ third_party/DeepEP # submodule d4f41e4 (2026-05-26) csrc/deepep/deepep_shim.cu # AOT template instantiation (Kimi config baked: # 384 experts / 48 local, topk 8, hidden 7168, 8 ranks) @@ -15,20 +15,20 @@ pegainfer-kernels/ src/ffi/deepep.rs # repr(C) DeepEpInfo + extern decls src/ops/deepep.rs # DeepEp wrapper: decode_dispatch/decode_combine (no sync), # prefill_dispatch_send/wait_counts/recv + prefill_combine -pegainfer-kimi-k2/ +openinfer-kimi-k2/ src/runner/moe_deepep.rs # the MoE layer: # forward_moe_layer_decode_deepep_normed (host-quiet) # forward_moe_layer_prefill_deepep (cpu-sync) ``` -Build needs `PEGAINFER_NCCL_ROOT` pointing at NCCL ≥ 2.30 (device API: `ncclDevComm`, +Build needs `OPENINFER_NCCL_ROOT` pointing at NCCL ≥ 2.30 (device API: `ncclDevComm`, windows, GIN). The binary links `libnccl.so.2` via `LD_LIBRARY_PATH` at runtime. -Local dev: `PEGAINFER_NCCL_ROOT=/data/opt/nccl-2.30.4`. +Local dev: `OPENINFER_NCCL_ROOT=/data/opt/nccl-2.30.4`. Backend selection: TP1/DP8 **requires** `--ep-backend=deepep` (default), TP8/DP1 requires `nccl` — both enforced with `ensure!` in `runner/bringup.rs`. There is no -PPLX fallback by design ("我们并不是很喜欢 pplx ep"). `pegainfer-comm`/PPLX survive -only for the deepseek crates, which use their own `pegainfer_comm::EpBackend` type. +PPLX fallback by design ("我们并不是很喜欢 pplx ep"). `openinfer-comm`/PPLX survive +only for the deepseek crates, which use their own `openinfer_comm::EpBackend` type. ## The contracts the integration stands on (verified in upstream source) @@ -161,7 +161,7 @@ Node env facts (also apply to other hth200 nodes until proven otherwise): plugin loads but deeper init fails without DOCA GPUNetIO; intranode traffic is NVLink windows, GIN is inter-node-only. 
- System NCCL is exactly 2.30.4 (`/usr/include` + `/usr/lib/x86_64-linux-gnu`); - `PEGAINFER_NCCL_ROOT` wants the `include/`+`lib/` layout — a symlink tree at + `OPENINFER_NCCL_ROOT` wants the `include/`+`lib/` layout — a symlink tree at `/data/opt/nccl-2.30.4` bridges it. - The bastion swallows ssh exit codes — poll remote jobs with output markers, never `$?`. `pkill -f <pattern>` self-matches the ssh wrapper command line — diff --git a/docs/models/kimi-k2/dp-design.md b/docs/models/kimi-k2/dp-design.md index 52e47d40..91331ec6 100644 --- a/docs/models/kimi-k2/dp-design.md +++ b/docs/models/kimi-k2/dp-design.md @@ -31,7 +31,7 @@ local_experts = 384 / ep_world (按 ep_world 切) ```rust /// 纯并行拓扑,跟模型无关。可复用于 DSV4、Qwen 等。 -/// 放 pegainfer-core。 +/// 放 openinfer-core。 pub struct ParallelConfig { pub tp_world: usize, pub dp_world: usize, @@ -39,7 +39,7 @@ pub struct ParallelConfig { } /// 一个 rank 在 TP×DP×EP 网格中的坐标。 -/// 放 pegainfer-core。 +/// 放 openinfer-core。 pub struct RankCoord { pub global_rank: usize, pub tp_rank: usize, // global_rank % tp_world @@ -48,7 +48,7 @@ pub struct RankCoord { } /// Kimi-K2 专属:从拓扑派生的模型维度。 -/// 现有 KimiK2ParallelShape 的延续,留在 pegainfer-kimi-k2。 +/// 现有 KimiK2ParallelShape 的延续,留在 openinfer-kimi-k2。 pub struct KimiK2ModelConfig { pub topo: ParallelConfig, pub heads_per_tp: usize, // = 64 / tp_world diff --git a/docs/models/kimi-k2/kv-cache-design.md b/docs/models/kimi-k2/kv-cache-design.md index cfdb4b0c..803e9e03 100644 --- a/docs/models/kimi-k2/kv-cache-design.md +++ b/docs/models/kimi-k2/kv-cache-design.md @@ -20,7 +20,7 @@ after burning up to 2047 tokens of compute. Nothing validated ### 2.
Kimi KV is already paged — the kernels need nothing -`KimiMlaPagedKvLayout` (`pegainfer-kernels/src/ops/kimi_k2/mla.rs`), page +`KimiMlaPagedKvLayout` (`openinfer-kernels/src/ops/kimi_k2/mla.rs`), page table buffers (`page_indices_d/page_indptr_d/last_page_len_d`), a paged append kernel (`kimi_mla_paged_kv_append`) and a paged MLA decode kernel (`kimi_flashinfer_batch_decode_mla`) were all in production. The "fixed arena" @@ -39,12 +39,12 @@ threads explicit `positions_d`, so the fix shape is cleaner than qwen3's was. ## Design (as landed) -### Logical/physical split: `BlockPool` in `pegainfer-kv-cache` +### Logical/physical split: `BlockPool` in `openinfer-kv-cache` The qwen3 `KvCacheManager` was split so MLA models reuse the logical layer without inheriting the full-attention physical layout: -- **`BlockPool`** (`pegainfer-kv-cache/src/pool.rs`): kvbm `BlockManager` + +- **`BlockPool`** (`openinfer-kv-cache/src/pool.rs`): kvbm `BlockManager` + the reserved padding block + `RequestKv` (the `SchedulableSequence` wrapper: `schedule_prefill/apply_prefill/schedule_decode/apply_decode`, RAII release). Owns **no GPU memory** — it hands out block IDs. @@ -84,7 +84,7 @@ it within 16 decode steps). Handing that raw list to the worker trips the exact-match check above. Every page row given to a forward pass must come from `RequestKv::step_page_indices(new_tokens)`, which trims to `ceil((kv_position + new_tokens)/16)`; a regression test in -`pegainfer-kv-cache/src/pool.rs` sweeps prompt lengths × decode steps and +`openinfer-kv-cache/src/pool.rs` sweeps prompt lengths × decode steps and self-retires if kvbm stops over-allocating. 
Why it surfaced as a *hang*, not an error: on DP the owning rank's diff --git a/docs/models/kimi-k2/optimization.md b/docs/models/kimi-k2/optimization.md index bdbe58ea..13b1d2c6 100644 --- a/docs/models/kimi-k2/optimization.md +++ b/docs/models/kimi-k2/optimization.md @@ -6,7 +6,7 @@ ## Goal -PegaInfer Kimi-K2 端到端延迟和吞吐在同 H20 ×8 配置上达到或超过 vLLM 0.19.0 baseline,并保留 greedy token-id parity 作为 keep/revert 硬 gate。**当前重点是 decode 性能**,prefill 与 decode 主线并行改,但不优先。 +OpenInfer Kimi-K2 端到端延迟和吞吐在同 H20 ×8 配置上达到或超过 vLLM 0.19.0 baseline,并保留 greedy token-id parity 作为 keep/revert 硬 gate。**当前重点是 decode 性能**,prefill 与 decode 主线并行改,但不优先。 阶段路线(前两步已落地,TP1+DP8+EP8 是当前 active line): @@ -29,13 +29,13 @@ PegaInfer Kimi-K2 端到端延迟和吞吐在同 H20 ×8 配置上达到或超 ## E2E Dashboard(TP8+EP8 历史 bring-up 口径) -> 这一节是 TP8+EP8 NCCL graph 路径的历史 dashboard,concurrency 锁在 bs4。它记录的是 bring-up 阶段的 keep/revert gate,不是当前 serving cap。**当前 active line(TP1+DP8+EP8)decode batch cap 是 64**,bucketed `[1,2,4,8,16,32,64]`(`KIMI_RUNNER_MAX_BATCH = 64`,`pegainfer-kimi-k2/src/runner/scheduler.rs`),bs64 service 数据见 [tp1-dp8-ep8-performance.md](tp1-dp8-ep8-performance.md) / [roadmap.md](roadmap.md)。 +> 这一节是 TP8+EP8 NCCL graph 路径的历史 dashboard,concurrency 锁在 bs4。它记录的是 bring-up 阶段的 keep/revert gate,不是当前 serving cap。**当前 active line(TP1+DP8+EP8)decode batch cap 是 64**,bucketed `[1,2,4,8,16,32,64]`(`KIMI_RUNNER_MAX_BATCH = 64`,`openinfer-kimi-k2/src/runner/scheduler.rs`),bs64 service 数据见 [tp1-dp8-ep8-performance.md](tp1-dp8-ep8-performance.md) / [roadmap.md](roadmap.md)。 -GPU: 8× NVIDIA H20。Model: Kimi-K2.5 (Kimi-K2.6 同架构权重,K2.5 是当时 H20 验证路径)。vLLM: 0.19.0。**vLLM 是 TP1+DP8+EP8 形态**,跟当时 pegainfer 的 TP8+EP8 形态不同——这不是 apples-to-apples,是两条不同 sharding 路线在同硬件下的 baseline 对照(参见 [vllm-h20-baseline.md](vllm-h20-baseline.md))。 +GPU: 8× NVIDIA H20。Model: Kimi-K2.5 (Kimi-K2.6 同架构权重,K2.5 是当时 H20 验证路径)。vLLM: 0.19.0。**vLLM 是 TP1+DP8+EP8 形态**,跟当时 openinfer 的 TP8+EP8 形态不同——这不是 apples-to-apples,是两条不同 sharding 路线在同硬件下的 baseline 对照(参见 
[vllm-h20-baseline.md](vllm-h20-baseline.md))。 -In-process bench(pegainfer 自带 `bench_serving request`): +In-process bench(openinfer 自带 `bench_serving request`): -| Profile | Metric | pegainfer | 备注 | +| Profile | Metric | openinfer | 备注 | | --- | --- | --- | --- | | short-prompt streaming (~30 tok in, free out) | TTFT | `1995.5ms` | HTTP `/v1/completions` 端到端 | | short-prompt streaming (~30 tok in, free out) | TPOT | `14.48ms` (≈30.8 tok/s) | HTTP | @@ -46,11 +46,11 @@ In-process bench(pegainfer 自带 `bench_serving request`): HTTP bench 同 client(`vllm bench serve`),decode-heavy profile(input=1, output=128, ignore-eos, bs=4): -| Metric | pegainfer TP8+EP8 | vLLM TP1+DP8+EP8 | delta | +| Metric | openinfer TP8+EP8 | vLLM TP1+DP8+EP8 | delta | | --- | ---: | ---: | --- | -| TPOT median | `19.13ms` | `24.97ms` | pegainfer −23% | -| TPOT p99 | `23.63ms` | `29.46ms` | pegainfer −20% | -| ITL median | `17.42ms` | `23.02ms` | pegainfer −24% | +| TPOT median | `19.13ms` | `24.97ms` | openinfer −23% | +| TPOT p99 | `23.63ms` | `29.46ms` | openinfer −20% | +| ITL median | `17.42ms` | `23.02ms` | openinfer −24% | | TTFT median | `313.10ms` | `69.60ms` | **vLLM 4.5× 更低** | | TTFT p99 | `4239.97ms` | `135.40ms` | **vLLM 31× 更低** | | Output tok/s | `159.99` | `157.94` | 同量级 | @@ -59,7 +59,7 @@ HTTP bench 同 client(`vllm bench serve`),decode-heavy profile(input=1, - in-process bench 来自 `target/release/bench_serving request --cuda-graph true ...`,已过四并发 vLLM fixture greedy gate,不会被 prompt prefill 吃掉。 - 短 prompt streaming TTFT 是 OpenAI-compatible `/v1/completions` 端到端窗口(含 first-collective stream drain、scheduler、frontend),不是纯 prefill GPU time;prefill 阶段拆分还没开始(见 Open 章节)。 -- HTTP bench 是用同一份 `vllm bench serve --backend openai --endpoint /v1/completions` 分别打 pegainfer 和 vLLM server,保证 client / metric 定义一致。vLLM TP1+DP8+EP8 完整 bs 1..256 扫描见 [vllm-h20-baseline.md](vllm-h20-baseline.md)。 +- HTTP bench 是用同一份 `vllm bench serve --backend openai --endpoint /v1/completions` 分别打 openinfer 和 vLLM 
server,保证 client / metric 定义一致。vLLM TP1+DP8+EP8 完整 bs 1..256 扫描见 [vllm-h20-baseline.md](vllm-h20-baseline.md)。 - **HTTP 19.13 vs in-process 14.39 差 4.74ms / token,~33% overhead** —— frontend / streaming 不该这么多,已记录到 Open 章节作为独立查询项。 ## Architecture @@ -273,7 +273,7 @@ Marlin 数字是 synthetic all-local route 假设,不是真实 EP8 全局 rout **Bottleneck:** H20 固定 4 并发 fixture `max_tokens=16` 时 row1 偶发输出 `[1008,2742,924,6454,...]`(应为 `[1008,2742,2531,414,...]`)。Per-phase row first-diff 把切点收缩到 layer1 routed expert path,最早是 `moe_w13_out`。 -**Root cause:** PegaInfer Marlin WNA16 wrapper 固定 `use_atomic_add=true` 且没传 `c_tmp`。当 split-K > 1 时,kernel 用 BF16 `atomicAdd` 直接累加进 output C;BF16 atomic 在 H20 上对累加顺序敏感,rank/token 之间的非确定性 ordering 把 row state 弄花。vLLM 自己的 `fused_marlin_moe.py` 对 W13 和 W2 都传 `use_atomic_add=False, use_fp32_reduce=True`,走 global F32 `c_tmp` 累加。 +**Root cause:** OpenInfer Marlin WNA16 wrapper 固定 `use_atomic_add=true` 且没传 `c_tmp`。当 split-K > 1 时,kernel 用 BF16 `atomicAdd` 直接累加进 output C;BF16 atomic 在 H20 上对累加顺序敏感,rank/token 之间的非确定性 ordering 把 row state 弄花。vLLM 自己的 `fused_marlin_moe.py` 对 W13 和 W2 都传 `use_atomic_add=False, use_fp32_reduce=True`,走 global F32 `c_tmp` 累加。 **Approach:** worker / decode arena 预分配 `c_tmp` F32 buffer,Marlin launch 切到 vLLM 的 global-reduce 路径(`use_atomic_add=false`),output / locks 在 step 边界 zero-fill。 diff --git a/docs/models/kimi-k2/pplx-ep-correctness.md b/docs/models/kimi-k2/pplx-ep-correctness.md index 97cba5f9..07c718c7 100644 --- a/docs/models/kimi-k2/pplx-ep-correctness.md +++ b/docs/models/kimi-k2/pplx-ep-correctness.md @@ -24,7 +24,7 @@ Target comparison: > CLI note: the parallel shape and EP backend are selected by the > `--tp-size/--dp-size/--ep-backend` flags. The old `kimi-k2-pplx-ep` cargo -> feature and `PEGAINFER_KIMI_PARALLEL` env (used in the original 2026-05-25 run) +> feature and `OPENINFER_KIMI_PARALLEL` env (used in the original 2026-05-25 run) > have been removed; the feature is now just `kimi-k2`. 
TP1/DP8 PPLX is intentionally not the baseline for this document. The current @@ -34,9 +34,9 @@ repair first makes TP8/DP1 PPLX match TP8/DP1 NCCL. | Date | Path | Output | Result | | --- | --- | --- | --- | -| 2026-05-25 | `cargo check --release -p pegainfer-server --features kimi-k2 --bin bench_serving` (PPLX selected via `--ep-backend pplx`) | clean build on 8×H200 node | Pass | -| 2026-05-25 | `cargo check --release -p pegainfer-server --features kimi-k2 --bin bench_serving` | clean build on 8×H200 node | Pass | -| 2026-05-25 | `cargo test --release -p pegainfer-comm --test pplx_roundtrip -- --nocapture` | 8 ranks dispatch+combine roundtrip, each rank received 512 tokens | Pass | +| 2026-05-25 | `cargo check --release -p openinfer-server --features kimi-k2 --bin bench_serving` (PPLX selected via `--ep-backend pplx`) | clean build on 8×H200 node | Pass | +| 2026-05-25 | `cargo check --release -p openinfer-server --features kimi-k2 --bin bench_serving` | clean build on 8×H200 node | Pass | +| 2026-05-25 | `cargo test --release -p openinfer-comm --test pplx_roundtrip -- --nocapture` | 8 ranks dispatch+combine roundtrip, each rank received 512 tokens | Pass | | 2026-05-25 | TP8 PPLX bs4, output 5, iters 3 | `$RESULT_ROOT/kimi_pplx_tp8_bs4_o5_final.json`: 12/12 traces hash `7c4c5d83355198fd` | Pass | | 2026-05-25 | TP8 NCCL bs64 active decode | `$RESULT_ROOT/kimi_nccl_tp8_active64_o5_final.json`: `Counter({'7c4c5d83355198fd': 32, '9eecc1ca6fb3409d': 32})`, steady TPOT p50 `97.53ms` | Reference | | 2026-05-25 | TP8 PPLX bs64 active decode | `$RESULT_ROOT/kimi_pplx_tp8_active64_o5_after_review.json`: `Counter({'7c4c5d83355198fd': 32, '9eecc1ca6fb3409d': 32})`, steady TPOT p50 `110.14ms` | Matches NCCL | @@ -51,18 +51,18 @@ per-index trace equality between PPLX and NCCL for the same active scheduling. 
Common environment: ```bash -cd $PEGAINFER_DIR +cd $OPENINFER_DIR export CUDA_HOME=/usr/local/cuda export NVCC=/usr/local/cuda/bin/nvcc -export LD_LIBRARY_PATH=$RESULT_ROOT/pegainfer-nccl-lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-} -export PEGAINFER_CUDA_SM=90a -export PEGAINFER_TRITON_PYTHON=$PEGAINFER_DIR/.triton-venv/bin/python +export LD_LIBRARY_PATH=$RESULT_ROOT/openinfer-nccl-lib:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-} +export OPENINFER_CUDA_SM=90a +export OPENINFER_TRITON_PYTHON=$OPENINFER_DIR/.triton-venv/bin/python ``` NCCL reference (TP8/DP1): ```bash -cargo run --quiet --release -p pegainfer-server --features kimi-k2 --bin bench_serving -- \ +cargo run --quiet --release -p openinfer-server --features kimi-k2 --bin bench_serving -- \ --model-path $MODEL_DIR \ --tp-size 8 --dp-size 1 --ep-backend nccl \ --cuda-graph false \ @@ -74,7 +74,7 @@ cargo run --quiet --release -p pegainfer-server --features kimi-k2 --bin bench_s PPLX path (TP8/DP1): ```bash -cargo run --quiet --release -p pegainfer-server --features kimi-k2 --bin bench_serving -- \ +cargo run --quiet --release -p openinfer-server --features kimi-k2 --bin bench_serving -- \ --model-path $MODEL_DIR \ --tp-size 8 --dp-size 1 --ep-backend pplx \ --cuda-graph false \ diff --git a/docs/models/kimi-k2/pplx-ep-decode.md b/docs/models/kimi-k2/pplx-ep-decode.md index 1ce70ac5..f28ef93c 100644 --- a/docs/models/kimi-k2/pplx-ep-decode.md +++ b/docs/models/kimi-k2/pplx-ep-decode.md @@ -122,7 +122,7 @@ block_size=64 意味着 `thread_m_blocks=4`(large-batch config),每个 m_b ### #0 PPLX EP baseline(2026-05-23) -从 TP8+EP8 NCCL 路径 fork,接入 `pegainfer-comm::EpBackend`。PPLX 4-step protocol 替换 NCCL RS bridge,router scale 在 combine_recv 后单独应用(`accumulate=false` + `kimi_scaled_add_f32_bf16_to_bf16`)。 +从 TP8+EP8 NCCL 路径 fork,接入 `openinfer-comm::EpBackend`。PPLX 4-step protocol 替换 NCCL RS bridge,router scale 在 combine_recv 后单独应用(`accumulate=false` + `kimi_scaled_add_f32_bf16_to_bf16`)。 初始 bench_serving 测得 PPLX TPOT ≈ 
37ms,NCCL no-graph ≈ 19ms。 @@ -183,11 +183,11 @@ prefix sum 是串行的,但只有 48 iterations in shared memory——不值 | File | 改动 | | --- | --- | -| `pegainfer-kimi-k2/src/runner/moe_pplx.rs` | PPLX_EXPERT_PADDING 64→8, block_size 硬编码 8, forward 逻辑 | -| `pegainfer-kernels/csrc/kimi_k2/kimi_experts.cu` | routing kernel 并行化 <<<1,64>>>, shared memory prefix sum | -| `pegainfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu` | `swiglu_w13_pplx_kernel` + C wrapper | -| `pegainfer-kernels/src/ops/kimi_k2/experts.rs` | `kimi_pplx_build_marlin_routing_on_stream`, tight_max 计算, PPLX GEMM wrappers | -| `pegainfer-kernels/src/ffi.rs` | FFI declarations | +| `openinfer-kimi-k2/src/runner/moe_pplx.rs` | PPLX_EXPERT_PADDING 64→8, block_size 硬编码 8, forward 逻辑 | +| `openinfer-kernels/csrc/kimi_k2/kimi_experts.cu` | routing kernel 并行化 <<<1,64>>>, shared memory prefix sum | +| `openinfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu` | `swiglu_w13_pplx_kernel` + C wrapper | +| `openinfer-kernels/src/ops/kimi_k2/experts.rs` | `kimi_pplx_build_marlin_routing_on_stream`, tight_max 计算, PPLX GEMM wrappers | +| `openinfer-kernels/src/ffi.rs` | FFI declarations | ## Open diff --git a/docs/models/kimi-k2/roadmap.md b/docs/models/kimi-k2/roadmap.md index 531dd91f..9bd88f94 100644 --- a/docs/models/kimi-k2/roadmap.md +++ b/docs/models/kimi-k2/roadmap.md @@ -52,7 +52,7 @@ So routing diversity is worth **~7–15%** of decode TPOT, non-linear in K (flat | CUDA graph decode | ✓ DeepEP full-bucket capture, −12% TPOT | #227/#298 | | Bench-regression snapshot | ✓ `bench_snapshots/h200/kimi-k2.6.json` | #232 | | Lint gate (kernels + comm) | ✓ scoped `-D warnings` hook | #233 | -| LoRA | N/A — server rejects cleanly | `pegainfer-server/src/main.rs` | +| LoRA | N/A — server rejects cleanly | `openinfer-server/src/main.rs` | ## Claim boundaries diff --git a/docs/models/kimi-k2/sampling.md b/docs/models/kimi-k2/sampling.md index cc958b32..14877353 100644 --- a/docs/models/kimi-k2/sampling.md +++ b/docs/models/kimi-k2/sampling.md 
@@ -7,8 +7,8 @@ Last touched: 2026-06 ## Param surface (`/v1/completions`) What a client can send vs. what actually happens. "Frontend" = the vllm-server -OpenAI layer + `pegainfer-vllm-frontend` conversion -(`pegainfer-vllm-frontend/src/lib.rs` `convert_sampling`); "engine" = the kimi +OpenAI layer + `openinfer-vllm-frontend` conversion +(`openinfer-vllm-frontend/src/lib.rs` `convert_sampling`); "engine" = the kimi scheduler/worker. | Param | TP1/DP8 | TP8 | Where decided | @@ -32,7 +32,7 @@ Rejection UX pitfall: an engine-side rejection surfaces as a generic HTTP 500 vllm-server OpenAI layer swallows the text. Check the server log (`vllm_engine_core_client::client::stream "request failed"`) when a client -reports a 500. Fixing the mapping is a vllm-rust-workspace change, not a -pegainfer one. +reports a 500. Fixing the mapping is a vllm-rust-workspace change, not an +openinfer one. ## Design (TP1/DP8) diff --git a/docs/models/kimi-k2/source-layout.md b/docs/models/kimi-k2/source-layout.md index 4d4d8b2c..c5966e7c 100644 --- a/docs/models/kimi-k2/source-layout.md +++ b/docs/models/kimi-k2/source-layout.md @@ -1,6 +1,6 @@ # Kimi-K2 Source Layout -> **TL;DR:** Kimi-K2 source files over 1k lines were split by responsibility; the largest Rust file under `pegainfer-kimi-k2/src` is now `layers/attention.rs` at 950 lines. +> **TL;DR:** Kimi-K2 source files over 1k lines were split by responsibility; the largest Rust file under `openinfer-kimi-k2/src` is now `layers/attention.rs` at 950 lines. > > **Last touched:** 2026-05 @@ -9,13 +9,13 @@ - **Read**: - `docs/index.md` - routed the cleanup to the Kimi-K2 model docs. - `docs/models/kimi-k2/bringup-history.md` - confirmed `worker.rs` owns decode arena, forward, routing, and sampling paths. - - `pegainfer-kimi-k2/src/layers/attention.rs` - found tensor-view wrappers and validation helpers mixed into the attention header API. - - `pegainfer-kimi-k2/src/layers/experts.rs` - found tests embedded at the end of the expert header API.
- - `pegainfer-kimi-k2/src/runner/worker.rs` - found rank worker ownership, state command handling, arena/cache logic, forward kernels, load helpers, runtime helpers, and tests in one file. + - `openinfer-kimi-k2/src/layers/attention.rs` - found tensor-view wrappers and validation helpers mixed into the attention header API. + - `openinfer-kimi-k2/src/layers/experts.rs` - found tests embedded at the end of the expert header API. + - `openinfer-kimi-k2/src/runner/worker.rs` - found rank worker ownership, state command handling, arena/cache logic, forward kernels, load helpers, runtime helpers, and tests in one file. - **Relevant history**: - `docs/models/kimi-k2/bringup-history.md` records CUDA Graph and decode arena constraints; splits must preserve pointer-stable decode behavior and not change allocation sites. - **Plan**: - 1. List Rust files under `pegainfer-kimi-k2/src` over 1k lines. + 1. List Rust files under `openinfer-kimi-k2/src` over 1k lines. 2. Split low-risk header/API files first: attention tensor wrappers/validation helpers and expert tests. 3. Split `runner/worker.rs` by runtime responsibility: state command handling, cache/arena ownership, forward kernels, load helpers, and runtime helpers. 4. Run formatting and Kimi feature compile checks. @@ -24,35 +24,35 @@ ### Step 1: List oversized files -- Ran `find pegainfer-kimi-k2/src -name '*.rs' -type f -print0 | xargs -0 wc -l`. +- Ran `find openinfer-kimi-k2/src -name '*.rs' -type f -print0 | xargs -0 wc -l`. - Files over 1k lines before splitting: - - `pegainfer-kimi-k2/src/runner/worker.rs` - 2799 lines. - - `pegainfer-kimi-k2/src/layers/attention.rs` - 1146 lines. - - `pegainfer-kimi-k2/src/layers/experts.rs` - 1008 lines. + - `openinfer-kimi-k2/src/runner/worker.rs` - 2799 lines. + - `openinfer-kimi-k2/src/layers/attention.rs` - 1146 lines. + - `openinfer-kimi-k2/src/layers/experts.rs` - 1008 lines. 
### Step 2: Split header/API modules -- Moved attention tensor view wrappers to `pegainfer-kimi-k2/src/layers/attention/tensors.rs`. -- Moved attention validation helpers to `pegainfer-kimi-k2/src/layers/attention/validation.rs`. -- Moved expert tests to `pegainfer-kimi-k2/src/layers/experts/tests.rs`. +- Moved attention tensor view wrappers to `openinfer-kimi-k2/src/layers/attention/tensors.rs`. +- Moved attention validation helpers to `openinfer-kimi-k2/src/layers/attention/validation.rs`. +- Moved expert tests to `openinfer-kimi-k2/src/layers/experts/tests.rs`. ### Step 3: Split rank worker -- Moved `KimiRankThreadState` command handling to `pegainfer-kimi-k2/src/runner/worker/state.rs`. -- Moved decode cache/arena/scratch impls to `pegainfer-kimi-k2/src/runner/worker/cache.rs`. -- Moved forward kernel paths to `pegainfer-kimi-k2/src/runner/worker/forward.rs`. -- Moved weight-cache loading and shape checks to `pegainfer-kimi-k2/src/runner/worker/load.rs`. -- Moved collectives, RoPE helpers, sampling helpers, and decode scalar helpers to `pegainfer-kimi-k2/src/runner/worker/runtime.rs`. +- Moved `KimiRankThreadState` command handling to `openinfer-kimi-k2/src/runner/worker/state.rs`. +- Moved decode cache/arena/scratch impls to `openinfer-kimi-k2/src/runner/worker/cache.rs`. +- Moved forward kernel paths to `openinfer-kimi-k2/src/runner/worker/forward.rs`. +- Moved weight-cache loading and shape checks to `openinfer-kimi-k2/src/runner/worker/load.rs`. +- Moved collectives, RoPE helpers, sampling helpers, and decode scalar helpers to `openinfer-kimi-k2/src/runner/worker/runtime.rs`. ### Step 4: Verify - `cargo fmt --all --check` passed. -- `PEGAINFER_CUDA_SM=90a PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python3 cargo check --release -p pegainfer-kimi-k2 --features kimi-k2 --tests` passed. 
-- `PEGAINFER_CUDA_SM=90a PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python3 cargo check --release -p pegainfer-kimi-k2 --lib` passed after gating Kimi runtime/weights exports behind the crate `kimi-k2` feature. +- `OPENINFER_CUDA_SM=90a OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python3 cargo check --release -p openinfer-kimi-k2 --features kimi-k2 --tests` passed. +- `OPENINFER_CUDA_SM=90a OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python3 cargo check --release -p openinfer-kimi-k2 --lib` passed after gating Kimi runtime/weights exports behind the crate `kimi-k2` feature. ## Debrief -- **Outcome**: All Rust files under `pegainfer-kimi-k2/src` are now below 1k lines; the worker split preserved the Kimi feature compile gate and the default config/tokenizer build. +- **Outcome**: All Rust files under `openinfer-kimi-k2/src` are now below 1k lines; the worker split preserved the Kimi feature compile gate and the default config/tokenizer build. - **Pitfalls encountered**: - Rust module visibility needed explicit promotion for methods moved under `runner/worker/*`. - The default feature check exposed that Kimi runtime/weights exports were visible without the `kimi-k2` kernel feature. diff --git a/docs/models/kimi-k2/tp1-dp8-ep8-performance.md b/docs/models/kimi-k2/tp1-dp8-ep8-performance.md index 77f9b083..42803341 100644 --- a/docs/models/kimi-k2/tp1-dp8-ep8-performance.md +++ b/docs/models/kimi-k2/tp1-dp8-ep8-performance.md @@ -1,6 +1,6 @@ # Kimi-K2 TP1 DP8 EP8 performance -> TL;DR: This ledger tracks pegainfer TP1+DP8+EP8 on 8x H20 against the vLLM TP1+DP8+EP8 bs64 target. The vLLM sustained bs64 `~106ms` TPOT is now explained by a DPLB/CUDA-graph bucket cliff: an uneven DP distribution such as `9,8,8,8,8,8,8,7` pads every rank from graph bucket 8 to 16 and doubles TPOT. 
O2 landed five production decode-kernel picks (cuBLASLt fixed-shape shared_gate_up / o_proj / MLA strided-batch, split-vocab argmax, fused router selector); accuracy held at the bf16 ULP floor by a base-vs-opt prefill logits A/B, and the PPLX Marlin small-N tile was identified as the messy branch's real accuracy break (`-inf` logits + SIGSEGV at small per-rank N) and rejected. bs64 TPOT is unchanged within noise (p50 `40.58 -> 40.09ms`): the per-kernel wins do not resolve above the ±1ms band at this shape. Every pegainfer optimization must start from a profile, state the expected gain, show a microbench or isolated measurement, then pass correctness and service-level gates before commit. +> TL;DR: This ledger tracks openinfer TP1+DP8+EP8 on 8x H20 against the vLLM TP1+DP8+EP8 bs64 target. The vLLM sustained bs64 `~106ms` TPOT is now explained by a DPLB/CUDA-graph bucket cliff: an uneven DP distribution such as `9,8,8,8,8,8,8,7` pads every rank from graph bucket 8 to 16 and doubles TPOT. O2 landed five production decode-kernel picks (cuBLASLt fixed-shape shared_gate_up / o_proj / MLA strided-batch, split-vocab argmax, fused router selector); accuracy held at the bf16 ULP floor by a base-vs-opt prefill logits A/B, and the PPLX Marlin small-N tile was identified as the messy branch's real accuracy break (`-inf` logits + SIGSEGV at small per-rank N) and rejected. bs64 TPOT is unchanged within noise (p50 `40.58 -> 40.09ms`): the per-kernel wins do not resolve above the ±1ms band at this shape. Every openinfer optimization must start from a profile, state the expected gain, show a microbench or isolated measurement, then pass correctness and service-level gates before commit. 
> > Last touched: 2026-06-07 @@ -44,42 +44,42 @@ For TP1 DP8, correctness checks must include uneven per-rank active rows and emp Path placeholders: ```bash -export PEGAINFER_DIR=/path/to/pegainfer +export OPENINFER_DIR=/path/to/openinfer export VLLM_DIR=/path/to/vllm_test export MODEL_DIR=/path/to/Kimi-K2.5 export NCCL_LIB_DIR=/path/to/nccl-lib export EVAL_VENV=/path/to/eval-venv export RESULT_ROOT=/path/to/result-root -export TRITON_PYTHON=$PEGAINFER_DIR/.triton-venv/bin/python +export TRITON_PYTHON=$OPENINFER_DIR/.triton-venv/bin/python ``` -Build on an : +Build on an H20 node: ```bash -cd "$PEGAINFER_DIR" +cd "$OPENINFER_DIR" CUDA_HOME=/usr/local/cuda \ NVCC=/usr/local/cuda/bin/nvcc \ LD_LIBRARY_PATH="$NCCL_LIB_DIR:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" \ -PEGAINFER_CUDA_SM=90a \ -PEGAINFER_TRITON_PYTHON="$TRITON_PYTHON" \ -cargo build --release -p pegainfer-server \ - --features kimi-k2 --bin pegainfer --bin bench_serving +OPENINFER_CUDA_SM=90a \ +OPENINFER_TRITON_PYTHON="$TRITON_PYTHON" \ +cargo build --release -p openinfer-server \ + --features kimi-k2 --bin openinfer --bin bench_serving ``` -(The old `kimi-k2-pplx-ep` feature and `PEGAINFER_KIMI_PARALLEL` env existed only on the +(The old `kimi-k2-pplx-ep` feature and `OPENINFER_KIMI_PARALLEL` env existed only on the pre-merge branch; on main the feature is `kimi-k2` and parallel shape is selected by the `--tp-size/--dp-size/--ep-backend` CLI flags below. nvcc must also be on `PATH` — the -`pegainfer-comm` cc-rs build looks it up there, not via `$NVCC`.) +`openinfer-comm` cc-rs build looks it up there, not via `$NVCC`.)
In-process bs64: ```bash -cd "$PEGAINFER_DIR" +cd "$OPENINFER_DIR" CUDA_HOME=/usr/local/cuda \ NVCC=/usr/local/cuda/bin/nvcc \ LD_LIBRARY_PATH="$NCCL_LIB_DIR:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" \ -PEGAINFER_CUDA_SM=90a \ -PEGAINFER_TRITON_PYTHON="$TRITON_PYTHON" \ +OPENINFER_CUDA_SM=90a \ +OPENINFER_TRITON_PYTHON="$TRITON_PYTHON" \ target/release/bench_serving \ --model-path "$MODEL_DIR" \ --cuda-graph false \ @@ -92,13 +92,13 @@ target/release/bench_serving \ Service bs64, same client shape as vLLM: ```bash -cd "$PEGAINFER_DIR" +cd "$OPENINFER_DIR" CUDA_HOME=/usr/local/cuda \ NVCC=/usr/local/cuda/bin/nvcc \ LD_LIBRARY_PATH="$NCCL_LIB_DIR:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" \ -PEGAINFER_CUDA_SM=90a \ -PEGAINFER_TRITON_PYTHON="$TRITON_PYTHON" \ -target/release/pegainfer --model-path "$MODEL_DIR" --served-model-name kimi-k2.5 \ +OPENINFER_CUDA_SM=90a \ +OPENINFER_TRITON_PYTHON="$TRITON_PYTHON" \ +target/release/openinfer --model-path "$MODEL_DIR" --served-model-name kimi-k2.5 \ --port 8124 --cuda-graph false --tp-size 1 --dp-size 8 --ep-backend pplx ``` @@ -125,13 +125,13 @@ vllm bench serve \ --save-result \ --save-detailed \ --result-dir "$RESULT_ROOT/kimi-tp1dp8-service" \ - --result-filename pegainfer_tp1dp8_bs64_${COMMIT}.json + --result-filename openinfer_tp1dp8_bs64_${COMMIT}.json ``` GSM8K accuracy smoke, concurrent OpenAI `/v1/completions` path: ```bash -cd "$PEGAINFER_DIR" +cd "$OPENINFER_DIR" source "$EVAL_VENV/bin/activate" lm_eval run --model local-completions \ --model_args "model=kimi-k2.5,base_url=http://127.0.0.1:8125/v1/completions,tokenizer_backend=huggingface,tokenizer=$MODEL_DIR,tokenized_requests=False,trust_remote_code=True,max_length=4096,max_gen_toks=256,num_concurrent=16,timeout=300" \ @@ -190,13 +190,13 @@ vllm bench serve \ nsys profile: ```bash -cd "$PEGAINFER_DIR" +cd "$OPENINFER_DIR" mkdir -p "$RESULT_ROOT/kimi-profile" CUDA_HOME=/usr/local/cuda \ NVCC=/usr/local/cuda/bin/nvcc \ 
LD_LIBRARY_PATH="$NCCL_LIB_DIR:/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-}" \ -PEGAINFER_CUDA_SM=90a \ -PEGAINFER_TRITON_PYTHON="$TRITON_PYTHON" \ +OPENINFER_CUDA_SM=90a \ +OPENINFER_TRITON_PYTHON="$TRITON_PYTHON" \ nsys profile --force-overwrite=true --trace=cuda,nvtx \ --cuda-graph-trace=node --export=sqlite \ -o "$RESULT_ROOT/kimi-profile/tp1dp8_bs64_o128_${COMMIT}" \ @@ -235,7 +235,7 @@ Motivation and expected gain: Change: -- `pegainfer-kimi-k2/src/runner/engine.rs` +- `openinfer-kimi-k2/src/runner/engine.rs` - `MAX_BATCH_PER_DP: 4 -> 8`. - Added prompt_len1 admission batching in `DpCoordinator`. - For prompt_len1 requests, send `StepCommand::Decode { positions: vec![0], slots, decode_batch_size: MAX_BATCH_PER_DP }` instead of `Prefill`. @@ -293,12 +293,12 @@ Correctness: CUDA_HOME=/usr/local/cuda \ NVCC=/usr/local/cuda/bin/nvcc \ LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH:-} \ -cargo test -r -p pegainfer-kimi-k2 --features pplx-ep runner::engine::tests --no-fail-fast +cargo test -r -p openinfer-kimi-k2 --features pplx-ep runner::engine::tests --no-fail-fast ``` - Local result: `5 passed`. - H20 result at `0c23389`: `5 passed`. -- Mixed-arrival service test, `$RESULT_ROOT/kimi-tp1dp8-service/pegainfer_tp1dp8_mixed_arrival_prompt1_o64_0c23389.json`: +- Mixed-arrival service test, `$RESULT_ROOT/kimi-tp1dp8-service/openinfer_tp1dp8_mixed_arrival_prompt1_o64_0c23389.json`: `64/64` success with `--request-rate 16`, peak concurrent requests `54`, TTFT p50/p99 `58.10/110.88ms`, TPOT p50/p99 `35.91/37.63ms`. This covers prompt_len1 admissions landing while existing decode slots are active. @@ -309,7 +309,7 @@ Performance: `64/64` success, TTFT p50/p99 `74.62/77.19ms`, first decode p50/p99 `38.23/38.24ms`, steady TPOT p50/p95/p99 `40.10/43.32/43.72ms`. 
- Service, same `vllm bench serve` client as vLLM, - `$RESULT_ROOT/kimi-tp1dp8-service/pegainfer_tp1dp8_bs64_o128_0c23389_after_warmup.json`: + `$RESULT_ROOT/kimi-tp1dp8-service/openinfer_tp1dp8_bs64_o128_0c23389_after_warmup.json`: `256/256` success, output `1336.35 tok/s`, TTFT p50/p99 `105.31/127.81ms`, TPOT p50/p95/p99 `47.34/47.70/47.71ms`, ITL p50/p99 `47.84/50.69ms`. - vLLM warmup-after baseline, @@ -365,7 +365,7 @@ Decision: - Keep as the current H20 bs64 performance baseline. O1 moves prompt_len=1 onto the decode shape and clears the vLLM bs64 TPOT/output gate; full token-parity correctness remains a separate reference gate before using TP1 DP8 as an accuracy baseline. Follow-up profiles should - focus on lowering pegainfer service TPOT from `47ms` toward the H200-reported 30ms-class + focus on lowering openinfer service TPOT from `47ms` toward the H200-reported 30ms-class expectation if that target is confirmed on comparable hardware. ### O2 - decode kernel cherry-pick: cuBLASLt fixed-shape GEMMs, argmax split, router fusion @@ -410,7 +410,7 @@ Rejected: PPLX Marlin small-N tile (messy-branch `dd69876`) — the accuracy bre Accuracy gate: base-vs-opt prefill logits A/B. 
GSM8K-class evals are too coarse for ULP-level kernel drift, so the gate follows `subsystems/correctness/logits-golden-gate.md` -with base-pegainfer itself as the reference at the same TP1 DP8 PPLX config: a throwaway +with base-openinfer itself as the reference at the same TP1 DP8 PPLX config: a throwaway (uncommitted) hook after the prefill lm_head GEMM in `runner/worker/state.rs` dumps full-vocab bf16 logits at every prompt position for 12 fixed raw prompts (en/zh/code/math, 1..90 tokens) sent through `/v1/completions` at `max_tokens=1`, identical patch on base diff --git a/docs/models/kimi-k2/vllm-h20-baseline.md b/docs/models/kimi-k2/vllm-h20-baseline.md index 5890b772..0afc54b8 100644 --- a/docs/models/kimi-k2/vllm-h20-baseline.md +++ b/docs/models/kimi-k2/vllm-h20-baseline.md @@ -1,6 +1,6 @@ # Kimi-K2 vLLM H20 Baseline (decode-heavy) -> **TL;DR:** vLLM `0.19.0` + Kimi-K2.5 + 8× H20,TP1+DP8+EP8(NCCL allgather/reducescatter all2all)跑 `bench serve` decode-heavy profile(input=1, output=128, ignore-eos)。bs=1..256 扫描。这是 vLLM 侧的 baseline 数据快照,作为 pegainfer TP1+DP8+EP8 active line 的硬上限(pegainfer 当前数据见 [tp1-dp8-ep8-performance.md](tp1-dp8-ep8-performance.md))。下面的 pegainfer 列是 **历史 TP8+EP8 bs=4 bring-up 对照**(TPOT med `19.13ms` vs vLLM `24.97ms`,HTTP 比 in-process `14.39ms` 高 33%),保留作为 frontend/streaming overhead 的早期记录。 +> **TL;DR:** vLLM `0.19.0` + Kimi-K2.5 + 8× H20,TP1+DP8+EP8(NCCL allgather/reducescatter all2all)跑 `bench serve` decode-heavy profile(input=1, output=128, ignore-eos)。bs=1..256 扫描。这是 vLLM 侧的 baseline 数据快照,作为 openinfer TP1+DP8+EP8 active line 的硬上限(openinfer 当前数据见 [tp1-dp8-ep8-performance.md](tp1-dp8-ep8-performance.md))。下面的 openinfer 列是 **历史 TP8+EP8 bs=4 bring-up 对照**(TPOT med `19.13ms` vs vLLM `24.97ms`,HTTP 比 in-process `14.39ms` 高 33%),保留作为 frontend/streaming overhead 的早期记录。 > > **Last touched:** 2026-06 @@ -42,7 +42,7 @@ cross-hardware/backend check. 
| Model | Kimi-K2.5(local `$MODEL_DIR`,INT4 + BF16 scale Marlin WNA16) | | vLLM | `0.19.0`(venv `$VLLM_DIR/.venv`) | | Sharding | **vLLM**: TP=1, DP=8, EP=8,all2all backend `allgather_reducescatter`(NCCL,默认) | -| Sharding | **pegainfer**: TP=8, EP=8,NCCL F32 hidden bridge + RS routed bridge | +| Sharding | **openinfer**: TP=8, EP=8,NCCL F32 hidden bridge + RS routed bridge | | Profile | input_len=1, output_len=128, `--ignore-eos`, `--random-range-ratio 0` | | Bench | `vllm bench serve --backend openai --endpoint /v1/completions`(同一 client,两边对齐) | | 数据 | `$VLLM_DIR/kimi_dp8_baseline/result_*.json` | @@ -80,11 +80,11 @@ cross-hardware/backend check. **bs=8 ≈ 拐点**:从这一点开始多塞 batch 单请求体验快速恶化,aggregate throughput 增益逐渐被 8 倍 batch / 8 倍 latency 抵消。Decode 性能口径下 vLLM TP1+DP8+EP8 的 "sweet spot" 是 bs=8(8 路 DP 各自 bs=1)。 -## pegainfer bs=4 对照点 +## openinfer bs=4 对照点 -下表是历史 TP8+EP8 bring-up 对照(当时 `KIMI_RUNNER_MAX_BATCH=4`,没扫 bs;该 const 现在是 `64`,bucketed)。同 client / 同 profile(`vllm bench serve`,input=1, output=128, ignore-eos, max_concurrency=4)打 pegainfer OpenAI-compatible server: +下表是历史 TP8+EP8 bring-up 对照(当时 `KIMI_RUNNER_MAX_BATCH=4`,没扫 bs;该 const 现在是 `64`,bucketed)。同 client / 同 profile(`vllm bench serve`,input=1, output=128, ignore-eos, max_concurrency=4)打 openinfer OpenAI-compatible server: -| 指标 | pegainfer TP8+EP8(HTTP, vllm bench) | vLLM TP1+DP8+EP8 bs=4 | pegainfer in-process bench, bs4 | +| 指标 | openinfer TP8+EP8(HTTP, vllm bench) | vLLM TP1+DP8+EP8 bs=4 | openinfer in-process bench, bs4 | | --- | ---: | ---: | ---: | | TPOT median | `19.13ms` | `24.97ms` | `14.39ms` | | TPOT p99 | `23.63ms` | `29.46ms` | `14.83ms` | @@ -94,19 +94,19 @@ cross-hardware/backend check. 
| Output tok/s | `159.99` | `157.94` | `≈278` | 数据来源: -- pegainfer HTTP:`result_pegainfer_bs4.json`,server 是 `target/release/pegainfer --model-path $MODEL_DIR --port 8124 --cuda-graph true`,client 同 vLLM 那条 bench。 +- openinfer HTTP:`result_openinfer_bs4.json`,server 是 `target/release/openinfer --model-path $MODEL_DIR --port 8124 --cuda-graph true`,client 同 vLLM 那条 bench。 - vLLM bs=4:上面 sweep 表的 bs=4 行。 -- pegainfer in-process:`bench_serving request --cuda-graph true --concurrency 4`,见 optimization.md。 +- openinfer in-process:`bench_serving request --cuda-graph true --concurrency 4`,见 optimization.md。 ### 结论 -1. **同硬件、同 client、同 profile,pegainfer TPOT 比 vLLM 低 23%**(`19.13 vs 24.97`)。预期内:pegainfer 走 TP=8 把单 token MLA / dense / shared expert 的 GEMM 切到 8 rank,每发 token 跨 rank reduce 一次;vLLM TP=1 时单 rank 自己跑完整 GEMM,靠 DP=8 拿 throughput 但单请求慢。Decode latency 主线上 TP8 仍然赢。 +1. **同硬件、同 client、同 profile,openinfer TPOT 比 vLLM 低 23%**(`19.13 vs 24.97`)。预期内:openinfer 走 TP=8 把单 token MLA / dense / shared expert 的 GEMM 切到 8 rank,每发 token 跨 rank reduce 一次;vLLM TP=1 时单 rank 自己跑完整 GEMM,靠 DP=8 拿 throughput 但单请求慢。Decode latency 主线上 TP8 仍然赢。 -2. **TTFT 这边 vLLM 完胜**:median `69.60ms` vs pegainfer `313.10ms`。pegainfer p99 飙到 `4239.97ms`——基本是 first-request 冷启动(first NCCL collective stream drain + scheduler warmup)。decode 优先的方案在 prefill 路径上欠的债集中爆在 p99。 +2. **TTFT 这边 vLLM 完胜**:median `69.60ms` vs openinfer `313.10ms`。openinfer p99 飙到 `4239.97ms`——基本是 first-request 冷启动(first NCCL collective stream drain + scheduler warmup)。decode 优先的方案在 prefill 路径上欠的债集中爆在 p99。 -3. **HTTP overhead 异常高**:pegainfer 同 bs=4,HTTP 口径 TPOT med `19.13ms`,in-process bench 是 `14.39ms`——4.74ms / token,~33% overhead。streaming JSON + frontend bridge 不该这么多。**这条单独提出来作为后续要查的 finding**,优先级介于 decode kernel 和 prefill 之间。 +3. 
**HTTP overhead 异常高**:openinfer 同 bs=4,HTTP 口径 TPOT med `19.13ms`,in-process bench 是 `14.39ms`——4.74ms / token,~33% overhead。streaming JSON + frontend bridge 不该这么多。**这条单独提出来作为后续要查的 finding**,优先级介于 decode kernel 和 prefill 之间。 -4. **Aggregate throughput 不公平比较(历史)**:当时 pegainfer 卡在 `KIMI_RUNNER_MAX_BATCH=4`(现已是 `64`,bucketed)不能扫 bs,vLLM TP1+DP8 在 bs=256 拉到 `1131 tok/s`。这条数据当时给 TP1+DP8+EP8 milestone 提供了上限:H20 ×8、相同 client 口径,**vLLM TP1+DP8+EP8 baseline 单请求 TPOT `17.94ms`(bs=1)/ aggregate `1131 tok/s`(bs=256)**。pegainfer 的 TP1+DP8+EP8 已落地,bs64 service output `1336 tok/s` / TPOT p50 `47.3ms`(见 [tp1-dp8-ep8-performance.md](tp1-dp8-ep8-performance.md))。 +4. **Aggregate throughput 不公平比较(历史)**:当时 openinfer 卡在 `KIMI_RUNNER_MAX_BATCH=4`(现已是 `64`,bucketed)不能扫 bs,vLLM TP1+DP8 在 bs=256 拉到 `1131 tok/s`。这条数据当时给 TP1+DP8+EP8 milestone 提供了上限:H20 ×8、相同 client 口径,**vLLM TP1+DP8+EP8 baseline 单请求 TPOT `17.94ms`(bs=1)/ aggregate `1131 tok/s`(bs=256)**。openinfer 的 TP1+DP8+EP8 已落地,bs64 service output `1336 tok/s` / TPOT p50 `47.3ms`(见 [tp1-dp8-ep8-performance.md](tp1-dp8-ep8-performance.md))。 ## 复现命令 @@ -121,16 +121,16 @@ vllm serve $MODEL_DIR \ --port 8123 --max-num-seqs 256 --max-model-len 4096 ``` -pegainfer server(8×H20 node)。Build 用 `cargo build --release -p pegainfer-server --features kimi-k2 --bin pegainfer`,parallel shape 由 CLI flag 选(当前 active line 是 TP1+DP8+EP8 PPLX,下面的 flag 即对齐 vLLM 形态做 apples 对照): +openinfer server(8×H20 node)。Build 用 `cargo build --release -p openinfer-server --features kimi-k2 --bin openinfer`,parallel shape 由 CLI flag 选(当前 active line 是 TP1+DP8+EP8 PPLX,下面的 flag 即对齐 vLLM 形态做 apples 对照): ```bash -LD_LIBRARY_PATH=$RESULT_ROOT/pegainfer-nccl-lib:$LD_LIBRARY_PATH \ - $PEGAINFER_DIR/target/release/pegainfer \ +LD_LIBRARY_PATH=$RESULT_ROOT/openinfer-nccl-lib:$LD_LIBRARY_PATH \ + $OPENINFER_DIR/target/release/openinfer \ --model-path $MODEL_DIR --port 8124 --cuda-graph true \ --tp-size 1 --dp-size 8 --ep-backend pplx ``` -> 注:表里的 pegainfer 数据是历史 TP8+EP8 bs=4 口径(当时用旧的 
`kimi-k2-pplx-ep` feature / `PEGAINFER_KIMI_PARALLEL` env,二者均已移除)。上面是当前 CLI 复现 active TP1+DP8+EP8 的命令,不是产生表里 TP8 数据的命令;8×H20 才能跑。 +> 注:表里的 openinfer 数据是历史 TP8+EP8 bs=4 口径(当时用旧的 `kimi-k2-pplx-ep` feature / `OPENINFER_KIMI_PARALLEL` env,二者均已移除)。上面是当前 CLI 复现 active TP1+DP8+EP8 的命令,不是产生表里 TP8 数据的命令;8×H20 才能跑。 bench(client 端,对哪个 server 改 `--base-url` 即可): diff --git a/docs/models/kimi-k2/vllm-path-comparison.md b/docs/models/kimi-k2/vllm-path-comparison.md index 4f5695bb..8b067313 100644 --- a/docs/models/kimi-k2/vllm-path-comparison.md +++ b/docs/models/kimi-k2/vllm-path-comparison.md @@ -1,6 +1,6 @@ # Kimi-K2 vLLM Path Comparison -> **TL;DR:** vLLM Kimi/DeepSeekV3 decode 和 PegaInfer decode 的最大结构差异已缩小到 MLA cache/metadata 与 collective bridge:PegaInfer 现在同样用 load-time `fused_qkv_a_proj` 合并 `q_a + kv_a`,decode 执行 `gemm_graphsafe(fused_qkv_a)` 后用 `kimi_mla_split_qkv_a` 一次拆出 `q_a/compressed_kv/k_rope`。MoE shared/main 与 routed compute/aux stream overlap、shared gate/up fused GEMM、dense layer0 gate/up fused GEMM、routed scale+residual add fused kernel、routed sum clear 与 Marlin locks clear 清理已通过 H20 correctness/perf gate;真实 fixture output16 steady TPOT p99 `14.26ms`,synthetic output64 steady TPOT avg `14.39ms` / p99 `14.83ms`。vLLM TP-only MoE final all-reduce cadence 已实测 BF16/F32 两版均慢于当前 RS bridge,因此保留 RS bridge。 +> **TL;DR:** vLLM Kimi/DeepSeekV3 decode 和 OpenInfer decode 的最大结构差异已缩小到 MLA cache/metadata 与 collective bridge:OpenInfer 现在同样用 load-time `fused_qkv_a_proj` 合并 `q_a + kv_a`,decode 执行 `gemm_graphsafe(fused_qkv_a)` 后用 `kimi_mla_split_qkv_a` 一次拆出 `q_a/compressed_kv/k_rope`。MoE shared/main 与 routed compute/aux stream overlap、shared gate/up fused GEMM、dense layer0 gate/up fused GEMM、routed scale+residual add fused kernel、routed sum clear 与 Marlin locks clear 清理已通过 H20 correctness/perf gate;真实 fixture output16 steady TPOT p99 `14.26ms`,synthetic output64 steady TPOT avg `14.39ms` / p99 `14.83ms`。vLLM TP-only MoE final all-reduce cadence 已实测 BF16/F32 两版均慢于当前 RS bridge,因此保留 RS 
bridge。 > > **Last touched:** 2026-05 @@ -18,12 +18,12 @@ - `$VLLM_DIR_ALT/vllm/model_executor/layers/fused_moe/layer.py` - `$VLLM_DIR_ALT/csrc/cache_kernels.cu` - `$VLLM_DIR_ALT/csrc/moe/*` -- PegaInfer files: - - `pegainfer-kimi-k2/src/direct/worker.rs` - - `pegainfer-kimi-k2/src/batch_decode_trace.rs` - - `pegainfer-kernels/src/ops/kimi_mla.rs` - - `pegainfer-kernels/src/ops/kimi_router.rs` - - `pegainfer-kernels/src/ops/kimi_experts.rs` +- OpenInfer files: + - `openinfer-kimi-k2/src/direct/worker.rs` + - `openinfer-kimi-k2/src/batch_decode_trace.rs` + - `openinfer-kernels/src/ops/kimi_mla.rs` + - `openinfer-kernels/src/ops/kimi_router.rs` + - `openinfer-kernels/src/ops/kimi_experts.rs` ## vLLM Decode Operator List @@ -52,20 +52,20 @@ This is the source-level list for Kimi/DeepSeekV3 decode, not an nsys trace. PyT | MoE scale and TP reduce | For BF16, routed output is multiplied by `routed_scaling_factor`, added with shared output, then `maybe_all_reduce_tensor_model_parallel`. | `deepseek_v2.py:187-208` | | Final logits | Final RMSNorm then LM head; sampling/logprobs live in vLLM sampling path rather than model file. | `deepseek_v2.py:724-725` | -## PegaInfer Current Decode Operator List +## OpenInfer Current Decode Operator List This list follows the current worker implementation. The static trace is now source-aligned for these high-level operators after the MLA trace fix below. -| Section | PegaInfer actual operator path | Source evidence | +| Section | OpenInfer actual operator path | Source evidence | | --- | --- | --- | | Embedding | `embedding_batch_vocab_shard` then TP all-reduce through BF16-via-F32 bridge. | `batch_decode_trace.rs:49-63` | | Attention input | `rms_norm_batch_into(hidden, input_norm)`. 
| `worker.rs:1777-1783` | | MLA q/kv down projection | `gemm_graphsafe(fused_qkv_a_proj)` then `kimi_mla_split_qkv_a` produces `q_a`, `compressed_kv`, and `k_rope`; q branch then runs `rms_norm_batch(q_a_norm)` and `gemm_graphsafe(q_b_proj)`, kv branch runs `rms_norm_batch(kv_a_norm)`. | `worker.rs:1784-1827` | | MLA RoPE split | `kimi_mla_rope_split_decode(q_proj, k_rope, cos, sin, positions)` produces `q_nope`, `q_pe`, and `append_kpe`. | `worker.rs:1839-1849` | -| MLA q absorb | `kimi_mla_absorb_q_nope(kv_b_proj, q_nope)` uses preloaded `kv_b_proj` weight; this is the PegaInfer equivalent of vLLM `q_nope @ W_UK_T`. | `worker.rs:1850-1855` | +| MLA q absorb | `kimi_mla_absorb_q_nope(kv_b_proj, q_nope)` uses preloaded `kv_b_proj` weight; this is the OpenInfer equivalent of vLLM `q_nope @ W_UK_T`. | `worker.rs:1850-1855` | | MLA cache append | `kimi_mla_paged_kv_append(compressed_normed, append_kpe, page tables, positions)` writes worker-owned paged MLA KV. | `worker.rs:1856-1868` | | MLA attention | `kimi_flashinfer_batch_decode_mla(q_abs_nope, q_pe, ckv_cache, kpe_cache, page tables, request_indices, kv metadata)`. | `worker.rs:1880-1895` | -| MLA v up | `kimi_mla_v_up(kv_b_proj, latent)`; this is the PegaInfer equivalent of vLLM `_v_up_proj`. | `worker.rs:1907-1912` | +| MLA v up | `kimi_mla_v_up(kv_b_proj, latent)`; this is the OpenInfer equivalent of vLLM `_v_up_proj`. | `worker.rs:1907-1912` | | MLA output projection | `gemm_graphsafe(o_proj)` then TP all-reduce through BF16-via-F32 bridge, then residual add. | `worker.rs:1913-1934`, `batch_decode_trace.rs:279-291` | | Dense layer 0 MLP | post-attn RMSNorm, separate gate/up GEMMs, `silu_mul_batch`, down GEMM, BF16-via-F32 TP all-reduce, residual add. | `batch_decode_trace.rs:294-327` | | MoE shared expert | post-attn RMSNorm; load-time fused shared gate/up GEMM, `silu_mul_fused_batch_into`, shared down GEMM, BF16-via-F32 TP all-reduce. 
| `worker.rs:2201-2238` | @@ -111,7 +111,7 @@ This count is source-aligned for the high-level worker operators. It still folds ## Trace Drift Fixed In This Session -`pegainfer-kimi-k2/src/batch_decode_trace.rs` differed from `worker.rs` in the first draft of this document: +`openinfer-kimi-k2/src/batch_decode_trace.rs` differed from `worker.rs` in the first draft of this document: | Trace item | Current trace | Actual worker path | Effect | | --- | --- | --- | --- | @@ -129,20 +129,20 @@ Validation: ```bash cargo fmt --all --check -PEGAINFER_CUDA_SM=90a cargo check --release -p pegainfer-kimi-k2 --features kernel-report --bins -PEGAINFER_CUDA_SM=90a cargo run --release -p pegainfer-kimi-k2 --features kernel-report --bin kimi_kernel_report -- \ +OPENINFER_CUDA_SM=90a cargo check --release -p openinfer-kimi-k2 --features kernel-report --bins +OPENINFER_CUDA_SM=90a cargo run --release -p openinfer-kimi-k2 --features kernel-report --bin kimi_kernel_report -- \ trace --source static --batch-size 4 --kv-len 1024 --out $RESULT_ROOT/kimi_decode_trace_fixed_bs4_kv1024.json ``` -H20 validation for the fused-qkv patch used the same `cargo check` and static trace command under `$PEGAINFER_DIR` with `PEGAINFER_TRITON_PYTHON=$PEGAINFER_DIR/.venv-kimi/bin/python`; output was `calls=1886`, `gemm_graphsafe=367`, and `kimi_mla_split_qkv_a=61`. +H20 validation for the fused-qkv patch used the same `cargo check` and static trace command under `$OPENINFER_DIR` with `OPENINFER_TRITON_PYTHON=$OPENINFER_DIR/.venv-kimi/bin/python`; output was `calls=1886`, `gemm_graphsafe=367`, and `kimi_mla_split_qkv_a=61`. 
Runtime model-report validation on H20: ```bash -LD_LIBRARY_PATH=$RESULT_ROOT/pegainfer-nccl-lib:${LD_LIBRARY_PATH:-} \ -PEGAINFER_CUDA_SM=90a \ -PEGAINFER_TRITON_PYTHON=$PEGAINFER_DIR/.venv-kimi/bin/python \ -cargo run --release -p pegainfer-kimi-k2 --features kernel-report --bin kimi_model_report -- \ +LD_LIBRARY_PATH=$RESULT_ROOT/openinfer-nccl-lib:${LD_LIBRARY_PATH:-} \ +OPENINFER_CUDA_SM=90a \ +OPENINFER_TRITON_PYTHON=$OPENINFER_DIR/.venv-kimi/bin/python \ +cargo run --release -p openinfer-kimi-k2 --features kernel-report --bin kimi_model_report -- \ decode --source runtime --batch-size 4 --kv-len 28 --iters 1 --format text \ --out $RESULT_ROOT/kimi_runtime_model_report_bs4_kv28_fixed_trace_v2.json ``` @@ -163,17 +163,17 @@ H20 graph serving gates after fused-qkv: ## Path Differences That Matter -| Difference | vLLM | PegaInfer | Why it matters | +| Difference | vLLM | OpenInfer | Why it matters | | --- | --- | --- | --- | | MLA first projection | One `MergedReplicatedLinear` for `[q_lora_rank, kv_lora_rank + rope_dim]`. | Now one load-time fused `DeviceMatrix` plus one graph-safe GEMM and one split kernel. | This structural delta is closed in code. The keep/revert gate is H20 correctness plus TPOT/model-report improvement. | | Dense gate/up | V1 can use fused `gate_up_proj`; V0 module-level path still exposes gate/up. | Dense layer still uses separate gate/up; MoE shared expert now uses load-time fused gate/up GEMM. | One dense layer only matters little; shared expert repeat cost is now closed at the high-level GEMM count. | | Router GEMM | V1 has small-batch `dsv3_router_gemm` / `router_gemm_bf16_fp32` path before grouped top-k. | `kimi_router_noaux_tc_launch` is a single custom router/top-k kernel path. | Need compare microbench, not assume; router was ~3.7ms/step in old strong-sync profile. | -| MLA cache append and metadata | vLLM uses `concat_and_cache_mla`; FlashMLA prepares tile scheduler metadata and graph buffers. 
| PegaInfer uses `kimi_mla_paged_kv_append` and precomputed decode arena arrays. | Need compare metadata/cache append cost before changing attention kernels; trace currently hides this. | -| MLA q absorb/v up | vLLM uses `torch.bmm` with preprocessed `W_UK_T/W_UV`. | PegaInfer custom kernels `kimi_mla_absorb_q_nope` and `kimi_mla_v_up` over `kv_b_proj`. | Semantically aligned, but microbench should decide whether custom kernels or cuBLAS batched GEMM wins for bs1..4. | -| MoE WNA16 | Both use Marlin WNA16 route align, W13, SiLU, W2, sum. | PegaInfer has persistent workspace and explicit local EP route metadata. | Main MoE kernel choice is already aligned; next work is route histogram/tail and combine, not replacing WNA16. | -| Routed combine | vLLM EP path maps local experts via `expert_map`; final tensor-parallel reduce happens through vLLM distributed path. | PegaInfer currently uses NCCL bridge: local sum -> repeat -> reduce-scatter -> fused scale+residual add. | This is not PPLX EP; it is graph-capturable but likely still extra data movement. | -| TP collectives | vLLM parallel layers hide TP reductions; BF16 path does not visibly use our BF16-via-F32 bridge. | PegaInfer uses BF16-via-F32 bridge for hidden all-reduces because BF16 collective changed greedy output. | This is correctness-driven overhead; replacing it needs external vLLM greedy/top-k gate. | -| Sampling/top1 | vLLM sampling/logprobs is integrated with its sampler path. | PegaInfer graph body ends at local top1; worker D2H reads local top1 and scheduler CPU-selects across ranks. | This graph-external boundary is real, but prior profile says it is not the largest item; fix after trace/accounting is accurate. | +| MLA cache append and metadata | vLLM uses `concat_and_cache_mla`; FlashMLA prepares tile scheduler metadata and graph buffers. | OpenInfer uses `kimi_mla_paged_kv_append` and precomputed decode arena arrays. 
| Need compare metadata/cache append cost before changing attention kernels; trace currently hides this. | +| MLA q absorb/v up | vLLM uses `torch.bmm` with preprocessed `W_UK_T/W_UV`. | OpenInfer custom kernels `kimi_mla_absorb_q_nope` and `kimi_mla_v_up` over `kv_b_proj`. | Semantically aligned, but microbench should decide whether custom kernels or cuBLAS batched GEMM wins for bs1..4. | +| MoE WNA16 | Both use Marlin WNA16 route align, W13, SiLU, W2, sum. | OpenInfer has persistent workspace and explicit local EP route metadata. | Main MoE kernel choice is already aligned; next work is route histogram/tail and combine, not replacing WNA16. | +| Routed combine | vLLM EP path maps local experts via `expert_map`; final tensor-parallel reduce happens through vLLM distributed path. | OpenInfer currently uses NCCL bridge: local sum -> repeat -> reduce-scatter -> fused scale+residual add. | This is not PPLX EP; it is graph-capturable but likely still extra data movement. | +| TP collectives | vLLM parallel layers hide TP reductions; BF16 path does not visibly use our BF16-via-F32 bridge. | OpenInfer uses BF16-via-F32 bridge for hidden all-reduces because BF16 collective changed greedy output. | This is correctness-driven overhead; replacing it needs external vLLM greedy/top-k gate. | +| Sampling/top1 | vLLM sampling/logprobs is integrated with its sampler path. | OpenInfer graph body ends at local top1; worker D2H reads local top1 and scheduler CPU-selects across ranks. | This graph-external boundary is real, but prior profile says it is not the largest item; fix after trace/accounting is accurate. 
| ## Routed Bridge Probe @@ -181,9 +181,9 @@ Historical `kimi_graph_probe --probe routed-bridge-compare` (since retired, see ## TP-Only MoE Cadence Probe -Hypatia 对 `$LOCAL_VLLM_DIR` 的 Kimi/DeepSeekV3 TP-only path 做了源码对照:vLLM decode 是 embedding `1` 次、attention `61` 次、dense layer0 `1` 次、MoE final `60` 次 BF16 all-reduce,总计 `123` 次 BF16 all-reduce,MoE TP-only path 不使用 reduce-scatter。PegaInfer 当前是同样 `123` 次 logical hidden all-reduce,再额外加 `60` 次 routed `repeat+RS` bridge。 +Hypatia 对 `$LOCAL_VLLM_DIR` 的 Kimi/DeepSeekV3 TP-only path 做了源码对照:vLLM decode 是 embedding `1` 次、attention `61` 次、dense layer0 `1` 次、MoE final `60` 次 BF16 all-reduce,总计 `123` 次 BF16 all-reduce,MoE TP-only path 不使用 reduce-scatter。OpenInfer 当前是同样 `123` 次 logical hidden all-reduce,再额外加 `60` 次 routed `repeat+RS` bridge。 -把 PegaInfer decode MoE 临时改成 vLLM TP-only final all-reduce 后,H20 correctness 通过但性能回退: +把 OpenInfer decode MoE 临时改成 vLLM TP-only final all-reduce 后,H20 correctness 通过但性能回退: | Variant | output16 steady | output64 steady | Decision | | --- | --- | --- | --- | @@ -198,4 +198,4 @@ Conclusion: source-level cadence parity alone is not a keep criterion. The next 1. Profile any remaining p99/max tail under dense/shared gate-up fusion plus routed scaled-add fusion and Marlin locks clear removal: output64 avg/p50/p95/p99 are now around `14.4/14.5/14.9/14.8ms`, with p99 under `15ms` in the latest kept gate. 2. Revisit full shared/EP communication overlap only with a production-shaped NCCL probe; isolated two-comm graph replay wins, but worker two-comm init/capture is not stable enough to ship. 3. Next graph-safe local wins: keep Marlin output clears unless route metadata proves every consumed row is written, add `kimi_mla_paged_kv_append` provider coverage, and design a real AG/RS or PPLX EP combine path that removes the repeat-for-RS bridge. -4. 
Keep MoE WNA16 kernel path unchanged until the corrected report shows a measured win candidate; current vLLM/PegaInfer MoE compute path is already structurally close. +4. Keep MoE WNA16 kernel path unchanged until the corrected report shows a measured win candidate; current vLLM/OpenInfer MoE compute path is already structurally close. diff --git a/docs/models/qwen3/accuracy-gate.md b/docs/models/qwen3/accuracy-gate.md index 63afb05f..18b0e332 100644 --- a/docs/models/qwen3/accuracy-gate.md +++ b/docs/models/qwen3/accuracy-gate.md @@ -1,6 +1,6 @@ # Qwen3-4B accuracy gate -**TL;DR**: Qwen3-4B's logits are guarded by `tests/hf_golden_gate.rs` — a tolerance check against a stored HuggingFace bf16 golden, *not* an exact-text or hash baseline. It teacher-forces 48 fixed sequences and asserts pegainfer's logprobs stay at the bf16 noise floor of HF across bs=1 / batched eager / CUDA-graph. Strict guards: a structural **regret** check on the argmax + **mean** delta ≤ 0.06 nat + **p99** delta ≤ 0.20 nat; the absolute max is printed but not asserted (it is coverage-unstable). This is the reference implementation of the pattern in `subsystems/correctness/logits-golden-gate.md` — read that for the *why*; this doc is the Qwen3-4B *specifics*. +**TL;DR**: Qwen3-4B's logits are guarded by `tests/hf_golden_gate.rs` — a tolerance check against a stored HuggingFace bf16 golden, *not* an exact-text or hash baseline. It teacher-forces 48 fixed sequences and asserts openinfer's logprobs stay at the bf16 noise floor of HF across bs=1 / batched eager / CUDA-graph. Strict guards: a structural **regret** check on the argmax + **mean** delta ≤ 0.06 nat + **p99** delta ≤ 0.20 nat; the absolute max is printed but not asserted (it is coverage-unstable). This is the reference implementation of the pattern in `subsystems/correctness/logits-golden-gate.md` — read that for the *why*; this doc is the Qwen3-4B *specifics*. 
Last touched: 2026-05 @@ -16,7 +16,7 @@ The methodology (why HF, why a tolerance not a hash, why teacher-forcing, why re | Reference top-K | HF bf16 top-64 logprobs per position | dumper | | Regret tolerance | `MARGIN_TOL` = 0.20 nat | gate | | Mean / p99 bounds | `MEAN_TOL` = 0.06, `P99_TOL` = 0.20 | gate | -| Head tokens compared | top `HEAD_K` = 8 of pegainfer's own picks | gate | +| Head tokens compared | top `HEAD_K` = 8 of openinfer's own picks | gate | | Graph-bucket straddles | `BUCKET_STRADDLES = [9, 5]` (9→bucket 16 = 7 pad; 5→bucket 8 = 3 pad) | gate, from `batch_decode.rs` buckets | Prompt lengths reach 256 tokens (up to 16 KV blocks at block_size 16) on purpose: the gate then exercises long-attention / KV-block indexing / high RoPE positions, not just short prompts. @@ -46,7 +46,7 @@ Verified run, all four passes green in 26s: | graph (9 padded) | 153 | 0.0337 | 0.0260 | 0.1297 | 0.4374 | | graph (5 padded) | 85 | 0.0316 | 0.0253 | 0.1080 | 0.1410 | -**mean (~0.032) and p99 (~0.12) are dead stable; only the absolute max moves** — which is why max is printed, not asserted. The single worst token (seq 7 / pos 5 / token 68172) is the *same* across bs=1 / eager-9 / graph-9: a deep-tail token at logprob ≈−10, far below the argmax. HF is fixed at −10.2508; pegainfer reads −9.8759 at bs=1 and −9.8134 in the 9-seq batch — the delta swings 0.3749→0.4374 purely from batch-dependent reduction order, with zero effect on the argmax. eager-9 and graph-9 are bit-identical, so the CUDA-graph path matches eager exactly at the same composition; only batch composition moves the number. As coverage grew (108→816 positions over the redesign) the max climbed 0.26→0.44 while mean/p99 held — the absolute max is a coverage treadmill, not a drift signal. +**mean (~0.032) and p99 (~0.12) are dead stable; only the absolute max moves** — which is why max is printed, not asserted. 
The single worst token (seq 7 / pos 5 / token 68172) is the *same* across bs=1 / eager-9 / graph-9: a deep-tail token at logprob ≈−10, far below the argmax. HF is fixed at −10.2508; openinfer reads −9.8759 at bs=1 and −9.8134 in the 9-seq batch — the delta swings 0.3749→0.4374 purely from batch-dependent reduction order, with zero effect on the argmax. eager-9 and graph-9 are bit-identical, so the CUDA-graph path matches eager exactly at the same composition; only batch composition moves the number. As coverage grew (108→816 positions over the redesign) the max climbed 0.26→0.44 while mean/p99 held — the absolute max is a coverage treadmill, not a drift signal. Tolerances were calibrated from this floor, strictly: `MEAN_TOL` 0.06 ≈ 2× the measured mean; `P99_TOL` 0.20 ≈ 1.6× the measured p99. Not comfortable round numbers — a loose gate would silently miss real drift smaller than its headroom. @@ -58,8 +58,8 @@ After a change that legitimately alters numerical output, recompute the golden o uv run --no-project python tools/accuracy/dump_qwen3_4b_hf_golden.py \ --model-path /data/models/Qwen3-4B --out test_data/qwen3-4b-hf-golden.safetensors -PEGAINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B \ - cargo test --release -p pegainfer-qwen3-4b --test hf_golden_gate -- --nocapture +OPENINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B \ + cargo test --release -p openinfer-qwen3-4b --test hf_golden_gate -- --nocapture ``` ## Diagnosing a red gate @@ -67,8 +67,8 @@ PEGAINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B \ The gate prints the full delta distribution and the worst position (`seq`, `pos`, `token`, both logprobs) before it fails. Read that first: - **`mean` over `MEAN_TOL` (or `p99` over `P99_TOL`), max near the floor** → a *systematic* drift: something shifted every logit a little (a kernel change, a dtype/rounding change, a norm/RoPE regression). Real bug — bisect the change. 
-- **`mean`/`p99` at the floor, one lone `max` outlier** → a localised token error, or just a new bf16 tail outlier on different hardware. Adjudicate with fp32: regenerate the golden with `--dtype float32` and compare. If pegainfer tracks fp32 truth as well as HF-bf16 does, it is bf16 noise — the gate does not assert max precisely so this should not have failed; if you must widen `MEAN_TOL`/`P99_TOL`, record the measurement and multiple here. -- **regret / argmax violation** → HF had a clear winner (regret > 0.20 nat) and pegainfer disagreed, or pegainfer's pick is absent from HF's top-64 entirely. Almost always a real wrong-token bug; 0.20 nat is far above a tie. +- **`mean`/`p99` at the floor, one lone `max` outlier** → a localised token error, or just a new bf16 tail outlier on different hardware. Adjudicate with fp32: regenerate the golden with `--dtype float32` and compare. If openinfer tracks fp32 truth as well as HF-bf16 does, it is bf16 noise — the gate does not assert max precisely so this should not have failed; if you must widen `MEAN_TOL`/`P99_TOL`, record the measurement and multiple here. +- **regret / argmax violation** → HF had a clear winner (regret > 0.20 nat) and openinfer disagreed, or openinfer's pick is absent from HF's top-64 entirely. Almost always a real wrong-token bug; 0.20 nat is far above a tie. ## Next step diff --git a/docs/models/qwen3/kernels-crate.md b/docs/models/qwen3/kernels-crate.md index f6af61fb..734bb09f 100644 --- a/docs/models/qwen3/kernels-crate.md +++ b/docs/models/qwen3/kernels-crate.md @@ -2,78 +2,78 @@ **Created**: 2026-05-03 **Status**: complete -**TL;DR**: Phase 1 now extracts the Qwen3-4B dense full-attention kernel surface into `crates/pegainfer-kernels`, with a compact kernel index so future LLM sessions can jump from model DAG nodes to Rust wrappers, FFI symbols, CUDA/Triton sources, and shape constraints. `KvPool`, `PagePool`, and `SamplingParams` stay in the root runtime. 
Local metadata/format checks pass; GPU release build, release test-target compilation, release clippy, Qwen3-4B e2e, and `bench_serving snapshot` pass. +**TL;DR**: Phase 1 now extracts the Qwen3-4B dense full-attention kernel surface into `crates/openinfer-kernels`, with a compact kernel index so future LLM sessions can jump from model DAG nodes to Rust wrappers, FFI symbols, CUDA/Triton sources, and shape constraints. `KvPool`, `PagePool`, and `SamplingParams` stay in the root runtime. Local metadata/format checks pass; GPU release build, release test-target compilation, release clippy, Qwen3-4B e2e, and `bench_serving snapshot` pass. ## Preparation - **Read**: - `docs/index.md` - confirmed the relevant architecture, kernel, TP, benchmarking, and Qwen3 history docs. - - `docs/subsystems/kernels/pegainfer-kernels-boundary.md` - recorded the per-model engine direction, but its near-term ordering needs to be corrected from ledger-first to crate-first. + - `docs/subsystems/kernels/openinfer-kernels-boundary.md` - recorded the per-model engine direction, but its near-term ordering needs to be corrected from ledger-first to crate-first. - `docs/models/qwen3/tp-design.md` - confirmed Qwen3-4B TP constraints and runtime hazards around per-thread CUDA/cuBLAS state. - `src/model/qwen3/*`, `src/ops/*`, `src/ffi.rs`, `src/tensor.rs`, `src/kv_pool.rs`, `src/page_pool.rs`, and `build.rs` - mapped the current Qwen3-4B kernel calls, tensor/runtime dependencies, paged KV metadata, and CUDA/Triton build pipeline. - **Relevant history**: - `docs/models/qwen3/tp-design.md` shows that Qwen3 execution is already rank-local and step-oriented, so the kernel crate must not hide device binding or TP collective points. - **Plan**: - 1. Convert the repository into a Cargo workspace while keeping the root `pegainfer` package as the server/control-plane crate. - 2. 
Create `crates/pegainfer-kernels` with the Qwen3-4B kernel surface: kernel ABI tensor helpers, Qwen3-used `ops`, FFI declarations, CUDA/Triton build support, and Qwen3 paged-attention layout metadata helpers. - 3. Move Qwen3 call sites to import `pegainfer_kernels::{ops, tensor}` and remove direct Qwen3 dependence on root-local `ops`, `ffi`, and `tensor` modules. + 1. Convert the repository into a Cargo workspace while keeping the root `openinfer` package as the server/control-plane crate. + 2. Create `crates/openinfer-kernels` with the Qwen3-4B kernel surface: kernel ABI tensor helpers, Qwen3-used `ops`, FFI declarations, CUDA/Triton build support, and Qwen3 paged-attention layout metadata helpers. + 3. Move Qwen3 call sites to import `openinfer_kernels::{ops, tensor}` and remove direct Qwen3 dependence on root-local `ops`, `ffi`, and `tensor` modules. 4. Preserve repository build health. If Qwen3.5 still requires symbols from the old combined CUDA library, either keep those symbols as compatibility exports in the kernels crate or explicitly document and gate any temporary Qwen3-only limitation before making code changes. 5. Add a kernel index for LLM navigation under the new crate: - `KERNELS.md`: short human/LLM routing table from `qwen3_4b::::` to Rust wrapper, FFI symbol, source file, backend, shape/layout constraints, and status. - Machine-readable model DAG metadata should wait for the Qwen3-4B model crate, where it can be generated or validated from model code instead of hand-maintained in the generic kernels crate. - 6. Update `docs/subsystems/kernels/pegainfer-kernels-boundary.md` and `docs/index.md` so the recorded next step is crate-first, with ledger/trace/simulator as metadata products of the crate boundary. + 6. Update `docs/subsystems/kernels/openinfer-kernels-boundary.md` and `docs/index.md` so the recorded next step is crate-first, with ledger/trace/simulator as metadata products of the crate boundary. 7. 
Verify with `cargo test --release` or, if the local environment blocks full release tests, at least `cargo check --release` and report the exact blocker. - **Risks / open questions**: - - A strict Qwen3-only CUDA extraction can conflict with the current default binary because Qwen3.5 still compiles in the same root crate and references some shared FFI symbols. The safest implementation may need to move the link/build owner to `pegainfer-kernels` while only stabilizing and indexing the Qwen3 API first. + - A strict Qwen3-only CUDA extraction can conflict with the current default binary because Qwen3.5 still compiles in the same root crate and references some shared FFI symbols. The safest implementation may need to move the link/build owner to `openinfer-kernels` while only stabilizing and indexing the Qwen3 API first. - `kv_pool` and `page_pool` sit between model state and kernel metadata. For Phase 1, only the kernel-facing layout/descriptor pieces should move if needed; scheduler-owned allocation policy should remain in the root crate unless compilation forces a narrower split. - - Build-script path handling is fragile when moving kernel source into `crates/pegainfer-kernels/`. The plan should prefer one build owner and avoid compiling the same C symbols in both root and dependency crates. + - Build-script path handling is fragile when moving kernel source into `crates/openinfer-kernels/`. The plan should prefer one build owner and avoid compiling the same C symbols in both root and dependency crates. ## Execution Log ### Step 1: Create kernels crate and move build ownership -- Converted the repository into a Cargo workspace with `crates/pegainfer-kernels`. -- Added `pegainfer-kernels` as a root dependency. -- Moved CUDA source from root `csrc/` to `crates/pegainfer-kernels/csrc/`. -- Moved Triton AOT files from root `tools/triton/` to `crates/pegainfer-kernels/tools/triton/`. 
-- Moved the FlashInfer submodule path from `third_party/flashinfer` to `crates/pegainfer-kernels/third_party/flashinfer`. -- Replaced the root `build.rs` with an intentionally empty build script; `crates/pegainfer-kernels/build.rs` now owns CUDA/Triton compilation. +- Converted the repository into a Cargo workspace with `crates/openinfer-kernels`. +- Added `openinfer-kernels` as a root dependency. +- Moved CUDA source from root `csrc/` to `crates/openinfer-kernels/csrc/`. +- Moved Triton AOT files from root `tools/triton/` to `crates/openinfer-kernels/tools/triton/`. +- Moved the FlashInfer submodule path from `third_party/flashinfer` to `crates/openinfer-kernels/third_party/flashinfer`. +- Replaced the root `build.rs` with an intentionally empty build script; `crates/openinfer-kernels/build.rs` now owns CUDA/Triton compilation. -- Moved kernel-owned ABI and operator code into `crates/pegainfer-kernels/src/`: `ffi`, tensor helpers, paged-KV geometry metadata, and the Qwen3-used `ops` modules. +- Moved kernel-owned ABI and operator code into `crates/openinfer-kernels/src/`: `ffi`, tensor helpers, paged-KV geometry metadata, and the Qwen3-used `ops` modules. - Kept `KvPool`, `PagePool`, and `SamplingParams` in the root crate because they are runtime allocation/policy state, not kernels. - Replaced root `src/ffi.rs` and `src/tensor.rs` with compatibility re-exports. -- Replaced root `src/ops.rs` with re-exports from `pegainfer-kernels` plus thin root adapters for sampling, paged prefill planning, paged attention layout conversion, and the remaining Qwen3.5 recurrent wrapper. +- Replaced root `src/ops.rs` with re-exports from `openinfer-kernels` plus thin root adapters for sampling, paged prefill planning, paged attention layout conversion, and the remaining Qwen3.5 recurrent wrapper. - Removed duplicate root `src/ops/{attention,elementwise,embedding,linear,norm,sampling}.rs`. 
- Kept `src/ops/recurrent.rs` in root for now because it depends on Qwen3.5's model-local `GdrChunkwiseScratch35`; moving that would expand Phase 1 beyond Qwen3-4B. ### Step 3: Add kernel index for LLM navigation -- Added `crates/pegainfer-kernels/KERNELS.md`. +- Added `crates/openinfer-kernels/KERNELS.md`. - The index maps each Qwen3-4B op ID to phase, Rust wrapper, FFI symbol, source file, backend, and shape/layout notes. - Removed the initial `kernel_manifest/qwen3_4b.toml` idea from the kernels crate. A hand-maintained machine-readable manifest in the generic kernel crate would drift; the right place is the future Qwen3-4B model crate, where the manifest can describe the model DAG and be generated or checked against code. ### Step 4: Documentation updates -- Updated `CLAUDE.md`, `README.md`, and `docs/playbooks/developer-onboarding.md` to point CUDA/Triton paths at `crates/pegainfer-kernels/`. -- Updated `docs/subsystems/kernels/pegainfer-kernels-boundary.md` to record crate-first ordering before ledger/simulator work. +- Updated `CLAUDE.md`, `README.md`, and `docs/playbooks/developer-onboarding.md` to point CUDA/Triton paths at `crates/openinfer-kernels/`. +- Updated `docs/subsystems/kernels/openinfer-kernels-boundary.md` to record crate-first ordering before ledger/simulator work. ### Step 5: Verification -- `cargo metadata --no-deps --format-version 1` succeeded and showed both workspace packages: root `pegainfer` and `pegainfer-kernels`. +- `cargo metadata --no-deps --format-version 1` succeeded and showed both workspace packages: root `openinfer` and `openinfer-kernels`. - `cargo fmt --all` applied formatting, then `cargo fmt --all --check` passed. -- `PEGAINFER_CUDA_SM=120 cargo check --release` reached the `pegainfer-kernels` build script and failed at `nvcc` execution because this machine has no `nvcc`. 
+- `OPENINFER_CUDA_SM=120 cargo check --release` reached the `openinfer-kernels` build script and failed at `nvcc` execution because this machine has no `nvcc`. ### Step 6: GPU release compile - Avoided overwriting `` because that validation checkout has unrelated uncommitted work. - Synced the local working tree to `` with `rsync`, excluding `.git/`, `target/`, `.venv/`, and `models/`. -- Copied the existing validation FlashInfer submodule contents from `/third_party/flashinfer` into `crates/pegainfer-kernels/third_party/flashinfer` inside the build directory. -- `PEGAINFER_CUDA_SM=120 cargo build --release` passed on the CUDA validation host. First pass exposed two Rust warnings from this split (`SamplingParams::is_greedy` unused and root `PrefillPagedPlan` visibility too wide); both were cleaned up. -- Re-synced and reran `PEGAINFER_CUDA_SM=120 cargo build --release`; it passed in 14.16s with only build-script informational warnings. -- `PEGAINFER_CUDA_SM=120 cargo test --release --no-run` passed in 12.28s and compiled all unit, binary, e2e, paged-attention, and regen test targets. +- Copied the existing validation FlashInfer submodule contents from `/third_party/flashinfer` into `crates/openinfer-kernels/third_party/flashinfer` inside the build directory. +- `OPENINFER_CUDA_SM=120 cargo build --release` passed on the CUDA validation host. First pass exposed two Rust warnings from this split (`SamplingParams::is_greedy` unused and root `PrefillPagedPlan` visibility too wide); both were cleaned up. +- Re-synced and reran `OPENINFER_CUDA_SM=120 cargo build --release`; it passed in 14.16s with only build-script informational warnings. +- `OPENINFER_CUDA_SM=120 cargo test --release --no-run` passed in 12.28s and compiled all unit, binary, e2e, paged-attention, and regen test targets. 
### Step 7: GPU e2e and serving benchmark - Ran Qwen3-4B e2e on the same validation build directory: - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release --test e2e -- --nocapture` + - `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release --test e2e -- --nocapture` - Result: pass, 1 test passed in 9.36s. - Covered greedy golden outputs, multi-request generation, and consumer-drop scheduler survival. - Ran the standard in-process serving snapshot: - - `RUST_LOG=warn PEGAINFER_CUDA_SM=120 cargo run --release --bin bench_serving -- --model-path snapshot` + - `RUST_LOG=warn OPENINFER_CUDA_SM=120 cargo run --release --bin bench_serving -- --model-path snapshot` - Result: pass. - RTX 5090 Qwen3-4B snapshot: - `prefill_heavy (10000,1)`: TTFT p50 `501.93ms`, p99 `503.75ms`. @@ -87,17 +87,17 @@ - Ran local `cargo fmt --all --check`: pass. - Ran local `cargo metadata --no-deps --format-version 1`: pass. - Synced the current working tree to ``. -- Ran `PEGAINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` on the CUDA validation host: pass in 1m42s. +- Ran `OPENINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` on the CUDA validation host: pass in 1m42s. ### Unexpected -- Local `cargo check --release` reached `pegainfer-kernels` build script but failed because this machine does not have `nvcc`; the user will provide a GPU build machine for compilation. -- A second `cargo check --release -p pegainfer-kernels --lib` without `PEGAINFER_CUDA_SM` failed earlier at GPU SM detection, which is expected on this local machine without `nvidia-smi`. +- Local `cargo check --release` reached `openinfer-kernels` build script but failed because this machine does not have `nvcc`; the user will provide a GPU build machine for compilation. 
+- A second `cargo check --release -p openinfer-kernels --lib` without `OPENINFER_CUDA_SM` failed earlier at GPU SM detection, which is expected on this local machine without `nvidia-smi`. - The validation checkout was dirty, so verification used a separate validation build directory instead of modifying that checkout. - The validation build directory does not include `.git/`, so `bench_serving snapshot` reports `commit: unknown`. ## Debrief -- **Outcome**: Implemented and validated the crate-first Phase 1 split. Kernel source, Triton source, FlashInfer submodule ownership, CUDA/Triton build script, FFI, kernel ABI tensor helpers, paged-KV layout metadata, and Qwen3-used Rust ops now live under `crates/pegainfer-kernels`. Root `pegainfer` keeps server/model code, `KvPool`, `PagePool`, `SamplingParams`, and thin compatibility adapters. The split passes local format/metadata checks, GPU release build/test-target compilation, release clippy, Qwen3-4B e2e, and the standard Qwen3-4B `bench_serving snapshot`. +- **Outcome**: Implemented and validated the crate-first Phase 1 split. Kernel source, Triton source, FlashInfer submodule ownership, CUDA/Triton build script, FFI, kernel ABI tensor helpers, paged-KV layout metadata, and Qwen3-used Rust ops now live under `crates/openinfer-kernels`. Root `openinfer` keeps server/model code, `KvPool`, `PagePool`, `SamplingParams`, and thin compatibility adapters. The split passes local format/metadata checks, GPU release build/test-target compilation, release clippy, Qwen3-4B e2e, and the standard Qwen3-4B `bench_serving snapshot`. - **Pitfalls encountered**: - Root `src/ops/recurrent.rs` cannot be moved cleanly in this pass because it takes Qwen3.5's `GdrChunkwiseScratch35` type. Moving it would pull hybrid-model scratch ownership into the kernels crate, which is outside the Qwen3-4B Phase 1 scope. - Initially moved `KvPool`, `PagePool`, and `SamplingParams` into the kernels crate. 
That was too broad; those belong to runtime policy and have been moved back to root. @@ -106,6 +106,6 @@ - The kernel crate should own source and build artifacts physically, not only re-export copied Rust wrappers. Keeping `csrc/`, `tools/triton/`, and `third_party/flashinfer` in root creates exactly the duplicate context we are trying to remove. - The human/LLM routing index belongs beside the kernels crate because it helps edit reusable kernels. Machine-readable model DAG manifests should not live there unless they are generated or validated; they belong with the model crate that owns the DAG. - **Follow-ups**: - - Phase 2 can extract the Qwen3 model crate on top of `pegainfer-kernels`. + - Phase 2 can extract the Qwen3 model crate on top of `openinfer-kernels`. - In the Qwen3 model crate, define the model-owned kernel DAG and decide whether any TOML/JSON manifest is generated from Rust code, validated against wrappers, or avoided entirely in favor of trace IDs emitted directly from the executor. - Run Qwen3.5 e2e separately on a box with `` if later changes touch the compatibility kernels or recurrent wrappers. diff --git a/docs/models/qwen3/kv-pressure-hang.md b/docs/models/qwen3/kv-pressure-hang.md index 3c3c1a76..40b5bc89 100644 --- a/docs/models/qwen3/kv-pressure-hang.md +++ b/docs/models/qwen3/kv-pressure-hang.md @@ -15,11 +15,11 @@ - `.codex/harness/README.md` - confirms the verification ladder and safety boundaries. - `.codex/harness/commands.md` - provides Qwen3 e2e, server, and benchmark commands. - `.codex/harness/verification.md` - classifies this as serving/scheduler behavior needing a narrow repro plus HTTP/benchmark evidence. - - `pegainfer-qwen3-4b/src/scheduler.rs` - admission control currently defers requests under KV pressure. - - `pegainfer-qwen3-4b/src/scheduler/plan.rs` - execution plans currently consume pending requests before failures are handled. 
- - `pegainfer-qwen3-4b/src/scheduler/effects.rs` - successful finishes drop request state; scheduler execution errors do not. - - `pegainfer-qwen3-4b/src/executor.rs` - `drop_request` is the existing owner API for releasing per-request KV state. - - `pegainfer-core/src/kv_pool.rs` and `pegainfer-core/src/page_pool.rs` - KV pages are RAII-returned only when request state is dropped. + - `openinfer-qwen3-4b/src/scheduler.rs` - admission control currently defers requests under KV pressure. + - `openinfer-qwen3-4b/src/scheduler/plan.rs` - execution plans currently consume pending requests before failures are handled. + - `openinfer-qwen3-4b/src/scheduler/effects.rs` - successful finishes drop request state; scheduler execution errors do not. + - `openinfer-qwen3-4b/src/executor.rs` - `drop_request` is the existing owner API for releasing per-request KV state. + - `openinfer-core/src/kv_pool.rs` and `openinfer-core/src/page_pool.rs` - KV pages are RAII-returned only when request state is dropped. - GitHub issue #85 - observed server stays alive but completions hang after QPS=2 KV pressure. - **Relevant history**: - `docs/subsystems/scheduler/scheduler.md` - QPS=2 varied workload is near capacity and already had some failed requests; the fix must handle pressure explicitly rather than claim higher throughput. @@ -45,7 +45,7 @@ - decode errors surfacing as `TokenEvent::Error`, dropping request state, and allowing recovery; - client/receiver drop releasing request state. - Changed `DecodeEffect::EmitAndContinue` send-failure handling to call `drop_request` before retiring the active request. -- Result: remote RTX 5090 `cargo test --release -p pegainfer-qwen3-4b --lib scheduler -- --nocapture` passed, `4 passed`. +- Result: remote RTX 5090 `cargo test --release -p openinfer-qwen3-4b --lib scheduler -- --nocapture` passed, `4 passed`. 
### Step 2: Maintainer feedback refinement - The maintainer clarified that the basic fix should keep requests that cannot get KV allocation in the waiting queue; preemption can be deferred. @@ -59,21 +59,21 @@ ### Step 3: Build and static gates - Remote environment: - GPU: NVIDIA GeForce RTX 5090, driver `580.76.05`, 32607 MiB. - - CUDA: `nvcc` `13.0.88`, `PEGAINFER_CUDA_SM=120`. + - CUDA: `nvcc` `13.0.88`, `OPENINFER_CUDA_SM=120`. - Rust: `rustc 1.97.0-nightly (7c3c88f42 2026-05-14)`. - Model: `models/Qwen3-4B`, HF revision metadata `1cfa9a7208912126459214e8b04321603b3df60c`. - Commands: - `cargo fmt --check` — passed. - - `cargo test --release -p pegainfer-qwen3-4b --lib scheduler -- --nocapture` — passed, `4 passed`. - - `cargo clippy --release -p pegainfer-qwen3-4b --lib -- -D warnings` — passed. - - `cargo build --release -p pegainfer-server` — passed. + - `cargo test --release -p openinfer-qwen3-4b --lib scheduler -- --nocapture` — passed, `4 passed`. + - `cargo clippy --release -p openinfer-qwen3-4b --lib -- -D warnings` — passed. + - `cargo build --release -p openinfer-server` — passed. - Local command: - `~/.cargo/bin/cargo fmt --check` — passed. ### Step 4: E2E and serving pressure validation - Installed `vllm 0.21.0` in the validation venv to run the issue's real `vllm bench serve` client. - Ran a host-local exact e2e check against the validation model snapshot: - - `PEGAINFER_TEST_MODEL_PATH=models/Qwen3-4B cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` + - `OPENINFER_TEST_MODEL_PATH=models/Qwen3-4B cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` - Result after local fixture regeneration for that model snapshot: passed, `1 passed`. - PR review later found the regenerated fixture was not portable to the standard local model snapshot, so the repository `test_data/Qwen3-4B.json` change was reverted and this e2e result is not used as a merge gate. 
- Ran a small issue-shaped benchmark first: @@ -90,19 +90,19 @@ ### Step 5: Compatibility fix encountered during validation - Remote CUDA 13.0 initially failed with the existing `cudarc` `cuda-13010` feature because the driver/runtime lacked `cuDevSmResourceSplit`. - Kept the workspace on `cuda-13010`; changing the shared `cudarc` feature would widen the PR's collaboration surface beyond issue #85. -- Fixed `qwen3_decode_context` test-target compilation by linking `cudaProfilerStart/Stop` directly from `cudart`; the symbols were not exposed through `pegainfer_core::ffi`. +- Fixed `qwen3_decode_context` test-target compilation by linking `cudaProfilerStart/Stop` directly from `cudart`; the symbols were not exposed through `openinfer_core::ffi`. ### Step 6: Final diff hygiene - `git diff --check` — passed. -- Confirmed the remote pegainfer server process was stopped after validation. +- Confirmed the remote openinfer server process was stopped after validation. ### Step 7: Maintainer-style review follow-up - Re-reviewed the changed scheduler and bridge paths after the main fix. - Found one API-contract issue: `TokenEvent::Rejected` was being translated to vLLM `EngineCoreFinishReason::Stop`, which would make an impossible KV request look like an empty successful response. -- Changed `pegainfer-server/src/vllm_frontend.rs` so `Rejected` maps to `EngineCoreFinishReason::Error` with the rejection message as `stop_reason`. +- Changed `openinfer-server/src/vllm_frontend.rs` so `Rejected` maps to `EngineCoreFinishReason::Error` with the rejection message as `stop_reason`. - Added `vllm_frontend::tests::rejected_request_is_reported_as_error`. - Remote RTX 5090 command: - - `cargo test --release -p pegainfer-server rejected_request_is_reported_as_error --lib` — passed, `1 passed`. + - `cargo test --release -p openinfer-server rejected_request_is_reported_as_error --lib` — passed, `1 passed`. 
### Step 8: PR review comment follow-up - Read PR #131 review comments from `gemini-code-assist`. The comments claimed the KV budget formulas should use `prompt_len + max_tokens` and `prompt_len + generated_count`. diff --git a/docs/models/qwen3/model-crate.md b/docs/models/qwen3/model-crate.md index 39636723..35403417 100644 --- a/docs/models/qwen3/model-crate.md +++ b/docs/models/qwen3/model-crate.md @@ -2,23 +2,23 @@ **Created**: 2026-05-03 **Status**: ready for diff review -**TL;DR**: `crates/pegainfer-qwen3-4b` now owns Qwen3 config, weights, execution, scheduler, tests, benches, and kernel plan. Root `pegainfer` loads Qwen3 through a generic `EngineHandle` and no longer contains `Qwen3Model`, `Qwen3Executor`, `ModelRuntimeConfig`, root Qwen3 tests, or `src/model/qwen3/*`. The old `ModelForward` path has been removed; decode length-limit now emits the final token before `Finished`. Long-context `bs=1` TPOT was traced to non-partition FlashInfer paged decode under-filling the GPU; Qwen3 runtime gates FlashInfer split-K decode for `padded_bs<=2 && seq_len>=1024` and was retuned to `chunk_tokens=256,max_chunks=64`, cutting 4k/64 serving steady TPOT from about `11.7ms` to `6.46ms` on RTX 5090. Qwen3 now keeps a single model-crate bench entry: `qwen3_kernel_snapshot`, a JSON snapshot runner with warm/cold-L2 latency, default-on CUPTI counters, and compare. Correctness/truth is intentionally out of this snapshot for now. +**TL;DR**: `crates/openinfer-qwen3-4b` now owns Qwen3 config, weights, execution, scheduler, tests, benches, and kernel plan. Root `openinfer` loads Qwen3 through a generic `EngineHandle` and no longer contains `Qwen3Model`, `Qwen3Executor`, `ModelRuntimeConfig`, root Qwen3 tests, or `src/model/qwen3/*`. The old `ModelForward` path has been removed; decode length-limit now emits the final token before `Finished`. 
Long-context `bs=1` TPOT was traced to non-partition FlashInfer paged decode under-filling the GPU; Qwen3 runtime gates FlashInfer split-K decode for `padded_bs<=2 && seq_len>=1024` and was retuned to `chunk_tokens=256,max_chunks=64`, cutting 4k/64 serving steady TPOT from about `11.7ms` to `6.46ms` on RTX 5090. Qwen3 now keeps a single model-crate bench entry: `qwen3_kernel_snapshot`, a JSON snapshot runner with warm/cold-L2 latency, default-on CUPTI counters, and compare. Correctness/truth is intentionally out of this snapshot for now. ## Preparation - **Read**: - `docs/index.md` - identified the kernels/core crate split and per-model boundary docs. - - `docs/models/qwen3/kernels-crate.md` - Qwen3 kernel source/build ownership and human kernel index already live in `pegainfer-kernels`; model-owned DAG metadata should live with the model crate. - - `docs/subsystems/kernels/pegainfer-kernels-boundary.md` - records the per-model engine direction and says root should be reusable frontend/control-plane infrastructure, not a universal model abstraction. + - `docs/models/qwen3/kernels-crate.md` - Qwen3 kernel source/build ownership and human kernel index already live in `openinfer-kernels`; model-owned DAG metadata should live with the model crate. + - `docs/subsystems/kernels/openinfer-kernels-boundary.md` - records the per-model engine direction and says root should be reusable frontend/control-plane infrastructure, not a universal model abstraction. - `src/main.rs`, `src/lib.rs`, `src/server_engine.rs`, `src/scheduler.rs`, `src/model_executor.rs`, `src/model/qwen3/*`, `src/bin/bench_serving.rs`, and Qwen3 tests - mapped what root currently knows about Qwen3. - **Relevant history**: - The earlier shared-runtime work (now consolidated into `docs/subsystems/runtime/runtime.md`) was a useful simplification, but the next boundary should not make `ModelForward` the long-term universal engine API. - **Plan**: 1. Define the model crate/root interface before moving code. - 2. 
Move the generic text-generation handle/request/event types into `pegainfer-core` so root and model crates can communicate without model crates depending on root. - 3. Create `crates/pegainfer-qwen3-4b` and move Qwen3 config, weights, forward paths, decode buffers, `Qwen3Executor`, Qwen3 scheduler internals, Qwen3 correctness tests, and Qwen3-specific benches into it. - 4. Keep root `pegainfer` as frontend plus model registry. The registry can know crate names, but `main`, `vllm_frontend`, and generic benchmark code should only see `EngineHandle`, `ModelInfo`, and tokenizer path. - 5. Add a model-owned `kernel_plan.rs` in the Qwen3 crate as the LLM/human index from model DAG phases to reusable kernels. Do not add a hand-maintained public TOML in `pegainfer-kernels`. + 2. Move the generic text-generation handle/request/event types into `openinfer-core` so root and model crates can communicate without model crates depending on root. + 3. Create `crates/openinfer-qwen3-4b` and move Qwen3 config, weights, forward paths, decode buffers, `Qwen3Executor`, Qwen3 scheduler internals, Qwen3 correctness tests, and Qwen3-specific benches into it. + 4. Keep root `openinfer` as frontend plus model registry. The registry can know crate names, but `main`, `vllm_frontend`, and generic benchmark code should only see `EngineHandle`, `ModelInfo`, and tokenizer path. + 5. Add a model-owned `kernel_plan.rs` in the Qwen3 crate as the LLM/human index from model DAG phases to reusable kernels. Do not add a hand-maintained public TOML in `openinfer-kernels`. 6. Verify locally with format/metadata, then on the CUDA validation host with release build, clippy, Qwen3 crate e2e, and root `bench_serving snapshot`. Keep microbench timing in Criterion benches instead of duplicating it as a test. - **Risks / open questions**: - If the scheduler stays in root, root still knows Qwen3's execution shape. 
To meet the stated goal, the Qwen3 scheduler should move into the Qwen3 crate and expose only a generic handle. @@ -30,7 +30,7 @@ The root-visible interface should be request/response oriented, not prefill/decode oriented. ```rust -// pegainfer-core +// openinfer-core pub struct EngineLoadOptions { pub enable_cuda_graph: bool, pub device_ordinals: Vec, @@ -65,7 +65,7 @@ pub struct EngineHandle { ``` ```rust -// pegainfer-qwen3-4b +// openinfer-qwen3-4b pub fn probe_model(model_path: &std::path::Path) -> anyhow::Result>; pub fn start_engine( model_path: &std::path::Path, @@ -74,12 +74,12 @@ pub fn start_engine( pub fn kernel_plan() -> &'static KernelPlan; ``` -`Qwen3Model`, `BatchDecodeBuffers`, and `KvState` should not be root-facing APIs. The deliberate low-level escape hatch is `pegainfer_qwen3_4b::runtime`, which exposes `Qwen3Executor` plus prefill/decode/unified plan types. That is the production phase boundary used by the scheduler and by model-local benches; root should still use `start_engine`. +`Qwen3Model`, `BatchDecodeBuffers`, and `KvState` should not be root-facing APIs. The deliberate low-level escape hatch is `openinfer_qwen3_4b::runtime`, which exposes `Qwen3Executor` plus prefill/decode/unified plan types. That is the production phase boundary used by the scheduler and by model-local benches; root should still use `start_engine`. ## Execution Log ### Step 1: Add generic engine API to core -- Added `pegainfer_core::engine` with: +- Added `openinfer_core::engine` with: - `EngineLoadOptions` - `ModelInfo` - `TokenLogprob` @@ -91,7 +91,7 @@ pub fn kernel_plan() -> &'static KernelPlan; - Root `scheduler.rs` is reduced to compatibility re-exports for `SchedulerHandle`, `SchedulerRequest`, and `TokenEvent`. ### Step 2: Extract Qwen3 crate -- Added `crates/pegainfer-qwen3-4b`. +- Added `crates/openinfer-qwen3-4b`. 
- Moved Qwen3-owned code into the crate: - config/weights/forward/prefill/decode/unified forward - batch decode buffers @@ -108,14 +108,14 @@ pub fn kernel_plan() -> &'static KernelPlan; - `src/model/qwen3/*` - `src/model_executor.rs` - Qwen3 root tests: `tests/e2e.rs`, `tests/paged_attention.rs`, `tests/bench_prefill.rs` -- Root `main.rs` starts Qwen3 through `pegainfer_qwen3_4b::start_engine(...)`. +- Root `main.rs` starts Qwen3 through `openinfer_qwen3_4b::start_engine(...)`. - Root `vllm_frontend.rs` accepts a generic `EngineHandle`. - Root `bench_serving` uses the same generic scheduler bench path for Qwen3 instead of constructing `Qwen3Model` directly. - Checked root with `rg` and confirmed no hits for `Qwen3Model`, `Qwen3Executor`, `ModelRuntimeConfig`, `model_executor`, `src/model/qwen3`, or stale "Qwen3 continuous" comments under root source/tests/benches/README. ### Step 4: Link and validation fixes -- Added explicit `stdc++` link output in `pegainfer-kernels` build script. Once Qwen3 became an independent crate with its own tests, the FlashInfer C++ CUDA objects needed the C++ runtime linked for test binaries as well as root binaries. -- Fixed the Qwen3 crate prefill test to respect `PEGAINFER_TEST_MODEL_PATH`. +- Added explicit `stdc++` link output in `openinfer-kernels` build script. Once Qwen3 became an independent crate with its own tests, the FlashInfer C++ CUDA objects needed the C++ runtime linked for test binaries as well as root binaries. +- Fixed the Qwen3 crate prefill test to respect `OPENINFER_TEST_MODEL_PATH`. - The validation build directory still has no `.git`, so `bench_serving snapshot` writes `commit: unknown`; after pulling it back with `rsync -e 'ssh -S none'`, the local snapshot commit field was set to current local `HEAD` short hash `0f54a1d`. ### Step 5: Verification @@ -123,11 +123,11 @@ pub fn kernel_plan() -> &'static KernelPlan; - `cargo fmt --all --check` passes. - `cargo metadata --no-deps --format-version 1` passes. 
- CUDA validation host (RTX 5090): - - `PEGAINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` passes. - - `PEGAINFER_CUDA_SM=120 cargo build --release` passes. - - `PEGAINFER_CUDA_SM=120 cargo test --release --workspace --no-run` passes. - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` passes. - - `RUST_LOG=warn PEGAINFER_CUDA_SM=120 cargo run --release --bin bench_serving -- --model-path snapshot` passes: + - `OPENINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` passes. + - `OPENINFER_CUDA_SM=120 cargo build --release` passes. + - `OPENINFER_CUDA_SM=120 cargo test --release --workspace --no-run` passes. + - `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` passes. + - `RUST_LOG=warn OPENINFER_CUDA_SM=120 cargo run --release --bin bench_serving -- --model-path snapshot` passes: - `prefill_heavy (10000,1)`: TTFT p50 `500.90ms`, p99 `503.30ms` - `decode_heavy (1024,256)`: TPOT p50 `7.57ms`, p99 `7.74ms` - This run exposed a scheduler length-limit bug: `max_tokens=256` emitted only `255` token events because the limit path finished without emitting the final decoded token. It was fixed in Step 7. @@ -138,20 +138,20 @@ pub fn kernel_plan() -> &'static KernelPlan; - Rejected a bench-only support API and also rejected using `ModelForward` as the benchmark entry. - Added an explicit `runtime` module that re-exports the scheduler's real `Qwen3Executor` phase API: `PrefillPlan`, `DecodePlan`, `UnifiedPlan`, request items, and result types. - Removed top-level public `Qwen3Model`, `ModelRuntimeConfig`, and `Qwen3State` re-exports. External low-level tools must opt into `runtime`; root continues to use `start_engine`. -- Replaced `crates/pegainfer-qwen3-4b/benches/qwen3_prefill.rs` with `benches/qwen3_runtime.rs`. 
It measures executor prefill TTFT over `128`, `512`, `1024`, `2048`, `4096`, and `10000` token prompts, plus executor decode TPOT for batch sizes `1`, `2`, `4`, `8`, `16`, and `32` at a `1024` token context. +- Replaced `crates/openinfer-qwen3-4b/benches/qwen3_prefill.rs` with `benches/qwen3_runtime.rs`. It measures executor prefill TTFT over `128`, `512`, `1024`, `2048`, `4096`, and `10000` token prompts, plus executor decode TPOT for batch sizes `1`, `2`, `4`, `8`, `16`, and `32` at a `1024` token context. - Updated `tests/paged_attention.rs` to use the same executor phase API: prefill once to create KV state, then decode through `execute_decode`. - Verification after the cleanup: - Local `cargo fmt --all --check` and `cargo metadata --no-deps --format-version 1` pass. - - Local `cargo check --release -p pegainfer-qwen3-4b --benches --tests` cannot run on the Mac without CUDA/nvcc; with `PEGAINFER_CUDA_SM=120` it still fails at local `nvcc`. - - CUDA host `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-qwen3-4b --benches --tests` passes. - - CUDA host `PEGAINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` passes. - - CUDA host `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen3-4b --test paged_attention -- --nocapture` passes. - - CUDA host full Criterion bench passes with `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo bench -p pegainfer-qwen3-4b --bench qwen3_runtime`: + - Local `cargo check --release -p openinfer-qwen3-4b --benches --tests` cannot run on the Mac without CUDA/nvcc; with `OPENINFER_CUDA_SM=120` it still fails at local `nvcc`. + - CUDA host `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-qwen3-4b --benches --tests` passes. + - CUDA host `OPENINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` passes. 
+ - CUDA host `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen3-4b --test paged_attention -- --nocapture` passes. + - CUDA host full Criterion bench passes with `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo bench -p openinfer-qwen3-4b --bench qwen3_runtime`: - Prefill TTFT: `128 -> 11.804ms`, `512 -> 23.200ms`, `1024 -> 44.114ms`, `2048 -> 87.327ms`, `4096 -> 179.60ms`, `10000 -> 505.55ms`. - Decode one-step batch time at 1024-token context: `bs1 -> 9.3095ms`, `bs2 -> 9.3207ms`, `bs4 -> 9.4059ms`, `bs8 -> 10.960ms`, `bs16 -> 11.718ms`, `bs32 -> 13.196ms`. ### Step 7: Retire ModelForward and Fix Length Limit -- Deleted `pegainfer_core::model::{ModelForward, GenerationState}` and removed the root `src/model.rs` re-export. +- Deleted `openinfer_core::model::{ModelForward, GenerationState}` and removed the root `src/model.rs` re-export. - Deleted the Qwen3 `forward.rs` compatibility path. Qwen3 tests that used it now build their baselines from `batch_prefill(bs=1)` plus `batch_decode(bs=1)`, so they exercise the same phase APIs as production. - Fixed Qwen3 decode length-limit handling by adding `DecodeEffect::EmitAndFinish`. EOS behavior is unchanged: EOS finishes without emitting the stop token. Length limit now emits the sampled final token, then sends `Finished { finish_reason: Length }`. - Regenerated `test_data/Qwen3-4B.json` because every length-limited golden output now includes the final requested token. @@ -161,13 +161,13 @@ pub fn kernel_plan() -> &'static KernelPlan; - `decode_heavy (1024,256)`: TPOT p50 `7.56ms`, p99 `7.73ms`. - Final verification after this step: - Local `cargo fmt --all --check`, `cargo metadata --no-deps --format-version 1`, and `git diff --check` pass. - - CUDA host `PEGAINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` passes. - - CUDA host `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` passes. 
+ - CUDA host `OPENINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` passes. + - CUDA host `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` passes. ### Step 8: Decode Context-Length Sweep and Compile Audit -- Added `crates/pegainfer-qwen3-4b/src/bin/qwen3_decode_context.rs` as a production-path fixed-context decode probe. It prefills a fresh request to a selected context length, then measures or profiles real `Qwen3Executor::execute_decode`; the optional `cudaProfilerStart/Stop` range only exists for profiler capture and does not run in normal serving. +- Added `crates/openinfer-qwen3-4b/src/bin/qwen3_decode_context.rs` as a production-path fixed-context decode probe. It prefills a fresh request to a selected context length, then measures or profiles real `Qwen3Executor::execute_decode`; the optional `cudaProfilerStart/Stop` range only exists for profiler capture and does not run in normal serving. - GPU fixed-context command: - - `PEGAINFER_CUDA_SM=120 target/release/qwen3_decode_context --model-path --iters 10 --contexts 128,512,1024,2048,4096,8192,10000` + - `OPENINFER_CUDA_SM=120 target/release/qwen3_decode_context --model-path --iters 10 --contexts 128,512,1024,2048,4096,8192,10000` - Result on RTX 5090: | Context | Decode p50 | @@ -191,13 +191,13 @@ pub fn kernel_plan() -> &'static KernelPlan; - H2D traffic in the profiled decode range was only about `20-23us/step`, so metadata dirty caching is good runtime hygiene but cannot explain a multi-ms TPOT gap. - Compile audit on the same validation worktree: - GPU reports compute capability `12.0`; default toolkit is CUDA `12.9` (`nvcc V12.9.86`), driver `575.57.08`. - - `crates/pegainfer-kernels/build.rs` emits `-O3 -gencode arch=compute_120,code=sm_120 -gencode arch=compute_120,code=compute_120 --compiler-options -fPIC`; FlashInfer translation units add `--std=c++17` and the FlashInfer include path. 
- - `cuobjdump -lelf` confirms both `libkernels_cuda.a` and `target/release/pegainfer` contain `sm_120.cubin`. `compute_120` PTX fallback is also embedded, but the matching SASS is present, so this is not PTX-JIT-only execution. + - `crates/openinfer-kernels/build.rs` emits `-O3 -gencode arch=compute_120,code=sm_120 -gencode arch=compute_120,code=compute_120 --compiler-options -fPIC`; FlashInfer translation units add `--std=c++17` and the FlashInfer include path. + - `cuobjdump -lelf` confirms both `libkernels_cuda.a` and `target/release/openinfer` contain `sm_120.cubin`. `compute_120` PTX fallback is also embedded, but the matching SASS is present, so this is not PTX-JIT-only execution. - CUDA `13.1` is installed and can build the same code into `sm_120` cubins, but the current driver/runtime combination cannot run it (`cudaError=35` after linking `libcudart.so.13`). Until the driver is upgraded, CUDA `12.9` is the latest runnable toolkit on this box. - Interpretation: the compile target is correct. The `bs=1` long-context slope is the known non-partition FlashInfer paged decode issue: grid shape is effectively `(batch_size, num_kv_heads) = (1, 8)`, so only 8 CTAs scan the whole KV context. At `ctx=4096`, Qwen3-4B attention reads about `604MB` (`576MiB`) of K/V per token; the measured attention time is about `5.7ms`, or roughly `105GB/s` effective aggregate bandwidth, far below the RTX 5090 memory system because the kernel under-fills the GPU. The next real fix is partition-KV/split-K decode for `bs=1` or low-batch, not build-flag tuning. ### Step 9: Pure Paged Decode Attention Bench -- Added `crates/pegainfer-qwen3-4b/benches/qwen3_attention.rs`. +- Added `crates/openinfer-qwen3-4b/benches/qwen3_attention.rs`. - The bench does not load Qwen3 weights. It constructs synthetic non-zero Q and paged KV buffers using Qwen3-4B attention shape: `num_qo_heads=32`, `num_kv_heads=8`, `head_dim=128`, `page_size=16`, one layer. 
- The bench calls the FlashInfer paged decode FFI directly and uses CUDA events around the kernel launches. It measures decode attention only; it excludes QKV projection, KV append, O projection, MLP, scheduler, tokenizer, and host-side serving overhead. - Added `paged_attention_decode_split_kv_cuda` as a reusable kernel entry for FlashInfer partition-KV/split-K decode. Runtime dispatch still uses the existing non-partition path; this step only exposes and benchmarks the candidate operator. @@ -209,10 +209,10 @@ pub fn kernel_plan() -> &'static KernelPlan; - `cargo metadata --no-deps --format-version 1` passes. - `git diff --check` passes. - GPU compile: - - `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-qwen3-4b --bench qwen3_attention` passes. - - `PEGAINFER_CUDA_SM=120 cargo clippy --release -p pegainfer-qwen3-4b --all-targets -- -D warnings` passes. + - `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-qwen3-4b --bench qwen3_attention` passes. + - `OPENINFER_CUDA_SM=120 cargo clippy --release -p openinfer-qwen3-4b --all-targets -- -D warnings` passes. - GPU run: - - `PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b --bench qwen3_attention -- --noplot` passes. + - `OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b --bench qwen3_attention -- --noplot` passes. 
Single-layer `bs=1` context sweep on RTX 5090: @@ -260,9 +260,9 @@ GPU validation: | Check | Result | | --- | --- | -| `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-qwen3-4b --all-targets` | pass | -| `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` | pass | -| `PEGAINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` | pass | +| `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-qwen3-4b --all-targets` | pass | +| `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` | pass | +| `OPENINFER_CUDA_SM=120 cargo clippy --release --all-targets -- -D warnings` | pass | Fixed-context decode probe after runtime integration: @@ -275,7 +275,7 @@ Fixed-context decode probe after runtime integration: Command: ```bash -PEGAINFER_CUDA_SM=120 target/release/qwen3_decode_context \ +OPENINFER_CUDA_SM=120 target/release/qwen3_decode_context \ --model-path \ --iters 10 \ --contexts 1024,4096,10000 @@ -284,7 +284,7 @@ PEGAINFER_CUDA_SM=120 target/release/qwen3_decode_context \ Cross-threshold smoke: ```bash -PEGAINFER_CUDA_SM=120 target/release/qwen3_decode_context \ +OPENINFER_CUDA_SM=120 target/release/qwen3_decode_context \ --model-path \ --iters 600 \ --contexts 512 @@ -295,7 +295,7 @@ Result: pass, `p50=6.7156ms`. This exercises a single request growing from non-p Serving request check after rebuilding `bench_serving`: ```bash -RUST_LOG=warn PEGAINFER_CUDA_SM=120 target/release/bench_serving \ +RUST_LOG=warn OPENINFER_CUDA_SM=120 target/release/bench_serving \ --model-path \ request --prompt-len 4096 --output-len 64 ``` @@ -311,7 +311,7 @@ Result: Interpretation: split-K removes the long-context attention slope for the low-batch case. The remaining `~6.8-7.1ms` TPOT is now dominated by the non-attention decode body: GEMMs/GEMVs, MLP, norms, logits, sampling, and graph replay overhead. 
Next optimization work should not keep pushing paged attention first; it should re-profile the post-split decode step and pick the new largest kernel family. ### Step 11: Attention Theoretical Bandwidth Estimate -- Updated `crates/pegainfer-qwen3-4b/benches/qwen3_attention.rs` to print a one-time theoretical bandwidth report before Criterion runs. +- Updated `crates/openinfer-qwen3-4b/benches/qwen3_attention.rs` to print a one-time theoretical bandwidth report before Criterion runs. - The report queries CUDA Driver attributes: - `CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE` - `CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH` @@ -322,7 +322,7 @@ Interpretation: split-K removes the long-context attention slope for the low-bat - Verification command: ```bash -PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b \ +OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b \ --bench qwen3_attention -- --noplot ``` @@ -346,18 +346,18 @@ Batch sweep sanity rows at `kv_len=1024`: Interpretation: the estimate is good enough to prove the original `bs=1` non-partition path was badly under-filling memory bandwidth. It is not good enough to make final hardware-utilization claims because single-layer KV working sets fit in the RTX 5090's `96MiB` L2; the `bs16` non-partition row exceeding `100%` of DRAM peak is the warning sign. The next measurement step should use CUPTI Profiler or NCU counters for `dram__bytes_*`, `lts__t_bytes.*`, and `*_pct_of_peak_sustained_elapsed`. ### Step 12: CUPTI Counters and Split-K Retune -- Added `crates/pegainfer-cupti`, a small CUPTI Range Profiler wrapper used by the attention bench. It profiles only the attention launch range and lets the bench clear L2 before `cuptiRangeProfilerStart`, so cache-clear traffic is excluded from the measured range. 
-- Extended `crates/pegainfer-qwen3-4b/benches/qwen3_attention.rs`: - - `PEGAINFER_QWEN3_ATTENTION_CUPTI=1` prints cold-L2 CUPTI rows for `gpu__time_duration.sum`, `dram__bytes.sum`, `dram__bytes_op_read.sum`, `dram__bytes_op_write.sum`, and `lts__t_bytes.sum`. - - `PEGAINFER_QWEN3_ATTENTION_SPLITK_SWEEP=1` sweeps split-K chunk sizes and max chunk slots. - - `PEGAINFER_QWEN3_ATTENTION_REPORT_ONLY=1` prints reports without running Criterion samples. +- Added `crates/openinfer-cupti`, a small CUPTI Range Profiler wrapper used by the attention bench. It profiles only the attention launch range and lets the bench clear L2 before `cuptiRangeProfilerStart`, so cache-clear traffic is excluded from the measured range. +- Extended `crates/openinfer-qwen3-4b/benches/qwen3_attention.rs`: + - `OPENINFER_QWEN3_ATTENTION_CUPTI=1` prints cold-L2 CUPTI rows for `gpu__time_duration.sum`, `dram__bytes.sum`, `dram__bytes_op_read.sum`, `dram__bytes_op_write.sum`, and `lts__t_bytes.sum`. + - `OPENINFER_QWEN3_ATTENTION_SPLITK_SWEEP=1` sweeps split-K chunk sizes and max chunk slots. + - `OPENINFER_QWEN3_ATTENTION_REPORT_ONLY=1` prints reports without running Criterion samples. - GPU CUPTI command: ```bash -PEGAINFER_CUDA_SM=120 \ -PEGAINFER_QWEN3_ATTENTION_REPORT_ONLY=1 \ -PEGAINFER_QWEN3_ATTENTION_CUPTI=1 \ -cargo bench -p pegainfer-qwen3-4b --bench qwen3_attention -- --noplot +OPENINFER_CUDA_SM=120 \ +OPENINFER_QWEN3_ATTENTION_REPORT_ONLY=1 \ +OPENINFER_QWEN3_ATTENTION_CUPTI=1 \ +cargo bench -p openinfer-qwen3-4b --bench qwen3_attention -- --noplot ``` Key cold-L2 CUPTI rows at `bs=1,ctx=10000`: @@ -373,10 +373,10 @@ Interpretation: FlashInfer is not rereading KV many times from DRAM. 
The non-par Split-K sweep command: ```bash -PEGAINFER_CUDA_SM=120 \ -PEGAINFER_QWEN3_ATTENTION_REPORT_ONLY=1 \ -PEGAINFER_QWEN3_ATTENTION_SPLITK_SWEEP=1 \ -cargo bench -p pegainfer-qwen3-4b --bench qwen3_attention -- --noplot +OPENINFER_CUDA_SM=120 \ +OPENINFER_QWEN3_ATTENTION_REPORT_ONLY=1 \ +OPENINFER_QWEN3_ATTENTION_SPLITK_SWEEP=1 \ +cargo bench -p openinfer-qwen3-4b --bench qwen3_attention -- --noplot ``` Representative cold-L2 sweep rows: @@ -395,9 +395,9 @@ Runtime change: `BatchDecodeBuffers` now uses `SPLIT_KV_CHUNK_TOKENS=256` with ` Production decode probe after retune: ```bash -PEGAINFER_CUDA_SM=120 cargo build --release \ - -p pegainfer-qwen3-4b --bin qwen3_decode_context -PEGAINFER_CUDA_SM=120 target/release/qwen3_decode_context \ +OPENINFER_CUDA_SM=120 cargo build --release \ + -p openinfer-qwen3-4b --bin qwen3_decode_context +OPENINFER_CUDA_SM=120 target/release/qwen3_decode_context \ --model-path \ --iters 10 \ --contexts 1024,4096,10000 @@ -412,7 +412,7 @@ PEGAINFER_CUDA_SM=120 target/release/qwen3_decode_context \ Serving check after syncing the root `src/` worktree on the CUDA validation host: ```bash -RUST_LOG=warn PEGAINFER_CUDA_SM=120 cargo run --release \ +RUST_LOG=warn OPENINFER_CUDA_SM=120 cargo run --release \ --bin bench_serving -- \ --model-path \ request --prompt-len 4096 --output-len 64 --warmup 5 --iters 20 @@ -429,9 +429,9 @@ Verification: | Check | Result | | --- | --- | -| `PEGAINFER_CUDA_SM=120 cargo clippy --release -p pegainfer-cupti -p pegainfer-qwen3-4b --bench qwen3_attention -- -D warnings` | pass | -| `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` | pass | -| `PEGAINFER_CUDA_SM=120 PEGAINFER_QWEN3_ATTENTION_REPORT_ONLY=1 cargo bench -p pegainfer-qwen3-4b --bench qwen3_attention -- --noplot` | pass | +| `OPENINFER_CUDA_SM=120 cargo clippy --release -p openinfer-cupti -p openinfer-qwen3-4b --bench qwen3_attention -- -D warnings` | pass | +| 
`OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` | pass | +| `OPENINFER_CUDA_SM=120 OPENINFER_QWEN3_ATTENTION_REPORT_ONLY=1 cargo bench -p openinfer-qwen3-4b --bench qwen3_attention -- --noplot` | pass | | `cargo fmt --all --check` | pass | | `cargo metadata --no-deps --format-version 1` | pass | | `git diff --check` | pass | @@ -439,15 +439,15 @@ Verification: Note: an initial remote e2e run failed because the remote `test_data/Qwen3-4B.json` was stale and expected the pre length-limit baseline. Syncing the tracked baseline fixed it; this was not a split-K numerical drift. ### Step 13: Kernel Snapshot MVP -- Extracted the Qwen3 paged decode attention case construction into `crates/pegainfer-qwen3-4b/src/kernel_bench.rs`. -- Added `crates/pegainfer-qwen3-4b/benches/qwen3_kernel_snapshot.rs` as a deterministic `harness=false` runner. +- Extracted the Qwen3 paged decode attention case construction into `crates/openinfer-qwen3-4b/src/kernel_bench.rs`. +- Added `crates/openinfer-qwen3-4b/benches/qwen3_kernel_snapshot.rs` as a deterministic `harness=false` runner. - Removed the temporary correctness envelope from the snapshot runner. We do not have a settled truth source for this layer yet, so correctness belongs in a separate design rather than a misleading "non-partition equals truth" field. - CUPTI is default-on in the snapshot runner. `--no-cupti` is available only for latency-only smoke runs. 
Snapshot command: ```bash -PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b \ +OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b \ --bench qwen3_kernel_snapshot -- \ run \ --contexts 1024 \ @@ -460,7 +460,7 @@ PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b \ Compare command: ```bash -cargo bench -p pegainfer-qwen3-4b \ +cargo bench -p openinfer-qwen3-4b \ --bench qwen3_kernel_snapshot -- \ compare \ --base $RESULT_ROOT/qwen3_kernel_snapshot_smoke.json \ @@ -494,7 +494,7 @@ kernel snapshot compare complete: warnings=0 failures=0 CUPTI note: the standalone snapshot runner originally crashed inside `libnvperf_host.so` at `NVPW_CUDA_Profiler_DecodeCounters`. The root cause was the verbose user range name, not the attention case or Rust callback trampoline. The fix is to use compact range names such as `qk/non_partition/b1/k1024` and keep full metadata in JSON fields. The first profiled launch also needs an unprofiled warmup launch; otherwise CUDA lazy initialization pollutes the first CUPTI GPU time. 
```bash -PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b \ +OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b \ --bench qwen3_kernel_snapshot -- \ run \ --contexts 1024 \ @@ -508,16 +508,16 @@ Verification: | Check | Result | | --- | --- | -| `PEGAINFER_CUDA_SM=120 cargo clippy --release -p pegainfer-cupti -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot -- -D warnings` | pass | -| `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` | pass | -| `PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot -- run --contexts 1024 --batch-sizes 1 --variants non_partition,split_kv_256x64 --iters 4 --out $RESULT_ROOT/qwen3_kernel_snapshot_cupti_smoke.json` | pass | +| `OPENINFER_CUDA_SM=120 cargo clippy --release -p openinfer-cupti -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot -- -D warnings` | pass | +| `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` | pass | +| `OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot -- run --contexts 1024 --batch-sizes 1 --variants non_partition,split_kv_256x64 --iters 4 --out $RESULT_ROOT/qwen3_kernel_snapshot_cupti_smoke.json` | pass | The SM counters are intentionally minimal. `sm__throughput.avg.pct_of_peak_sustained_elapsed` shows whether SMs are busy over elapsed time; `smsp__warps_active.avg.pct_of_peak_sustained_active` shows active-warp residency while SM partitions are active. At `bs=1,ctx=10000`, non-partition measured `1.19%` SM throughput and `6.59%` DRAM peak, while split-K measured `8.74%` SM throughput and `41.06%` DRAM peak for nearly identical DRAM read bytes. That is the kernel snapshot evidence for low-batch underfill. 
### Step 14: Consolidate Bench Entry Points - Deleted the retired Criterion benches: - - `crates/pegainfer-qwen3-4b/benches/qwen3_runtime.rs` - - `crates/pegainfer-qwen3-4b/benches/qwen3_attention.rs` + - `crates/openinfer-qwen3-4b/benches/qwen3_runtime.rs` + - `crates/openinfer-qwen3-4b/benches/qwen3_attention.rs` - Removed their `[[bench]]` entries and the Qwen3 crate-local `criterion` dev dependency. - Qwen3 now has exactly one model-crate bench entry: `qwen3_kernel_snapshot`. - Rationale: the human CSV report, split-K tuning sweep, and machine-readable JSON runner were duplicating case construction, metric selection, and interpretation. Kernel maintenance should have one durable artifact first; optional human views should be generated from snapshot data rather than maintained as separate benches. @@ -529,13 +529,13 @@ Verification after consolidation: | `cargo fmt --all --check` | pass | | `cargo metadata --no-deps --format-version 1` | pass | | `git diff --check` | pass | -| `PEGAINFER_CUDA_SM=120 cargo check --release -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot` on the CUDA validation host | pass | -| `PEGAINFER_CUDA_SM=120 cargo clippy --release -p pegainfer-cupti -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot -- -D warnings` on the CUDA validation host | pass | -| `PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot -- run --contexts 1024 --batch-sizes 1 --variants non_partition,split_kv_256x64 --iters 4 --out $RESULT_ROOT/qwen3_kernel_snapshot_single_bench_smoke.json` on the CUDA validation host | pass | -| `PEGAINFER_CUDA_SM=120 cargo bench -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot -- compare --base $RESULT_ROOT/qwen3_kernel_snapshot_single_bench_smoke.json --new $RESULT_ROOT/qwen3_kernel_snapshot_single_bench_smoke.json` on the CUDA validation host | pass | +| `OPENINFER_CUDA_SM=120 cargo check --release -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot` on the CUDA validation host | pass | +| 
`OPENINFER_CUDA_SM=120 cargo clippy --release -p openinfer-cupti -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot -- -D warnings` on the CUDA validation host | pass | +| `OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot -- run --contexts 1024 --batch-sizes 1 --variants non_partition,split_kv_256x64 --iters 4 --out $RESULT_ROOT/qwen3_kernel_snapshot_single_bench_smoke.json` on the CUDA validation host | pass | +| `OPENINFER_CUDA_SM=120 cargo bench -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot -- compare --base $RESULT_ROOT/qwen3_kernel_snapshot_single_bench_smoke.json --new $RESULT_ROOT/qwen3_kernel_snapshot_single_bench_smoke.json` on the CUDA validation host | pass | ## Debrief -The Qwen3 split now enforces the intended dependency direction: model execution code depends on `pegainfer-core` and `pegainfer-kernels`; root depends on the model crate only at registry/startup glue points. Root still has a `ModelType::Qwen3` enum and default Qwen3 model path because the product needs a loader choice, but it no longer sees Qwen3 layers, KV state, TP rank workers, or prefill/decode/unified plans. +The Qwen3 split now enforces the intended dependency direction: model execution code depends on `openinfer-core` and `openinfer-kernels`; root depends on the model crate only at registry/startup glue points. Root still has a `ModelType::Qwen3` enum and default Qwen3 model path because the product needs a loader choice, but it no longer sees Qwen3 layers, KV state, TP rank workers, or prefill/decode/unified plans. Next cleanup should be a generic model registry module so `main.rs` and `bench_serving.rs` stop matching model crate names directly. Performance-wise, the next target is the post-split decode body: GEMM/GEMV, MLP, norms, logits, sampling, and graph replay overhead now dominate the remaining `~6.5-7.0ms` TPOT. 
Kernel DevOps-wise, the next target is defining a real correctness/truth source for kernel snapshots instead of treating one implementation path as the oracle. diff --git a/docs/models/qwen3/prefix-cache.md b/docs/models/qwen3/prefix-cache.md index 75c96f64..c5ee79e6 100644 --- a/docs/models/qwen3/prefix-cache.md +++ b/docs/models/qwen3/prefix-cache.md @@ -53,12 +53,12 @@ Negative result: registering 40 extra unique ~1900-token prompts did not move wa ## Tests -- `pegainfer-qwen3-4b/tests/prefix_cache.rs` — behavioral contract: exact cached-token counts (3-block hit + tail recompute, extension match, full-block cap edge), mixed cold+warm batch in one plan, unified prefill+decode path, warm-vs-cold logit bounds (regret + mean, golden-gate methodology). -- `pegainfer-qwen3-4b/tests/hf_golden_gate.rs` — cached-replay surfaces (sequential bs=1 eager, batched eager, batched cuda-graph) vs the HF golden: warm mean 0.0316 / p99 0.1215 vs cold floor 0.0317 / 0.1196. +- `openinfer-qwen3-4b/tests/prefix_cache.rs` — behavioral contract: exact cached-token counts (3-block hit + tail recompute, extension match, full-block cap edge), mixed cold+warm batch in one plan, unified prefill+decode path, warm-vs-cold logit bounds (regret + mean, golden-gate methodology). +- `openinfer-qwen3-4b/tests/hf_golden_gate.rs` — cached-replay surfaces (sequential bs=1 eager, batched eager, batched cuda-graph) vs the HF golden: warm mean 0.0316 / p99 0.1215 vs cold floor 0.0317 / 0.1196. 
```bash -PEGAINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B cargo test --release -p pegainfer-qwen3-4b --test prefix_cache -PEGAINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B cargo test --release -p pegainfer-qwen3-4b --test hf_golden_gate +OPENINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B cargo test --release -p openinfer-qwen3-4b --test prefix_cache +OPENINFER_TEST_MODEL_PATH=/data/models/Qwen3-4B cargo test --release -p openinfer-qwen3-4b --test hf_golden_gate ``` ## Next diff --git a/docs/models/qwen3/roadmap.md b/docs/models/qwen3/roadmap.md index cd0da10c..d079db0a 100644 --- a/docs/models/qwen3/roadmap.md +++ b/docs/models/qwen3/roadmap.md @@ -26,7 +26,7 @@ Tracking issue: see the `[Model] Qwen3-4B roadmap` GitHub issue. Cross-model ite ### Now 1. **YaRN for rope-scaled checkpoints (#8).** The #220 RoPE OOB fix landed scope (a): the cos/sin cache is sized from `config.max_position_embeddings`, admission crash-early rejects past the window (distinct context-length vs KV-budget reasons), the kernel `__trap`s an out-of-range position as a last-resort backstop, and the gate now covers both an oversized reject and an in-window >4096 case (`tests/context_window.rs`, `tests/context_window_in_window.rs`). That precompute is correct *only because this checkpoint has `rope_scaling: null`*. Scope (b) remains open: #8 YaRN is the prerequisite for any rope-scaled checkpoint — the precompute length must come from the scaled schedule, coordinated with the qwen3.5 sibling fix so both crates share the pattern. -2. **Batched greedy decode sampling.** Phase 1: route all-greedy batches through `argmax_batch_bf16_into` — one launch + one D2H per step; this primitive is production-proven in deepseek-v2-lite (`runtime.rs:1379`). `flashinfer_top1_batch_into` has *no* production caller and needs its own validation before use. Phase 2: batched random path with per-row params; source the 1MB FlashInfer row-state scratch from the kernel instead of the literal. 
Shared `pegainfer-core/kernels` work — covers qwen35 too. Gated by the existing golden gate. +2. **Batched greedy decode sampling.** Phase 1: route all-greedy batches through `argmax_batch_bf16_into` — one launch + one D2H per step; this primitive is production-proven in deepseek-v2-lite (`runtime.rs:1379`). `flashinfer_top1_batch_into` has *no* production caller and needs its own validation before use. Phase 2: batched random path with per-row params; source the 1MB FlashInfer row-state scratch from the kernel instead of the literal. Shared `openinfer-core/kernels` work — covers qwen35 too. Gated by the existing golden gate. 3. **Sampling correctness coverage.** Every test in both qwen crates is greedy. Add seed-determinism + temperature/top_k/top_p behavioral tests, and audit the frontend for silently-dropped params (penalties, min_p are absent from `SamplingParams` entirely) — the kimi-k2 silent-greedy bug (#237) shows this class is real and currently nothing would catch it here. 4. **Prefix-cache observability.** `cached_tokens` is computed (`executor.rs:751`) and dies at the scheduler boundary; the frontend hardcodes `num_cached_tokens: 0`. Thread it through `TokenEvent::Scheduled` into usage; log hit rate. Adjacent: #78 (streaming usage discards completion_tokens) — same usage-accounting surface. diff --git a/docs/models/qwen3/tp-design.md b/docs/models/qwen3/tp-design.md index fd74c09a..717e6fd9 100644 --- a/docs/models/qwen3/tp-design.md +++ b/docs/models/qwen3/tp-design.md @@ -13,11 +13,11 @@ Add tensor parallelism for `Qwen3-4B` with a narrow and explicit first target: - keep the milestone focused on model-parallel correctness - establish the right abstractions for later large-model and MoE work -This milestone is about making pegainfer capable of serving a single model replica across two GPUs. It is not about multi-replica throughput scaling. +This milestone is about making openinfer capable of serving a single model replica across two GPUs. 
It is not about multi-replica throughput scaling. ## Why This Matters -Large-model serving and MoE serving both require model-parallel building blocks. For pegainfer, tensor parallelism is the first such building block. +Large-model serving and MoE serving both require model-parallel building blocks. For openinfer, tensor parallelism is the first such building block. The immediate value is: @@ -34,7 +34,7 @@ This first pass is intentionally narrow: - focus: correctness and architecture - deployment target: a single machine -The first milestone does not need to solve every parallelism problem. It needs to prove that pegainfer can run one dense model replica across two GPUs without breaking correctness or making the architecture harder to evolve. +The first milestone does not need to solve every parallelism problem. It needs to prove that openinfer can run one dense model replica across two GPUs without breaking correctness or making the architecture harder to evolve. ## Design Constraints @@ -689,7 +689,7 @@ This abstraction is the right carrier for model-internal parallelism such as TP It is not the right abstraction for service-layer data parallelism across multiple model replicas. -If pegainfer later needs multiple model replicas, that should live above the executor layer. A `ModelExecutor` still represents one logical model replica, even if that replica internally spans multiple GPUs. +If openinfer later needs multiple model replicas, that should live above the executor layer. A `ModelExecutor` still represents one logical model replica, even if that replica internally spans multiple GPUs. ## What Success Looks Like @@ -762,4 +762,4 @@ The following questions are intentionally deferred until after the first TP mile This milestone should stay disciplined. -The job is not to build a full distributed inference platform in one step. 
The job is to make pegainfer capable of correct `TP=2` execution for `Qwen3-4B`, while establishing the architectural boundary that future large dense and MoE work can build on. +The job is not to build a full distributed inference platform in one step. The job is to make openinfer capable of correct `TP=2` execution for `Qwen3-4B`, while establishing the architectural boundary that future large dense and MoE work can build on. diff --git a/docs/models/qwen35/accuracy.md b/docs/models/qwen35/accuracy.md index 3fcdc51a..3ecf46dc 100644 --- a/docs/models/qwen35/accuracy.md +++ b/docs/models/qwen35/accuracy.md @@ -1,8 +1,8 @@ # Qwen3.5-4B Accuracy -> **TL;DR:** Qwen3.5 accuracy now has short and long HF-backed logits goldens (`tests/hf_golden_gate.rs`, `test_data/qwen35-4b-hf-golden.safetensors`, and `test_data/qwen35-4b-hf-long-golden.safetensors`). The HF fixtures use `AutoModelForCausalLM` with `use_cache=True` / `past_key_values`, so they match pegainfer's prefill + decode shape. The long fixture crosses the old 4096-position RoPE cache boundary with 4097- and 8192-token prompts, and the #250 fix recovers full GSM8K 8-shot at `batch_size=1` to `strict-match` 79.38% / `flexible-extract` 79.30% vs the HF 79.45% baseline. The older exact-text `test_data/Qwen3.5-4B.json` and its regeneration test are retired; `e2e_scheduler` remains only a scheduler liveness/integration check. A broader PegaInfer-owned rand/hash corpus is deferred until the project decides how to handle cross-architecture exact-token drift. +> **TL;DR:** Qwen3.5 accuracy now has short and long HF-backed logits goldens (`tests/hf_golden_gate.rs`, `test_data/qwen35-4b-hf-golden.safetensors`, and `test_data/qwen35-4b-hf-long-golden.safetensors`). The HF fixtures use `AutoModelForCausalLM` with `use_cache=True` / `past_key_values`, so they match openinfer's prefill + decode shape. 
The long fixture crosses the old 4096-position RoPE cache boundary with 4097- and 8192-token prompts, and the #250 fix recovers full GSM8K 8-shot at `batch_size=1` to `strict-match` 79.38% / `flexible-extract` 79.30% vs the HF 79.45% baseline. The older exact-text `test_data/Qwen3.5-4B.json` and its regeneration test are retired; `e2e_scheduler` remains only a scheduler liveness/integration check. A broader OpenInfer-owned rand/hash corpus is deferred until the project decides how to handle cross-architecture exact-token drift. > -> **Last touched:** 2026-06. The HF logits gate passes on RTX 5090 `sm_120` and covers the qwen35-owned replay surfaces: sequential graph decode, bucket-straddling batched graph decode, slot-compaction replay after a mid-batch request drop, and a long-prompt sequential replay at 4097/8192 tokens. A full GSM8K 8-shot `lm_eval` run against `/v1/completions` also passes at HF-baseline accuracy. Current accuracy command is crate-local and needs an absolute `PEGAINFER_TEST_MODEL_PATH`: `cargo test --release -p pegainfer-qwen35-4b --test hf_golden_gate -- --nocapture`. Run `e2e_scheduler` only when scheduler request-flow behavior changes. +> **Last touched:** 2026-06. The HF logits gate passes on RTX 5090 `sm_120` and covers the qwen35-owned replay surfaces: sequential graph decode, bucket-straddling batched graph decode, slot-compaction replay after a mid-batch request drop, and a long-prompt sequential replay at 4097/8192 tokens. A full GSM8K 8-shot `lm_eval` run against `/v1/completions` also passes at HF-baseline accuracy. Current accuracy command is crate-local and needs an absolute `OPENINFER_TEST_MODEL_PATH`: `cargo test --release -p openinfer-qwen35-4b --test hf_golden_gate -- --nocapture`. Run `e2e_scheduler` only when scheduler request-flow behavior changes. ## Goal @@ -13,14 +13,14 @@ ## Current State - Reusable debugging method now lives in [../../playbooks/accuracy-parity-playbook.md](../../playbooks/accuracy-parity-playbook.md). 
-- `pegainfer-qwen35-4b/tests/hf_golden_gate.rs` checks pegainfer logits against two pinned HF bf16 `past_key_values` oracles: +- `openinfer-qwen35-4b/tests/hf_golden_gate.rs` checks openinfer logits against two pinned HF bf16 `past_key_values` oracles: - `test_data/qwen35-4b-hf-golden.safetensors` for the short mixed-shape replay surfaces. - `test_data/qwen35-4b-hf-long-golden.safetensors` for the long 4097/8192-token replay surface. -- `pegainfer-qwen35-4b/tests/e2e.rs`, `pegainfer-qwen35-4b/tests/regen_test_data.rs`, and `test_data/Qwen3.5-4B.json` are retired. They were exact-text PegaInfer self-baselines, not HF accuracy gates. -- `pegainfer-qwen35-4b/tests/e2e_scheduler.rs` still loads the model and exercises sequential, repeated, concurrent, and consumer-drop scheduler paths, but it no longer reads an exact-text JSON fixture. -- A broader PegaInfer-owned rand/hash corpus was considered for issue #186, but checked-in exact token/hash data may drift across GPU architectures (`sm_80`, `sm_90`, `sm_120`). Keep that as follow-up design work until the cross-architecture stability policy is explicit. +- `openinfer-qwen35-4b/tests/e2e.rs`, `openinfer-qwen35-4b/tests/regen_test_data.rs`, and `test_data/Qwen3.5-4B.json` are retired. They were exact-text OpenInfer self-baselines, not HF accuracy gates. +- `openinfer-qwen35-4b/tests/e2e_scheduler.rs` still loads the model and exercises sequential, repeated, concurrent, and consumer-drop scheduler paths, but it no longer reads an exact-text JSON fixture. +- A broader OpenInfer-owned rand/hash corpus was considered for issue #186, but checked-in exact token/hash data may drift across GPU architectures (`sm_80`, `sm_90`, `sm_120`). Keep that as follow-up design work until the cross-architecture stability policy is explicit. - `docs/models/qwen35/optimization.md` records historical exact-text baseline churn. New accuracy work should use the HF logits gate before interpreting prompt-level text drift. 
-- The #250 GSM8K 8-shot recovery run now closes the task-score side of the old long-prompt divergence: pegainfer scored `strict-match` 79.38% and `flexible-extract` 79.30% vs the HF 79.45% baseline. +- The #250 GSM8K 8-shot recovery run now closes the task-score side of the old long-prompt divergence: openinfer scored `strict-match` 79.38% and `flexible-extract` 79.30% vs the HF 79.45% baseline. - Existing low-level tests already narrow the search space: - `src/ops/tests.rs`: `test_flash_attention_prefill_hd256_matches_cpu_reference` - `src/ops/tests.rs`: `test_prefill_attention_hd256_batch_matches_cpu_reference` @@ -30,7 +30,7 @@ - `src/ops/tests.rs`: `test_argmax_tie_prefers_smallest_index` - `src/ops/tests.rs`: `test_argmax_tie_prefers_smallest_index_across_thread_strides` - Historical accuracy tooling was recorded for layer `0` prefill, but these paths are not present in the current tree after the model-crate split: - - `src/bin/qwen35_dump_layer0.rs` dumps pegainfer layer-0 checkpoints to JSON + - `src/bin/qwen35_dump_layer0.rs` dumps openinfer layer-0 checkpoints to JSON - `tools/accuracy/hf_dump_qwen35_layer0.py` dumps matching HF checkpoints on GPU - `tools/accuracy/compare_qwen35_dump.py` reports `max_abs` / `mean_abs` per checkpoint - `src/bin/qwen35_dump_decode_layer_ids.rs` dumps the real production-path incremental step for an explicit token-id prefix @@ -55,11 +55,11 @@ Verified on RTX 5090 `sm_120` with Triton 3.4.0 for build-time AOT: export MODEL_PATH=/path/to/Qwen3.5-4B export TRITON_PYTHON=/path/to/triton34-venv/bin/python -PEGAINFER_CUDA_SM=120 \ -PEGAINFER_TRITON_PYTHON=$TRITON_PYTHON \ -PEGAINFER_TEST_MODEL_PATH=$MODEL_PATH \ -PEGAINFER_TEST_MODEL_REVISION=851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a \ -cargo test --release -p pegainfer-qwen35-4b --test hf_golden_gate -- --nocapture +OPENINFER_CUDA_SM=120 \ +OPENINFER_TRITON_PYTHON=$TRITON_PYTHON \ +OPENINFER_TEST_MODEL_PATH=$MODEL_PATH \ 
+OPENINFER_TEST_MODEL_REVISION=851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a \ +cargo test --release -p openinfer-qwen35-4b --test hf_golden_gate -- --nocapture ``` Observed floor from that run: @@ -92,11 +92,11 @@ Verified on RTX 5090 `sm_120` with CUDA 12.8 and Triton 3.4.0 for build-time AOT export MODEL_PATH=/path/to/Qwen3.5-4B export TRITON_PYTHON=/path/to/triton34-venv/bin/python -PEGAINFER_CUDA_SM=120 \ -PEGAINFER_TRITON_PYTHON=$TRITON_PYTHON \ -PEGAINFER_TEST_MODEL_PATH=$MODEL_PATH \ -PEGAINFER_TEST_MODEL_REVISION=851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a \ -cargo test --release -p pegainfer-qwen35-4b --test hf_golden_gate -- --nocapture +OPENINFER_CUDA_SM=120 \ +OPENINFER_TRITON_PYTHON=$TRITON_PYTHON \ +OPENINFER_TEST_MODEL_PATH=$MODEL_PATH \ +OPENINFER_TEST_MODEL_REVISION=851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a \ +cargo test --release -p openinfer-qwen35-4b --test hf_golden_gate -- --nocapture ``` Observed long-prompt floor from that run: @@ -116,12 +116,12 @@ cached `openai/gsm8k` dataset snapshot. export MODEL_PATH=/path/to/Qwen3.5-4B export TRITON_PYTHON=/path/to/triton34-venv/bin/python export LM_EVAL_BIN=/path/to/lm_eval -export RESULT_ROOT=results/qwen35-gsm8k-8shot-pegainfer-issue250 +export RESULT_ROOT=results/qwen35-gsm8k-8shot-openinfer-issue250 # Terminal 1: start the server. 
-PEGAINFER_CUDA_SM=120 \ -PEGAINFER_TRITON_PYTHON=$TRITON_PYTHON \ -cargo +nightly run --release -p pegainfer-server --bin pegainfer -- \ +OPENINFER_CUDA_SM=120 \ +OPENINFER_TRITON_PYTHON=$TRITON_PYTHON \ +cargo +nightly run --release -p openinfer-server --bin openinfer -- \ --model-path "$MODEL_PATH" \ --served-model-name qwen35-eval \ --port 18082 @@ -137,7 +137,7 @@ $LM_EVAL_BIN run \ ``` Result file: -`results/qwen35-gsm8k-8shot-pegainfer-issue250/qwen35-eval/results_*.json` +`results/qwen35-gsm8k-8shot-openinfer-issue250/qwen35-eval/results_*.json` | Filter | exact_match | stderr | Delta vs HF 79.45% | | --- | ---: | ---: | ---: | @@ -150,7 +150,7 @@ admission, non-greedy sampling, or `batch_size > 1` task-score evals. ### Deferred rand/hash corpus -Issue #186 also discussed a larger PegaInfer-owned rand/hash regression corpus after the HF gate is trusted. That idea is still useful, but checked-in exact token/hash data may depend on GPU architecture and CUDA stack. Do not land it as a normal regression gate until the corpus policy says whether it is per-arch, tolerance-adjudicated through HF, or generated only as a local diagnostic. +Issue #186 also discussed a larger OpenInfer-owned rand/hash regression corpus after the HF gate is trusted. That idea is still useful, but checked-in exact token/hash data may depend on GPU architecture and CUDA stack. Do not land it as a normal regression gate until the corpus policy says whether it is per-arch, tolerance-adjudicated through HF, or generated only as a local diagnostic. ## Progress Log @@ -263,12 +263,12 @@ The real break happens on the first decode step after prefill: - `prefill_next_token` matches HF: `23066` - `decode_next_token` does **not** match HF: - HF: `23066` - - pegainfer: `213603` + - openinfer: `213603` - `decode_logits` vs HF: - `max_abs=23.75` - `mean_abs≈4.10` -Most importantly, this is not just an HF mismatch. 
pegainfer decode is also inconsistent with pegainfer prefill: +Most importantly, this is not just an HF mismatch. openinfer decode is also inconsistent with openinfer prefill: - compare `decode_logits` after `65`-token prefill + one decode step - against longer-prefill logits for the equivalent `66`-token prompt @@ -285,7 +285,7 @@ Interpretation: ### 2026-03-27 — fixed `conv1d_prefill` state handoff, decode consistency restored -After replacing the HD256 decode attention kernel with the validated prefill path, first-decode HF mismatch improved but did not disappear. The next step was to compare incremental decode against fresh full-prefill inside pegainfer itself. +After replacing the HD256 decode attention kernel with the validated prefill path, first-decode HF mismatch improved but did not disappear. The next step was to compare incremental decode against fresh full-prefill inside openinfer itself. On prompt: @@ -372,14 +372,14 @@ Current result: - fails immediately on `Hello` - because `test_data/Qwen3.5-4B.json` was the old self-generated baseline - a fresh candidate baseline was generated to: - - [target/accuracy/Qwen3.5-4B.current.json]($LOCAL_PEGAINFER_DIR/target/accuracy/Qwen3.5-4B.current.json) + - [target/accuracy/Qwen3.5-4B.current.json]($LOCAL_OPENINFER_DIR/target/accuracy/Qwen3.5-4B.current.json) Important new finding while checking remaining HF mismatches: - for `python_prime`, after the common HF prefix of one generated token, the residual difference is already small enough that top logits are tied - HF exact-token-id prefill on that prefix has max-logit tokens: - `[32, 1206]` -- pegainfer exact-token-id prefill on that prefix has max-logit tokens: +- openinfer exact-token-id prefill on that prefix has max-logit tokens: - `[727, 1206]` - all of these tied tokens are at logit `20.0` in bf16/f32 dump @@ -445,7 +445,7 @@ New tooling: Important correction: -- pegainfer's production decode path currently does **not** run the old per-layer decode kernels 
+- openinfer's production decode path currently does **not** run the old per-layer decode kernels - `Qwen35Model::decode_one_token()` now reuses `prefill_forward(&[token])` - an earlier manual incremental dump implementation that walked `decode_full_attention_layer()` / `decode_linear_attention_layer()` directly produced large false mismatches and was corrected to mirror the real runtime path @@ -472,13 +472,13 @@ But later in the stack, the same step shows the familiar cumulative drift again: - layer `31` `attn_out`: `max_abs=0.046875`, `mean_abs≈2.93e-03` - layer `31` `layer_out`: `max_abs=0.203125`, `mean_abs≈1.29e-02` -Crucially, this later-layer gap is not unique to pegainfer. On the same exact prefix, HF's own incremental decode also separates from HF full-prefill by a similar amount at layer `31`: +Crucially, this later-layer gap is not unique to openinfer. On the same exact prefix, HF's own incremental decode also separates from HF full-prefill by a similar amount at layer `31`: - HF incremental vs HF full-prefill, layer `31` `layer_input`: `max_abs=0.125`, `mean_abs≈8.28e-03` - HF incremental vs HF full-prefill, layer `31` `attn_out`: `max_abs=0.015625`, `mean_abs≈2.70e-03` - HF incremental vs HF full-prefill, layer `31` `layer_out`: `max_abs=0.125`, `mean_abs≈1.07e-02` -pegainfer shows the same qualitative pattern on the same step: +openinfer shows the same qualitative pattern on the same step: - peg incremental vs peg full-prefill, layer `31` `layer_input`: `max_abs=0.125`, `mean_abs≈8.52e-03` - peg incremental vs peg full-prefill, layer `31` `attn_out`: `max_abs=0.015625`, `mean_abs≈2.82e-03` diff --git a/docs/models/qwen35/kernel-plan.md b/docs/models/qwen35/kernel-plan.md index c5a16224..ad431f95 100644 --- a/docs/models/qwen35/kernel-plan.md +++ b/docs/models/qwen35/kernel-plan.md @@ -1,6 +1,6 @@ # Qwen3.5 Kernel Plan -> **TL;DR:** Qwen3.5-4B has a `pegainfer_qwen35_4b::kernel_plan()` static descriptor mirroring the qwen3-4b module — enumerates every 
prefill / decode / unified op with its Rust call site, backend, and notes, so you can dump the active kernel mix without reading call sites. +> **TL;DR:** Qwen3.5-4B has an `openinfer_qwen35_4b::kernel_plan()` static descriptor mirroring the qwen3-4b module — enumerates every prefill / decode / unified op with its Rust call site, backend, and notes, so you can dump the active kernel mix without reading call sites. > > **Last touched:** 2026-06 @@ -10,7 +10,7 @@ Qwen3-4B centralized runtime kernel selection in a `kernel_plan` module — one Qwen3.5-4B historically hardwired kernel choices at call sites. There was no way to see which variants a given run used without reading code. -The refactor (issue #256) adds a qwen35 counterpart: a `src/kernel_plan.rs` module that mirrors the qwen3 structure and is exposed via `pegainfer_qwen35_4b::kernel_plan()`. +The refactor (issue #256) adds a qwen35 counterpart: a `src/kernel_plan.rs` module that mirrors the qwen3 structure and is exposed via `openinfer_qwen35_4b::kernel_plan()`. ## What the plan covers @@ -25,7 +25,7 @@ Each `KernelOp` records the Rust call site (path through the crate), the runtime ## How to use it ```rust -use pegainfer_qwen35_4b::kernel_plan; +use openinfer_qwen35_4b::kernel_plan; let plan = kernel_plan(); println!("model: {}", plan.model); @@ -41,7 +41,7 @@ Or, for JSON, walk the structure and serialize. (No built-in JSON helper; the da ## What's NOT in scope (yet) -- **No `qwen35_kernel_report.rs` bin.** The qwen3 counterpart (`pegainfer-qwen3-4b/src/bin/qwen3_kernel_report.rs`) is a CUPTI-driven per-op microbench with manifest-driven variant sweeps. That's a much larger piece of work — out of scope for the "pure refactor, no kernel behavior change" boundary in #256. +- **No `qwen35_kernel_report.rs` bin.** The qwen3 counterpart (`openinfer-qwen3-4b/src/bin/qwen3_kernel_report.rs`) is a CUPTI-driven per-op microbench with manifest-driven variant sweeps. 
That's a much larger piece of work — out of scope for the "pure refactor, no kernel behavior change" boundary in #256. - **No `kernel_manifests/qwen35-4b.toml` either**, since no kernel_report bin consumes it. - **No actual selection logic.** Like qwen3, the plan is descriptive only — it documents the call sites, it doesn't dispatch between them. If/when a kernel variant choice depends on shape (e.g., CTA size for prefill attention), that decision still happens at the call site. The plan is the **observability** layer, not a policy engine. @@ -53,7 +53,7 @@ Or, for JSON, walk the structure and serialize. (No built-in JSON helper; the da ## See also -- `pegainfer-qwen3-4b/src/kernel_plan.rs` — the reference implementation this is modeled on. -- `pegainfer-qwen3-4b/src/bin/qwen3_kernel_report.rs` — full CUPTI kernel report runner (future work, not in this refactor). -- `pegainfer-qwen3-4b/kernel_manifests/qwen3-4b.toml` — manifest consumed by the qwen3 report runner. +- `openinfer-qwen3-4b/src/kernel_plan.rs` — the reference implementation this is modeled on. +- `openinfer-qwen3-4b/src/bin/qwen3_kernel_report.rs` — full CUPTI kernel report runner (future work, not in this refactor). +- `openinfer-qwen3-4b/kernel_manifests/qwen3-4b.toml` — manifest consumed by the qwen3 report runner. - Issue #256 — "qwen35: no kernel_plan — decode kernel picks are hardwired". diff --git a/docs/models/qwen35/kv-admission.md b/docs/models/qwen35/kv-admission.md index 1893bd54..c60df8f1 100644 --- a/docs/models/qwen35/kv-admission.md +++ b/docs/models/qwen35/kv-admission.md @@ -14,10 +14,10 @@ - `docs/subsystems/scheduler/scheduler.md` - explains paged KV, scheduler ownership, and why pressure evidence needs a real serving run plus post-pressure completion. - `docs/models/qwen35/model-crate.md` - confirms Qwen3.5 scheduler/runtime ownership and test paths. 
- GitHub issue #254 - desired outcome is full-lifetime admission, clean rejection for impossible requests, and no batch-wide abort from KV exhaustion. - - `pegainfer-qwen35-4b/src/scheduler.rs` - production scheduler currently calls prompt-only admission and reports execution errors as normal finishes in several paths. - - `pegainfer-qwen35-4b/src/scheduler/plan.rs` - CPU-testable admission seam currently reserves prompt pages only. - - `pegainfer-qwen3-4b/src/scheduler.rs` - reference implementation for `prompt_len + max_tokens - 1` KV accounting and impossible-request rejection. - - `pegainfer-core/src/kv_pool.rs` - confirms `KvState::ensure_capacity` grows physical pages lazily and pool capacity includes the reserved padding page. + - `openinfer-qwen35-4b/src/scheduler.rs` - production scheduler currently calls prompt-only admission and reports execution errors as normal finishes in several paths. + - `openinfer-qwen35-4b/src/scheduler/plan.rs` - CPU-testable admission seam currently reserves prompt pages only. + - `openinfer-qwen3-4b/src/scheduler.rs` - reference implementation for `prompt_len + max_tokens - 1` KV accounting and impossible-request rejection. + - `openinfer-core/src/kv_pool.rs` - confirms `KvState::ensure_capacity` grows physical pages lazily and pool capacity includes the reserved padding page. - **Relevant history**: - `docs/models/qwen3/kv-pressure-hang.md` - the original failure mode kept the server alive while completions hung, so validation must include both pressure result and a post-pressure completion. - `docs/models/qwen35/roadmap.md` - the current #255 scheduler seam should host policy changes so they remain CPU-testable. @@ -34,13 +34,13 @@ ## Execution Log ### Step 1: Qwen3.5 admission policy -- Updated `pegainfer-qwen35-4b/src/scheduler/plan.rs` so pending requests are sized by full lifetime KV demand: `prompt_len + max_tokens - 1`. 
+- Updated `openinfer-qwen35-4b/src/scheduler/plan.rs` so pending requests are sized by full lifetime KV demand: `prompt_len + max_tokens - 1`. - Added active-request budgeting with `ActiveKvBudget`. Active requests subtract only their remaining future page growth from `available_pages`, because `KvState` already holds their current pages through the shared `PagePool`. - Preserved Qwen3.5's existing FCFS deferral policy after the first temporary budget miss. Requests larger than `max_request_pages` are rejected and do not block later fitting requests. - Added a release assertion for the invariant that an active request's current KV pages cannot exceed its admitted lifetime pages. ### Step 2: Scheduler event semantics -- Updated `pegainfer-qwen35-4b/src/scheduler.rs` to build active budgets, pass the usable single-request cap (`capacity_pages - 1`, excluding the CUDA Graph padding page), and emit `TokenEvent::Rejected` for impossible requests. +- Updated `openinfer-qwen35-4b/src/scheduler.rs` to build active budgets, pass the usable single-request cap (`capacity_pages - 1`, excluding the CUDA Graph padding page), and emit `TokenEvent::Rejected` for impossible requests. - Converted Qwen3.5 execution/sampling failure paths from fake `Finished(Stop)` to `TokenEvent::Error`, so request failures surface as errors instead of clean stops. - Rejection message includes the prompt length and full lifetime request demand: - `request requires more KV pages than this model instance can provide: prompt_tokens=..., max_request_tokens=...` @@ -56,7 +56,7 @@ - direct `send_rejection` event shape. - After PR review, removed the fake scheduler-loop seam and fake loop tests. The loop remains concrete; pure admission policy stays in `scheduler/plan.rs`, and runtime shell behavior is covered by e2e plus the direct bench rejection gate. 
- Kept the frontend bridge test proving `TokenEvent::Rejected` maps to an error finish: - - `cargo test --offline --release -p pegainfer-vllm-frontend rejected_request_is_reported_as_error --lib -- --nocapture` passed, `1 passed`. + - `cargo test --offline --release -p openinfer-vllm-frontend rejected_request_is_reported_as_error --lib -- --nocapture` passed, `1 passed`. - Ran read-only DeepSeek diff reviews. Useful findings were handled by adding the active future-page direct test, a release assertion, the padding-page comment, and direct rejection-event coverage. Two findings were rejected after source checks: - `TokenEvent::Error` is an existing engine contract and the frontend consumes it. - `active.drain(..)` drops `ActiveRequest35`, and its owned `KvState` returns pages by RAII; no KV page leak was found there. @@ -64,7 +64,7 @@ ### Step 4: Remote setup and narrow gates - Remote validation host: - GPU: NVIDIA GeForce RTX 5090, driver `580.105.08`, 32607 MiB. - - CUDA toolkit: `/usr/local/cuda-12.8`, `PEGAINFER_CUDA_SM=120`. + - CUDA toolkit: `/usr/local/cuda-12.8`, `OPENINFER_CUDA_SM=120`. - Triton AOT Python: validation venv with Triton 3.6 for `sm_120`. - Model: `models/Qwen3.5-4B`, HF revision `851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a`. - Remote dependency fixes needed before validation: @@ -73,21 +73,21 @@ - restored FlashInfer CCCL headers from an existing CUDA 13 Python environment and used Triton 3.6 for `sm_120`. - Commands passed: - `cargo fmt --check` - - `cargo test --offline --release -p pegainfer-qwen35-4b --lib scheduler::plan -- --nocapture` - `14 passed`. - - `cargo test --offline --release -p pegainfer-qwen35-4b --lib -- --nocapture` - `22 passed` before the fake-loop test deletion. - - `cargo test --offline --release -p pegainfer-qwen35-4b send_rejection_reports_kv_lifetime_context --lib -- --nocapture` - `1 passed` before the rejection label rename and test rename. 
- - `cargo test --offline --release -p pegainfer-vllm-frontend rejected_request_is_reported_as_error --lib -- --nocapture` - `1 passed`. - - `cargo build --offline --release -p pegainfer-server` - passed with existing unused-import warnings in `pegainfer-server`. + - `cargo test --offline --release -p openinfer-qwen35-4b --lib scheduler::plan -- --nocapture` - `14 passed`. + - `cargo test --offline --release -p openinfer-qwen35-4b --lib -- --nocapture` - `22 passed` before the fake-loop test deletion. + - `cargo test --offline --release -p openinfer-qwen35-4b send_rejection_reports_kv_lifetime_context --lib -- --nocapture` - `1 passed` before the rejection label rename and test rename. + - `cargo test --offline --release -p openinfer-vllm-frontend rejected_request_is_reported_as_error --lib -- --nocapture` - `1 passed`. + - `cargo build --offline --release -p openinfer-server` - passed with existing unused-import warnings in `openinfer-server`. - Review-fix validation on the H20 host after deleting the fake seam and renaming the rejection label: - `cargo fmt --check` passed with nightly Rust. - - `cargo test --offline --release -p pegainfer-qwen35-4b send_rejection_reports_kv_lifetime_request_tokens --lib -- --nocapture` passed, `1 passed`. - - `cargo test --offline --release -p pegainfer-qwen35-4b --lib -- --nocapture` passed, `20 passed`. + - `cargo test --offline --release -p openinfer-qwen35-4b send_rejection_reports_kv_lifetime_request_tokens --lib -- --nocapture` passed, `1 passed`. + - `cargo test --offline --release -p openinfer-qwen35-4b --lib -- --nocapture` passed, `20 passed`. - H20 real-model e2e was not rerun because the host had no `models/Qwen3.5-4B/config.json`, and downloading `Qwen/Qwen3.5-4B` revision `851bf6e806efd8d0a36b00ddf55e13ccb7b8cd0a` failed with `Network is unreachable`. 
- - `cargo test --offline --release -p pegainfer-vllm-frontend rejected_request_is_reported_as_error --lib -- --nocapture` was blocked by the local vendored vLLM `proto/vllm_grpc.proto` path, so the prior frontend bridge pass and GitHub CPU check remain the evidence for that surface. + - `cargo test --offline --release -p openinfer-vllm-frontend rejected_request_is_reported_as_error --lib -- --nocapture` was blocked by the local vendored vLLM `proto/vllm_grpc.proto` path, so the prior frontend bridge pass and GitHub CPU check remain the evidence for that surface. ### Step 5: Real Qwen3.5 e2e - Command: - - `PEGAINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --offline --release -p pegainfer-qwen35-4b --test e2e_scheduler -- --nocapture` + - `OPENINFER_TEST_MODEL_PATH=models/Qwen3.5-4B cargo test --offline --release -p openinfer-qwen35-4b --test e2e_scheduler -- --nocapture` - Result before the extra release-assert hardening: - `test_e2e_qwen35_scheduler ... ok` - `1 passed`, finished in `12.95s`. @@ -96,7 +96,7 @@ - `1 passed`, finished in `11.04s`. ### Step 6: HTTP pressure validation - Started the real OpenAI-compatible server: - - `./target/release/pegainfer --model-path models/Qwen3.5-4B --served-model-name issue254-qwen35 --port 18082` + - `./target/release/openinfer --model-path models/Qwen3.5-4B --served-model-name issue254-qwen35 --port 18082` - Server startup facts: - `Qwen3.5 KV cache: 33289 pages (16644 MB), prefill scratch reserve: 3873 MB` - scheduler `max_batch=64` diff --git a/docs/models/qwen35/model-crate.md b/docs/models/qwen35/model-crate.md index 8153c58f..a65740d6 100644 --- a/docs/models/qwen35/model-crate.md +++ b/docs/models/qwen35/model-crate.md @@ -1,7 +1,7 @@ # Qwen3.5-4B Model Crate **Created**: 2026-05-05 -**TL;DR**: `pegainfer-qwen35-4b` now owns Qwen3.5 config, weights, prefill/decode/unified forward, recurrent state, scheduler, recurrent op wrappers, scheduler integration tests, and Qwen3.5 op benches. 
Root `pegainfer` loads Qwen3.5 through `pegainfer_qwen35_4b::start_engine(...)` / generic `EngineHandle`; root no longer exposes `pegainfer::model::Qwen35Model` or `pegainfer::scheduler_qwen35`. The original exact-text e2e/regen tests described in this migration record were later retired by the HF logits gate in `docs/models/qwen35/accuracy.md`. +**TL;DR**: `openinfer-qwen35-4b` now owns Qwen3.5 config, weights, prefill/decode/unified forward, recurrent state, scheduler, recurrent op wrappers, scheduler integration tests, and Qwen3.5 op benches. Root `openinfer` loads Qwen3.5 through `openinfer_qwen35_4b::start_engine(...)` / generic `EngineHandle`; root no longer exposes `openinfer::model::Qwen35Model` or `openinfer::scheduler_qwen35`. The original exact-text e2e/regen tests described in this migration record were later retired by the HF logits gate in `docs/models/qwen35/accuracy.md`. **Last touched**: 2026-06 ## Preparation @@ -11,16 +11,16 @@ - `docs/models/qwen3/model-crate.md` - Qwen3 already owns its scheduler, executor/runtime API, tests, benches, and root-facing `EngineHandle` entry. - `docs/models/qwen35/accuracy.md` - at the time of this migration, Qwen3.5 e2e tests were regression guards against `test_data/Qwen3.5-4B.json`; current accuracy coverage is the HF logits gate recorded there. - `docs/models/qwen35/optimization.md` - Qwen3.5 should keep its hybrid linear/full-attention scheduler/state architecture. - - GitHub issue #79 - acceptance criteria require `pegainfer-qwen35-4b`, removal of root `pegainfer::model::Qwen35Model` and `pegainfer::scheduler_qwen35`, generic root `bench_serving`, and CUDA validation. - - `Cargo.toml`, `src/lib.rs`, `src/main.rs`, `src/ops.rs`, `src/scheduler.rs`, `src/model/qwen35.rs`, and `pegainfer-qwen3-4b/src/lib.rs` - mapped the current root Qwen3.5 surface and the Qwen3 crate interface to copy. 
+ - GitHub issue #79 - acceptance criteria require `openinfer-qwen35-4b`, removal of root `openinfer::model::Qwen35Model` and `openinfer::scheduler_qwen35`, generic root `bench_serving`, and CUDA validation. + - `Cargo.toml`, `src/lib.rs`, `src/main.rs`, `src/ops.rs`, `src/scheduler.rs`, `src/model/qwen35.rs`, and `openinfer-qwen3-4b/src/lib.rs` - mapped the current root Qwen3.5 surface and the Qwen3 crate interface to copy. - **Relevant history**: - `docs/models/qwen3/model-crate.md` - root should load model crates through `EngineHandle`; model-owned execution details should move behind crate-local modules. - **Plan**: - 1. Add `pegainfer-qwen35-4b` to the workspace with dependencies mirroring the Qwen3 crate plus the root dependencies Qwen3.5 currently uses. - 2. Move `src/model/qwen35.rs`, `src/model/qwen35/*`, `src/scheduler_qwen35.rs`, and Qwen3.5 recurrent op wrappers into the new crate, keeping CUDA/Triton kernel sources and FFI in `pegainfer-kernels`. - 3. Rewrite imports so the new crate depends on `pegainfer-core` and `pegainfer-kernels`, not on root `pegainfer`. - 4. Expose `start_engine`, `start_engine_with_capacity`, and a deliberate `runtime` module from `pegainfer-qwen35-4b`. - 5. Update root `main.rs` and `src/bin/bench_serving.rs` to call `pegainfer_qwen35_4b::start_engine`. + 1. Add `openinfer-qwen35-4b` to the workspace with dependencies mirroring the Qwen3 crate plus the root dependencies Qwen3.5 currently uses. + 2. Move `src/model/qwen35.rs`, `src/model/qwen35/*`, `src/scheduler_qwen35.rs`, and Qwen3.5 recurrent op wrappers into the new crate, keeping CUDA/Triton kernel sources and FFI in `openinfer-kernels`. + 3. Rewrite imports so the new crate depends on `openinfer-core` and `openinfer-kernels`, not on root `openinfer`. + 4. Expose `start_engine`, `start_engine_with_capacity`, and a deliberate `runtime` module from `openinfer-qwen35-4b`. + 5. Update root `main.rs` and `src/bin/bench_serving.rs` to call `openinfer_qwen35_4b::start_engine`. 
6. Move Qwen3.5 e2e tests and regen test into the model crate; adjust model/test-data paths after the move. 7. Remove root Qwen3.5 modules and compatibility exports, then audit root with `rg`. 8. Verify with `cargo fmt --all --check`, `cargo metadata --no-deps --format-version 1`, and the CUDA-capable build/test commands available on this machine. @@ -31,7 +31,7 @@ ## Execution Log ### Step 1: Add model crate and move Qwen3.5 runtime -- Added `pegainfer-qwen35-4b` to the workspace and root dependencies. +- Added `openinfer-qwen35-4b` to the workspace and root dependencies. - Moved Qwen3.5-owned runtime files out of root: - `src/model/qwen35.rs` - `src/model/qwen35/*` @@ -45,12 +45,12 @@ ### Step 2: Move tests and benches - Moved root Qwen3.5 tests to the model crate at the time: - - `pegainfer-qwen35-4b/tests/e2e.rs` - - `pegainfer-qwen35-4b/tests/e2e_scheduler.rs` - - `pegainfer-qwen35-4b/tests/regen_test_data.rs` + - `openinfer-qwen35-4b/tests/e2e.rs` + - `openinfer-qwen35-4b/tests/e2e_scheduler.rs` + - `openinfer-qwen35-4b/tests/regen_test_data.rs` - The exact-text `e2e.rs` and `regen_test_data.rs` were later removed by the Qwen3.5 HF logits gate work; `e2e_scheduler.rs` remains as request-flow coverage. -- Moved Qwen3.5-specific op benches to `pegainfer-qwen35-4b/benches/qwen35_ops.rs`. -- Moved the `conv1d_prefill_handoff_matches_single_prefill` operator test into `pegainfer-qwen35-4b/src/recurrent.rs`, next to the wrapper it validates. +- Moved Qwen3.5-specific op benches to `openinfer-qwen35-4b/benches/qwen35_ops.rs`. +- Moved the `conv1d_prefill_handoff_matches_single_prefill` operator test into `openinfer-qwen35-4b/src/recurrent.rs`, next to the wrapper it validates. - Removed Qwen3.5-specific GEMV shapes from the root generic `ops_bench`; the model-specific benches now live with Qwen3.5. 
### Step 3: Remove root Qwen3.5 compatibility surface @@ -60,26 +60,26 @@ - `src/model.rs` - `src/ffi.rs` - `src/kv_pool.rs` -- Root `main.rs` now calls `pegainfer_qwen35_4b::start_engine(...)` for Qwen3.5. -- Root `bench_serving` now calls `pegainfer_qwen35_4b::start_engine_with_capacity(...)` and still benchmarks via generic `EngineHandle`. +- Root `main.rs` now calls `openinfer_qwen35_4b::start_engine(...)` for Qwen3.5. +- Root `bench_serving` now calls `openinfer_qwen35_4b::start_engine_with_capacity(...)` and still benchmarks via generic `EngineHandle`. - The Qwen3.5 engine entry honors a single `EngineLoadOptions.device_ordinals` value and rejects multi-device input, matching the current single-GPU implementation instead of silently ignoring the option. -- `rg` confirms there are no root references to `pegainfer::model::Qwen35Model`, `pegainfer::scheduler_qwen35`, or `src/model/qwen35`. +- `rg` confirms there are no root references to `openinfer::model::Qwen35Model`, `openinfer::scheduler_qwen35`, or `src/model/qwen35`. 
### Step 4: Validation - Passed: - `cargo metadata --no-deps --format-version 1` - `cargo fmt --all --check` - - `PEGAINFER_CUDA_SM=120 cargo check --release --workspace --all-targets` - - `PEGAINFER_CUDA_SM=120 cargo clippy --release --workspace --all-targets -- -D warnings` - - `PEGAINFER_CUDA_SM=120 cargo build --release` - - `PEGAINFER_CUDA_SM=120 cargo test --release -p pegainfer-qwen35-4b recurrent::tests::conv1d_prefill_handoff_matches_single_prefill -- --nocapture` - - `PEGAINFER_CUDA_SM=120 cargo run --release --bin bench_serving -- --model-path $LOCAL_PEGAINFER_DIR/models/Qwen3.5-4B request --prompt-len 1 --output-len 1 --warmup 0 --iters 1` + - `OPENINFER_CUDA_SM=120 cargo check --release --workspace --all-targets` + - `OPENINFER_CUDA_SM=120 cargo clippy --release --workspace --all-targets -- -D warnings` + - `OPENINFER_CUDA_SM=120 cargo build --release` + - `OPENINFER_CUDA_SM=120 cargo test --release -p openinfer-qwen35-4b recurrent::tests::conv1d_prefill_handoff_matches_single_prefill -- --nocapture` + - `OPENINFER_CUDA_SM=120 cargo run --release --bin bench_serving -- --model-path $LOCAL_OPENINFER_DIR/models/Qwen3.5-4B request --prompt-len 1 --output-len 1 --warmup 0 --iters 1` - Initial Qwen3.5 e2e failure: - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH=$LOCAL_PEGAINFER_DIR/models/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test e2e -- --nocapture` - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH=$LOCAL_PEGAINFER_DIR/models/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test e2e_scheduler -- --nocapture` + - `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH=$LOCAL_OPENINFER_DIR/models/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test e2e -- --nocapture` + - `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH=$LOCAL_OPENINFER_DIR/models/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test e2e_scheduler -- --nocapture` - Both initially produced all-case gibberish-output mismatches. 
- Control run: - - A temporary old-HEAD worktree at `$RESULT_ROOT/pegainfer-head` ran `PEGAINFER_CUDA_SM=120 PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python PEGAINFER_TEST_MODEL_PATH=$LOCAL_PEGAINFER_DIR/models/Qwen3.5-4B CARGO_TARGET_DIR=$RESULT_ROOT/pegainfer-head-target cargo test --release --test e2e_qwen35 -- --nocapture`. + - A temporary old-HEAD worktree at `$RESULT_ROOT/openinfer-head` ran `OPENINFER_CUDA_SM=120 OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python OPENINFER_TEST_MODEL_PATH=$LOCAL_OPENINFER_DIR/models/Qwen3.5-4B CARGO_TARGET_DIR=$RESULT_ROOT/openinfer-head-target cargo test --release --test e2e_qwen35 -- --nocapture`. - Old HEAD failed the same way on all 10 Qwen3.5 cases, so the e2e mismatch predated this crate split. - Follow-up fix: - `docs/lessons/exact-match-gate-thread-cublas.md` identified the first gibberish commit as `6a5b826`, fixed Qwen3.5 scheduler thread CUDA/cuBLAS binding, kept greedy sampling on FlashInfer top1, and refreshed the exact Qwen3.5 golden for the default engine shape. @@ -89,9 +89,9 @@ - **Outcome**: Qwen3.5 is now an independent model crate with the same root-facing engine style as Qwen3-4B. Root retains model detection/frontend/bench orchestration, but not Qwen3.5 model internals. The follow-up e2e corruption fix restored the then-current exact-text e2e and scheduler e2e; the exact-text gate was later retired in favor of the HF logits gate. - **Pitfalls encountered**: - - The first e2e run used a relative `PEGAINFER_TEST_MODEL_PATH`; package tests execute with a crate-oriented working directory, so absolute model paths are safer for crate-local tests. + - The first e2e run used a relative `OPENINFER_TEST_MODEL_PATH`; package tests execute with a crate-oriented working directory, so absolute model paths are safer for crate-local tests. 
- Qwen3.5 e2e initially looked like a crate-split regression, but git history showed the corruption started earlier when cuBLAS handles became thread-local without equivalent Qwen3.5 scheduler thread binding. - Moving recurrent wrappers out of root exposed stale root compatibility re-exports (`src/ffi.rs`, `src/kv_pool.rs`, and root Qwen3.5 ops bench shapes), which were removed. - **Lessons learned**: - Model-local benches need a deliberate public surface. `runtime_ops` is intentionally narrow and only exposes the Qwen3.5 operator wrappers needed by Qwen3.5 benches. - - Qwen3.5 test docs should use absolute `PEGAINFER_TEST_MODEL_PATH` examples when run from the workspace, because package test working directories can make relative paths misleading. + - Qwen3.5 test docs should use absolute `OPENINFER_TEST_MODEL_PATH` examples when run from the workspace, because package test working directories can make relative paths misleading. diff --git a/docs/models/qwen35/optimization.md b/docs/models/qwen35/optimization.md index c89a57b4..b5af47dc 100644 --- a/docs/models/qwen35/optimization.md +++ b/docs/models/qwen35/optimization.md @@ -2,13 +2,13 @@ > **TL;DR:** Hybrid 24 linear + 8 full attn. At parity with vLLM: TTFT `234ms` (+2%), TPOT `11.77ms` (+1%) — see the [E2E Dashboard](#e2e-dashboard) for the authoritative ledger. After the accuracy-parity refactor (#40) regressed decode by +4%, restoring the dedicated GDR decode kernel (#9) recovered it fully. > -> **Last touched:** 2026-06. Qwen3.5 runtime code lives in top-level `pegainfer-qwen35-4b`; root `bench_serving` loads it through the generic `EngineHandle`. Current accuracy coverage is `PEGAINFER_CUDA_SM=120 PEGAINFER_TEST_MODEL_PATH= cargo test --release -p pegainfer-qwen35-4b --test hf_golden_gate -- --nocapture`; run `e2e_scheduler` when scheduler request-flow behavior changes. The old exact-text e2e/regen baseline was retired by the HF logits gate in `docs/models/qwen35/accuracy.md`. +> **Last touched:** 2026-06. 
Qwen3.5 runtime code lives in top-level `openinfer-qwen35-4b`; root `bench_serving` loads it through the generic `EngineHandle`. Current accuracy coverage is `OPENINFER_CUDA_SM=120 OPENINFER_TEST_MODEL_PATH= cargo test --release -p openinfer-qwen35-4b --test hf_golden_gate -- --nocapture`; run `e2e_scheduler` when scheduler request-flow behavior changes. The old exact-text e2e/regen baseline was retired by the HF logits gate in `docs/models/qwen35/accuracy.md`. -Historical command logs below keep the command paths that were actually run at the time. For new Qwen3.5 accuracy tests, use `-p pegainfer-qwen35-4b --test hf_golden_gate`; for serving benchmarks, continue using root `cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B ...`. +Historical command logs below keep the command paths that were actually run at the time. For new Qwen3.5 accuracy tests, use `-p openinfer-qwen35-4b --test hf_golden_gate`; for serving benchmarks, continue using root `cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B ...`. ## Goal -pegainfer single-request latency >= vLLM on Qwen3.5-4B, same GPU/workload. The original prefill-heavy gap is now mostly closed: chunk-wise GDR prefill gets `(2048,1)` TTFT to parity level on this GPU, so the remaining work is normal tuning and cleanup rather than a structural latency crisis. +openinfer single-request latency >= vLLM on Qwen3.5-4B, same GPU/workload. The original prefill-heavy gap is now mostly closed: chunk-wise GDR prefill gets `(2048,1)` TTFT to parity level on this GPU, so the remaining work is normal tuning and cleanup rather than a structural latency crisis. ## Known Caveat @@ -41,7 +41,7 @@ Historical exact-text baseline note: GPU: RTX 5070 Ti, Model: Qwen3.5-4B, vLLM 0.18.0, single concurrency. Both measured via `vllm bench serve` HTTP client (apples-to-apples). vLLM: torch.compile + CUDA Graph (`--max-num-seqs 1` to fit in 16 GB alongside desktop). 
-| Profile | Metric | pegainfer | vLLM | delta | +| Profile | Metric | openinfer | vLLM | delta | |---------|--------|-----------|------|-------| | prefill-heavy (2048,1) | TTFT median | 234.21ms | 229.25ms | +2% | | prefill-heavy (2048,1) | TTFT p99 | 375.65ms | 8822ms¹ | — | @@ -186,7 +186,7 @@ GEMV + MLP (cuBLAS + LM head + silu_mul) dominate at 91.6%. GDR is 2.8% after th Command used: ```bash -PEGAINFER_TRITON_PYTHON=./.venv/bin/python \ +OPENINFER_TRITON_PYTHON=./.venv/bin/python \ nsys profile --force-overwrite=true --trace=cuda,nvtx --cuda-graph-trace=node \ --export=sqlite -o target/profiling/qwen35_decode_1x128_20260327 \ cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B \ @@ -414,7 +414,7 @@ Trace note: this capture includes one warmup request plus one measured request, Stable request bench: ```bash -PEGAINFER_TRITON_PYTHON=./.venv/bin/python \ +OPENINFER_TRITON_PYTHON=./.venv/bin/python \ cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B \ request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3 ``` @@ -428,7 +428,7 @@ Result: `nsys` command used: ```bash -PEGAINFER_TRITON_PYTHON=./.venv/bin/python \ +OPENINFER_TRITON_PYTHON=./.venv/bin/python \ nsys profile --force-overwrite=true --trace=cuda,nvtx --cuda-graph-trace=node \ --export=sqlite -o target/profiling/qwen35_prefill_2048_gdr \ cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B \ @@ -469,7 +469,7 @@ Note: `cuMemcpyHtoDAsync_v2` dominates total API time in this whole-process trac Stable request bench: ```bash -PEGAINFER_TRITON_PYTHON=./.venv/bin/python \ +OPENINFER_TRITON_PYTHON=./.venv/bin/python \ cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B \ request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3 ``` @@ -483,8 +483,8 @@ Result: Correctness refresh: ```bash -PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test e2e_qwen35 -- --nocapture 
-PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test gen_test_data_35 -- --nocapture +OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test e2e_qwen35 -- --nocapture +OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test gen_test_data_35 -- --nocapture ``` Result: @@ -574,10 +574,10 @@ Initial attempt — state layout transpose for coalescing — had no measurable 3. **Pass fusion:** Merged 4 separate state passes into 2: decay+kv_mem (pass 1), rank-1 update+output (pass 2). Eliminated the shared-memory `smem_delta` round-trip. **Validated commands:** -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo bench --bench ops_bench -- gated_delta_rule_decode` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test e2e_qwen35 -- --nocapture` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 1 --output-len 128 --warmup 3 --iters 5` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo bench --bench ops_bench -- gated_delta_rule_decode` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test e2e_qwen35 -- --nocapture` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 1 --output-len 128 --warmup 3 --iters 5` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3` - `cargo test --release --test e2e -- --nocapture` (Qwen3 unaffected) **Results:** @@ -596,10 +596,10 @@ Initial attempt — state layout transpose for coalescing — had no measurable **Changes:** Added explicit chunk-wise scratch buffers, added 
Triton AOT stages for `gdr_prepare_qkv_gbeta`, `chunk_local_cumsum`, `chunk_scaled_dot_kkt`, `solve_tril_64`, `recompute_w_u`, `chunk_state`, and `chunk_o`, rewired the real Qwen3.5 prefill path to launch this pipeline per linear-attention layer, and fixed the main correctness bug in `gdr_chunk_state_qwen35_kernel`: `v_new` must be written back ungated and only the recurrent update should use the gated form. The chunk-wise solve/recompute/state/output kernels are adapted from FLA and now carry explicit source attribution in `tools/triton/gated_delta_rule_chunkwise_kernels.py`. **Validated commands:** -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo check --release` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test e2e_qwen35 -- --nocapture` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test gen_test_data_35 -- --nocapture` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo check --release` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test e2e_qwen35 -- --nocapture` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test gen_test_data_35 -- --nocapture` **Results:** - Prefill-heavy `(2048,1)`: TTFT avg `222.45ms`, p50 `222.55ms`, p99 `222.85ms` @@ -619,10 +619,10 @@ Initial attempt — state layout transpose for coalescing — had no measurable **Changes:** Added a temporary Triton fused-recurrent GDR prefill kernel and its AOT build wiring, exposed it through FFI, added batched `conv1d -> GDR -> gated_norm` operator plumbing in `ops.rs`, rewired `Qwen35Model` linear prefill to use that path, and added a 
standalone prefill microbench entry. This path was later superseded by the chunk-wise implementation in `#7`. **Validated commands:** -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 1 --output-len 128 --warmup 1 --iters 3` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test gen_test_data_35 -- --nocapture` -- `PEGAINFER_TRITON_PYTHON=./.venv/bin/python nsys profile --force-overwrite=true --trace=cuda,nvtx --cuda-graph-trace=node --export=sqlite -o target/profiling/qwen35_prefill_2048_gdr cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 1` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 3` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 1 --output-len 128 --warmup 1 --iters 3` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python cargo test --release --test gen_test_data_35 -- --nocapture` +- `OPENINFER_TRITON_PYTHON=./.venv/bin/python nsys profile --force-overwrite=true --trace=cuda,nvtx --cuda-graph-trace=node --export=sqlite -o target/profiling/qwen35_prefill_2048_gdr cargo run --release --bin bench_serving -- --model-path models/Qwen3.5-4B request --prompt-len 2048 --output-len 1 --warmup 1 --iters 1` - `nsys stats --report cuda_gpu_kern_sum target/profiling/qwen35_prefill_2048_gdr.sqlite` - `nsys stats --report cuda_api_sum target/profiling/qwen35_prefill_2048_gdr.sqlite` @@ -728,8 +728,8 @@ Initial attempt — state layout transpose for coalescing — had no measurable 
### #0 Baseline (2026-03-14) **E2E numbers:** -- Prefill-heavy (2048,1): pegainfer 16,846ms vs vLLM 222ms (**76× slower**) -- Decode-heavy (1,128): pegainfer 12.55ms vs vLLM 11.64ms (+8%) +- Prefill-heavy (2048,1): openinfer 16,846ms vs vLLM 222ms (**76× slower**) +- Decode-heavy (1,128): openinfer 12.55ms vs vLLM 11.64ms (+8%) - Supplementary: prefill (128,1) TTFT 669ms → extrapolation confirms superlinear scaling due to O(n) attention per token **Decode verdict:** Close to parity. 12.55ms TPOT vs vLLM 11.64ms (+8%), fully GPU-bound, CUDA Graph'd. Slower than Qwen3-4B (10.6ms) due to 153 GEMV/step (vs ~109 for Qwen3-4B) — linear attention's extra projections (Z, B, A). GDR kernel adds 1.09ms (8.7%). Not worth optimizing until prefill is fixed. diff --git a/docs/playbooks/accuracy-parity-playbook.md b/docs/playbooks/accuracy-parity-playbook.md index 4334aa38..e4acd088 100644 --- a/docs/playbooks/accuracy-parity-playbook.md +++ b/docs/playbooks/accuracy-parity-playbook.md @@ -6,7 +6,7 @@ ## What Counts As "Correct" -- 真值是外部实现,不是 pegainfer 自己生成的 JSON。 +- 真值是外部实现,不是 openinfer 自己生成的 JSON。 - 对生成步,真值必须是 HF 的真实 incremental `past_key_values` 路径。 - 不要把“重建完整前缀后的 full-prefill”当生成步真值。它在后段层上可能连 HF 自己都不等于 HF incremental。 @@ -90,7 +90,7 @@ ``` 历史 case-level 覆盖率排查命令如下。`test_data/Qwen3.5-4B.json` -已经退役,新排查应优先走 `pegainfer-qwen35-4b/tests/hf_golden_gate.rs` +已经退役,新排查应优先走 `openinfer-qwen35-4b/tests/hf_golden_gate.rs` 或重建同等的 HF `past_key_values` token-id/logits dump: ```bash diff --git a/docs/playbooks/bench-vs-vllm.md b/docs/playbooks/bench-vs-vllm.md index 5af43a28..cefbe42f 100644 --- a/docs/playbooks/bench-vs-vllm.md +++ b/docs/playbooks/bench-vs-vllm.md @@ -1,4 +1,4 @@ -# pegainfer vs vLLM Comparative Benchmarking +# openinfer vs vLLM Comparative Benchmarking > **TL;DR:** Run both engines on the same GPU sequentially, benchmark with `vllm bench serve` as a unified client, compare TTFT/TPOT/throughput side by side. 
@@ -12,14 +12,14 @@ Key flags applied to both: `--ignore-eos` (forces exact output length), `--datas ``` MODEL_PATH=models/Qwen3-4B # or models/Qwen3.5-4B -VLLM_PYTHON=.venv/bin/python # pegainfer/.venv with vllm installed +VLLM_PYTHON=.venv/bin/python # openinfer/.venv with vllm installed VLLM_CMD=.venv/bin/vllm PORT=8000 RESULTS_DIR=bench_results/ ``` Prerequisites: -- `cargo build --release` — pegainfer up-to-date +- `cargo build --release` — openinfer up-to-date - vLLM installed in `.venv` (`uv pip install vllm`) - Kill any existing process on the port before starting @@ -50,16 +50,16 @@ $VLLM_CMD bench serve \ pkill -f "vllm serve" ``` -### 2. Benchmark pegainfer +### 2. Benchmark openinfer Same flow, different server: ```bash -# Start (Qwen3.5 requires PEGAINFER_TRITON_PYTHON for AOT Triton kernels) -RUST_LOG=warn PEGAINFER_TRITON_PYTHON=./.venv/bin/python \ +# Start (Qwen3.5 requires OPENINFER_TRITON_PYTHON for AOT Triton kernels) +RUST_LOG=warn OPENINFER_TRITON_PYTHON=./.venv/bin/python \ cargo run --release -- --model-path $MODEL_PATH --port $PORT & -# Poll until ready — pegainfer has no /v1/models; probe with a minimal completions request +# Poll until ready — openinfer has no /v1/models; probe with a minimal completions request until curl -sf http://localhost:$PORT/v1/completions \ -H "Content-Type: application/json" \ -d '{"model":"","prompt":"hi","max_tokens":1}' >/dev/null; do sleep 5; done @@ -74,7 +74,7 @@ $VLLM_CMD bench serve \ --result-filename pega-in-out.json # Cleanup -pkill -f "target/release/pegainfer" +pkill -f "target/release/openinfer" ``` ### 3. Compare @@ -102,13 +102,13 @@ Read both JSON results. Key metrics: - **vLLM cold start:** torch.compile triggers on the first 1–3 requests. With `n=10`, mean TTFT is inflated 5–50× and p99 is always a cold-start spike — neither is meaningful. **Read median only.** For stable p99, use `n>=30` (cold-start requests become a small tail of the distribution). 
Example: Qwen3.5-4B at (2048,1), n=10: mean=1279ms, median=222ms, p99=9846ms. - **vLLM prefix cache:** Disable with `vllm serve --no-enable-prefix-caching` for random synthetic prefill probes unless prefix-cache behavior is intentionally part of the experiment. A 2026-05-04 `input_len=4096,output_len=64` run showed vLLM prefix cache hits even on the random dataset. -- **Greedy must be explicit:** vLLM 0.19.1 `vllm bench serve` no longer forces `temperature=0`; pass `--temperature 0` in both vLLM and pegainfer runs when comparing kernel/runtime speed. +- **Greedy must be explicit:** vLLM 0.19.1 `vllm bench serve` no longer forces `temperature=0`; pass `--temperature 0` in both vLLM and openinfer runs when comparing kernel/runtime speed. - **Context limit includes output:** `--max-model-len` is prompt plus generated tokens. For `input-len=4096, output-len=64`, use at least `4160`; `8192` is the simple safe value on RTX 5090. - **vLLM serve log flag drift:** vLLM 0.19.1 rejects the old `--disable-log-requests` flag. Omit it unless `vllm serve --help=all` on that machine shows a supported equivalent. - **Text-only Qwen3.5 on vLLM:** Some Qwen3.5 checkpoints expose multimodal metadata. For text-only benchmarking, start `vllm serve` with `--language-model-only` (equivalent to `--limit-mm-per-prompt '{"image":0,"video":0}'`). -- **pegainfer empty prompts:** Random dataset may produce empty prompts which pegainfer rejects. Check failed request count. -- **pegainfer streaming usage:** The vLLM frontend can overreport streaming `usage.completion_tokens` in `vllm bench serve` results. For fixed-output probes, trust TTFT/TPOT/ITL and recompute output throughput from `num_prompts * output_len / duration` until usage accounting is fixed. +- **openinfer empty prompts:** Random dataset may produce empty prompts which openinfer rejects. Check failed request count. 
+- **openinfer streaming usage:** The vLLM frontend can overreport streaming `usage.completion_tokens` in `vllm bench serve` results. For fixed-output probes, trust TTFT/TPOT/ITL and recompute output throughput from `num_prompts * output_len / duration` until usage accounting is fixed. - **Zombie processes:** Always `pkill` after benchmarking. Leftover servers block the port and hold GPU memory. -- **CUDA Graph:** pegainfer enables CUDA Graph by default. For apples-to-apples decode comparison, note this in the report. vLLM also uses CUDA Graph by default. -- **Concurrency:** `--max-concurrency N` controls batch size. pegainfer supports continuous batching (bucket CUDA Graphs at [1,2,4,8,16,32,64]). Throughput scales near-linearly with concurrency for bandwidth-bound decode. +- **CUDA Graph:** openinfer enables CUDA Graph by default. For apples-to-apples decode comparison, note this in the report. vLLM also uses CUDA Graph by default. +- **Concurrency:** `--max-concurrency N` controls batch size. openinfer supports continuous batching (bucket CUDA Graphs at [1,2,4,8,16,32,64]). Throughput scales near-linearly with concurrency for bandwidth-bound decode. - **Qwen3.5-4B CUDA Graph OOM on 16 GB:** torch.compile + CUDA Graph needs ~1 GiB extra for graph profiling on top of the 12 GiB model+activation footprint. Workaround: `--max-num-seqs 1` reduces the graph capture to batch_size=1 only and fits in 16 GB. Add `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True` for marginal help. 
diff --git a/docs/playbooks/developer-onboarding.md b/docs/playbooks/developer-onboarding.md index dfadf20b..15a8a2be 100644 --- a/docs/playbooks/developer-onboarding.md +++ b/docs/playbooks/developer-onboarding.md @@ -1,4 +1,4 @@ -# Developer Onboarding: Setting Up the pegainfer Dev Environment from Scratch +# Developer Onboarding: Setting Up the openinfer Dev Environment from Scratch **Status**: Complete **TL;DR**: Full new-developer onboarding — toolchain check, unified venv, build, tests, benchmark smoke test. @@ -23,7 +23,7 @@ uv --version # Python package manager The project uses a single `.venv` for everything: triton (build dependency) and torch/transformers (reference scripts). ```bash -cd pegainfer/ +cd openinfer/ uv venv uv pip install triton torch transformers accelerate pytest ``` @@ -35,7 +35,7 @@ Verify: .venv/bin/python -c "import torch; print(torch.__version__, torch.cuda.is_available())" ``` -> build.rs auto-detects `.venv/bin/python` for Triton AOT compilation. Override with `PEGAINFER_TRITON_PYTHON` if needed. +> build.rs auto-detects `.venv/bin/python` for Triton AOT compilation. Override with `OPENINFER_TRITON_PYTHON` if needed. ## 3. Build @@ -43,13 +43,13 @@ Verify: cargo build --release ``` -First build takes ~30s. Compiles CUDA kernels (`pegainfer-kernels/csrc/*.cu`) and Triton AOT kernels (`pegainfer-kernels/tools/triton/*.py`). +First build takes ~30s. Compiles CUDA kernels (`openinfer-kernels/csrc/*.cu`) and Triton AOT kernels (`openinfer-kernels/tools/triton/*.py`). ## 4. Run Tests ```bash cargo test -r --workspace --lib # unit tests (~9s) -cargo test -r -p pegainfer-qwen3-4b --test hf_golden_gate # Qwen3-4B logits vs HF golden (~7s, needs GPU + model) +cargo test -r -p openinfer-qwen3-4b --test hf_golden_gate # Qwen3-4B logits vs HF golden (~7s, needs GPU + model) ``` > **Always use `--release`**. Debug builds are extremely slow for GPU code and will timeout. 
@@ -112,14 +112,14 @@ cargo run -r --bin bench_serving -- --model-path models/Qwen3.5-4B request Accuracy tests live in each model crate: ```bash -cargo test -r -p pegainfer-qwen3-4b --test hf_golden_gate # Qwen3-4B logits vs stored HF golden (bf16 tolerance) -cargo test -r -p pegainfer-qwen35-4b --test hf_golden_gate # Qwen3.5-4B logits vs stored HF golden (bf16 tolerance) -cargo test -r -p pegainfer-qwen35-4b --test e2e_scheduler # Qwen3.5-4B scheduler request-flow integration +cargo test -r -p openinfer-qwen3-4b --test hf_golden_gate # Qwen3-4B logits vs stored HF golden (bf16 tolerance) +cargo test -r -p openinfer-qwen35-4b --test hf_golden_gate # Qwen3.5-4B logits vs stored HF golden (bf16 tolerance) +cargo test -r -p openinfer-qwen35-4b --test e2e_scheduler # Qwen3.5-4B scheduler request-flow integration ``` -Qwen3-4B no longer pins exact greedy text: a bit-wise baseline false-positives across GPUs (per-card bf16 GEMM drifts the low bits). `hf_golden_gate` instead teacher-forces a fixed set of sequences and asserts pegainfer's logprobs land within the bf16 noise floor of a stored HuggingFace reference — across bs=1, batched, and the CUDA-graph path. The reasoning and tolerances are in `docs/models/qwen3/accuracy-gate.md`. +Qwen3-4B no longer pins exact greedy text: a bit-wise baseline false-positives across GPUs (per-card bf16 GEMM drifts the low bits). `hf_golden_gate` instead teacher-forces a fixed set of sequences and asserts openinfer's logprobs land within the bf16 noise floor of a stored HuggingFace reference — across bs=1, batched, and the CUDA-graph path. The reasoning and tolerances are in `docs/models/qwen3/accuracy-gate.md`. -Qwen3.5-4B follows the same HF-golden rule with one model-specific caveat: its fixture is produced through HF `use_cache=True` / `past_key_values`, and its replay surface is sequential graph plus bucket-straddling batched graph plus slot compaction because Qwen3.5 does not currently have an eager batched decode path. 
A broader PegaInfer-owned rand/hash corpus is deferred until the project decides how to handle cross-architecture exact-token drift. +Qwen3.5-4B follows the same HF-golden rule with one model-specific caveat: its fixture is produced through HF `use_cache=True` / `past_key_values`, and its replay surface is sequential graph plus bucket-straddling batched graph plus slot compaction because Qwen3.5 does not currently have an eager batched decode path. A broader OpenInfer-owned rand/hash corpus is deferred until the project decides how to handle cross-architecture exact-token drift. ### Regenerating Reference Data diff --git a/docs/playbooks/model-optimization-pipeline.md b/docs/playbooks/model-optimization-pipeline.md index ecfe9b5a..35888973 100644 --- a/docs/playbooks/model-optimization-pipeline.md +++ b/docs/playbooks/model-optimization-pipeline.md @@ -15,7 +15,7 @@ All models use two profiles that isolate the prefill and decode paths: No mixed-workload profiles. Mixed performance can be inferred from the two pure paths, and mixed data is hard to attribute. -pegainfer: `bench_serving request --prompt-len --output-len ` (see [profiling-guide](profiling-guide.md)). vLLM: `vllm bench serve` (see [bench-vs-vllm](bench-vs-vllm.md)). +openinfer: `bench_serving request --prompt-len --output-len ` (see [profiling-guide](profiling-guide.md)). vLLM: `vllm bench serve` (see [bench-vs-vllm](bench-vs-vllm.md)). ## Per-Model Doc Structure @@ -23,14 +23,14 @@ One doc per model under `projects/`, containing three sections: ### 1. E2E Dashboard -Current pegainfer vs vLLM end-to-end numbers. **Update the pegainfer column after each optimization** to always reflect the latest state. The vLLM column only needs to be measured once at baseline — re-measure when upgrading vLLM. +Current openinfer vs vLLM end-to-end numbers. **Update the openinfer column after each optimization** to always reflect the latest state. 
The vLLM column only needs to be measured once at baseline — re-measure when upgrading vLLM. ```markdown ## E2E Dashboard GPU: ..., Model: ..., vLLM version: ..., single concurrency. -| Profile | Metric | pegainfer | vLLM | delta | +| Profile | Metric | openinfer | vLLM | delta | |---------|--------|-----------|------|-------| | prefill-heavy (2048,1) | TTFT median | ... | ... | ... | | prefill-heavy (2048,1) | TTFT p99 | ... | ... | ... | @@ -40,7 +40,7 @@ GPU: ..., Model: ..., vLLM version: ..., single concurrency. ### 2. Model Architecture & Operator Coverage -Expand the model's computation graph first, then annotate pegainfer's support status. +Expand the model's computation graph first, then annotate openinfer's support status. **Architecture summary:** layer count, layer type distribution, key shapes. diff --git a/docs/roadmap/direction.md b/docs/roadmap/direction.md index 98af22b1..410d9fab 100644 --- a/docs/roadmap/direction.md +++ b/docs/roadmap/direction.md @@ -1,6 +1,6 @@ # Direction -> **TL;DR:** pegainfer is not chasing a super abstraction that swallows every model. The next-stage shape is a set of reusable, stable infrastructure plus per-model engines with clear boundaries. Share what is stable; let go of what will truly fork. Optimize for keeping each model's context coherent for both humans and LLMs to read in one pass. +> **TL;DR:** openinfer is not chasing a super abstraction that swallows every model. The next-stage shape is a set of reusable, stable infrastructure plus per-model engines with clear boundaries. Share what is stable; let go of what will truly fork. Optimize for keeping each model's context coherent for both humans and LLMs to read in one pass. > > **Origin:** "One Size Can't Fit All" (2026-05-05). 
@@ -21,8 +21,8 @@ So the working principle is **share the stable parts; allow the fork-prone parts ### Shared infrastructure (stable, cross-model) - **Frontend** — HTTP/OpenAI surface, tokenizer, chat template, stop sequences, streaming, logprobs, usage. Bridged via `vllm-frontend-rs` plus a local engine-core adapter. The frontend should never know what model, parallel strategy, or attention pattern is behind it. -- **Runtime primitives** — `pegainfer-core` owns the generation contract (`ModelForward`, `GenerationState`), sampler, CUDA Graph state, weight loader, page/KV pool primitives. Per-model crates depend on it, not on root. -- **Kernel layer** — `pegainfer-kernels` owns kernel sources, FFI, and wrappers. cuBLAS, FlashInfer, Triton AOT, handwritten CUDA all live here. Open to extension; we don't fork third-party kernels lightly. +- **Runtime primitives** — `openinfer-core` owns the generation contract (`ModelForward`, `GenerationState`), sampler, CUDA Graph state, weight loader, page/KV pool primitives. Per-model crates depend on it, not on root. +- **Kernel layer** — `openinfer-kernels` owns kernel sources, FFI, and wrappers. cuBLAS, FlashInfer, Triton AOT, handwritten CUDA all live here. Open to extension; we don't fork third-party kernels lightly. - **Data plane** — PegaFlow stays the KV data plane (RDMA transfer, SSD offload, prefix dedup), not folded into model internals. - **Tooling** — benchmarks, profiling, eventual tracing/simulator infrastructure. @@ -62,7 +62,7 @@ The closed loop matters: offline expectation (simulator) + online measurement (t ## What this is not -- Not "Rust rewrite of vLLM." vLLM optimizes within Python; SGLang aspires to Rust but hasn't started. pegainfer's structural choice — per-model engines on shared infrastructure — is orthogonal to language. +- Not "Rust rewrite of vLLM." vLLM optimizes within Python; SGLang aspires to Rust but hasn't started. 
openinfer's structural choice — per-model engines on shared infrastructure — is orthogonal to language. - Not "general-purpose LLM serving stack." We are good at specific model families with clear ownership; that is the unit of scaling, not "support every model on every backend." - Not a permanent freeze on shared layers. The boundary moves when a new model genuinely demands it (e.g., MoE all-to-all may pull communication primitives into the shared layer). It does not move because abstraction feels nicer. diff --git a/docs/roadmap/execution.md b/docs/roadmap/execution.md index 4f63dd2d..39348869 100644 --- a/docs/roadmap/execution.md +++ b/docs/roadmap/execution.md @@ -9,7 +9,7 @@ These are the shared layers — frontend, runtime, kernels, ledger/simulator/tra ### In progress - **Model-owned kernel plans.** Qwen3 already carries a light `kernel_plan` mapping prefill/decode/unified phases → Rust wrappers, FFI symbols, and CUDA/Triton/cuBLAS backends. Extend the same shape to Qwen3.5 and DeepSeek V4 so each model crate is self-describing. -- **Frontend polish.** `vllm-frontend-rs` is the default OpenAI surface, talking to pegainfer via a local engine-core IPC bridge. Outstanding: logprobs / prompt-logprobs translation, usage accounting, and a deliberate decision on whether the served-model-id should decouple from the tokenizer path. +- **Frontend polish.** `vllm-frontend-rs` is the default OpenAI surface, talking to openinfer via a local engine-core IPC bridge. Outstanding: logprobs / prompt-logprobs translation, usage accounting, and a deliberate decision on whether the served-model-id should decouple from the tokenizer path. ### Next @@ -23,7 +23,7 @@ These are the shared layers — frontend, runtime, kernels, ledger/simulator/tra ## Models -Each model crate owns its own scheduler, kernels, accuracy story, and benchmarks. The boundary with shared infrastructure is `pegainfer-core` (runtime contract) + `pegainfer-kernels` (op layer). 
+Each model crate owns its own scheduler, kernels, accuracy story, and benchmarks. The boundary with shared infrastructure is `openinfer-core` (runtime contract) + `openinfer-kernels` (op layer). ### DeepSeek V4 diff --git a/docs/subsystems/correctness/logits-golden-gate.md b/docs/subsystems/correctness/logits-golden-gate.md index 7a4bcaaa..eff4a79c 100644 --- a/docs/subsystems/correctness/logits-golden-gate.md +++ b/docs/subsystems/correctness/logits-golden-gate.md @@ -1,6 +1,6 @@ # Numerical correctness: the logits golden gate -**TL;DR**: How to guard that a model's logits stay correct across prompts, hardware, and batch size — *without* binding to one GPU's exact bits. The pattern: store a reference (HuggingFace bf16) of top-K logprobs for fixed teacher-forced sequences, replay them through pegainfer, and assert (a) a structural *regret* check on the argmax and (b) the **mean** and **p99** of the per-token logprob delta stay at the bf16 noise floor. NOT exact text, NOT a hash, NOT bit-identical-across-batch, NOT the absolute max. Qwen3-4B is the reference implementation (`pegainfer-qwen3-4b/tests/hf_golden_gate.rs`, see `models/qwen3/accuracy-gate.md`); Qwen3.5-4B applies the same method with an HF `past_key_values` oracle and graph-only replay (`pegainfer-qwen35-4b/tests/hf_golden_gate.rs`, see `models/qwen35/accuracy.md`). +**TL;DR**: How to guard that a model's logits stay correct across prompts, hardware, and batch size — *without* binding to one GPU's exact bits. The pattern: store a reference (HuggingFace bf16) of top-K logprobs for fixed teacher-forced sequences, replay them through openinfer, and assert (a) a structural *regret* check on the argmax and (b) the **mean** and **p99** of the per-token logprob delta stay at the bf16 noise floor. NOT exact text, NOT a hash, NOT bit-identical-across-batch, NOT the absolute max. 
Qwen3-4B is the reference implementation (`openinfer-qwen3-4b/tests/hf_golden_gate.rs`, see `models/qwen3/accuracy-gate.md`); Qwen3.5-4B applies the same method with an HF `past_key_values` oracle and graph-only replay (`openinfer-qwen35-4b/tests/hf_golden_gate.rs`, see `models/qwen35/accuracy.md`). Last touched: 2026-05 @@ -10,7 +10,7 @@ Last touched: 2026-05 Note what this does *not* say: it does not say "bit-identical". Two facts make bit-identity a false invariant, and conflating either with a real bug is the trap every naive correctness test falls into: -1. **Cross-hardware.** pegainfer's logits come out of bf16 GEMM. Different GPUs (and different kernel/cuBLAS builds) use different tile shapes and accumulation orders, so the low mantissa bits differ by 1–2 ULP. bf16 has a 7-bit mantissa, so 1 ULP at logit magnitude ~16 is ≈0.125 nat — enough to flip an argmax on a near-tie. +1. **Cross-hardware.** openinfer's logits come out of bf16 GEMM. Different GPUs (and different kernel/cuBLAS builds) use different tile shapes and accumulation orders, so the low mantissa bits differ by 1–2 ULP. bf16 has a 7-bit mantissa, so 1 ULP at logit magnitude ~16 is ≈0.125 nat — enough to flip an argmax on a near-tie. 2. **Cross-batch.** The batched decode path is *not* batch-invariant: batch composition changes the order in which partial results are reduced, which drifts logits ~1 ULP the same way. So "batched == sequential, bit-for-bit" is false by construction — an `executor_equivalence`-style test that asserts it is asserting a falsehood and will flake on benign noise. (We have *measured* this batch-dependent drift; we have not isolated which kernel produces it, so the doc attributes it to reduction order, not to a named library.) The correct invariant is therefore *bounded* drift, not *zero* drift. Everything below is about bounding it strictly enough to catch real bugs while absorbing the irreducible bf16 tail. 
@@ -21,13 +21,13 @@ Both are **hardware-bound**: green on the machine that produced them, red everyw ## The four design choices -**1. A reference of equal precision, stored once.** Use HuggingFace as the numerical golden truth, dumped in **bf16** — the same precision regime as pegainfer, so the comparison is apples-to-apples — on GPU with `device_map=auto` so the one script scales to large models. Store top-K logprobs per evaluated position as safetensors (machine-only numeric data, nobody reads it). fp32 is reserved for one-time tie *adjudication*, not for the gate. +**1. A reference of equal precision, stored once.** Use HuggingFace as the numerical golden truth, dumped in **bf16** — the same precision regime as openinfer, so the comparison is apples-to-apples — on GPU with `device_map=auto` so the one script scales to large models. Store top-K logprobs per evaluated position as safetensors (machine-only numeric data, nobody reads it). fp32 is reserved for one-time tie *adjudication*, not for the gate. **2. Teacher-forcing, not free greedy.** Feed both engines the *identical* fixed token sequence (the reference's own prompt + decode tail) and score per position. Free-running greedy lets one argmax flip cascade, making every later position incomparable. Teacher-forcing isolates each position so a disagreement is a real per-position disagreement. -**3. A structural regret check on the argmax — magnitude-independent.** pegainfer's chosen token must be one the reference also ranks near its own best. *Regret* = how far below the reference's argmax (in the reference's own logprobs) pegainfer's pick sits; it must stay ≤ a tie tolerance (Qwen3-4B: 0.20 nat). Where the reference has a clear winner, the only token within tolerance is its argmax, so this enforces exact agreement there; at a genuine bf16 tie the runner-up is within a ULP or two, so a tie-flip is not a failure. 
A pick the reference scores clearly worse — or one absent from its top-K entirely (confidently wrong on a token the reference does not even rank) — is a real wrong-token bug. This regret form is deliberate: a plain margin-gated equality check leaves a hole in the sub-tolerance tie band where a garbage argmax escapes every check. +**3. A structural regret check on the argmax — magnitude-independent.** openinfer's chosen token must be one the reference also ranks near its own best. *Regret* = how far below the reference's argmax (in the reference's own logprobs) openinfer's pick sits; it must stay ≤ a tie tolerance (Qwen3-4B: 0.20 nat). Where the reference has a clear winner, the only token within tolerance is its argmax, so this enforces exact agreement there; at a genuine bf16 tie the runner-up is within a ULP or two, so a tie-flip is not a failure. A pick the reference scores clearly worse — or one absent from its top-K entirely (confidently wrong on a token the reference does not even rank) — is a real wrong-token bug. This regret form is deliberate: a plain margin-gated equality check leaves a hole in the sub-tolerance tie band where a garbage argmax escapes every check. -**4. Mean + p99 of the logprob delta — NOT the absolute max.** On the head tokens, bound `|pegainfer − reference|` two ways: +**4. Mean + p99 of the logprob delta — NOT the absolute max.** On the head tokens, bound `|openinfer − reference|` two ways: - **mean** — trips on *systematic* drift. A uniform logit shift of `d` nat moves every delta by ~`d`, so the mean catches a small global regression that a single-token check would miss. Averaged over thousands of deltas it is hardware-stable yet sensitive. - **p99** — bounds the tail without chasing the single worst token. 
@@ -58,7 +58,7 @@ mean and p99 are flat across every pass; only the absolute max moves: | graph (9 padded) | 153 | 0.034 | 0.13 | 0.44 | | graph (5 padded) | 85 | 0.032 | 0.11 | 0.14 | -The single worst token is the **same** one across bs=1 / eager-9 / graph-9 — a deep-tail token at logprob ≈−10, far below the argmax: the reference is fixed at −10.2508 while pegainfer reads −9.876 at bs=1 and −9.813 in the 9-seq batch. The delta swings 0.37→0.44 purely from the batch-dependent reduction order, with **zero** effect on the argmax. eager-9 and graph-9 are bit-identical, which proves the CUDA-graph path matches eager at the same composition; the only mover is batch composition. This is exactly the benign reduction-order noise the tolerance is built to absorb, and exactly why the max is printed but not asserted. +The single worst token is the **same** one across bs=1 / eager-9 / graph-9 — a deep-tail token at logprob ≈−10, far below the argmax: the reference is fixed at −10.2508 while openinfer reads −9.876 at bs=1 and −9.813 in the 9-seq batch. The delta swings 0.37→0.44 purely from the batch-dependent reduction order, with **zero** effect on the argmax. eager-9 and graph-9 are bit-identical, which proves the CUDA-graph path matches eager at the same composition; the only mover is batch composition. This is exactly the benign reduction-order noise the tolerance is built to absorb, and exactly why the max is printed but not asserted. 
## Applying it to a new model line diff --git a/docs/subsystems/frontend/cpu-profiling-baseline.md b/docs/subsystems/frontend/cpu-profiling-baseline.md index 80191bf8..9939a677 100644 --- a/docs/subsystems/frontend/cpu-profiling-baseline.md +++ b/docs/subsystems/frontend/cpu-profiling-baseline.md @@ -1,8 +1,8 @@ -# Frontend CPU Profiling Baseline (pegainfer-sim) +# Frontend CPU Profiling Baseline (openinfer-sim) **Created**: 2026-06-05 **Last touched**: 2026-06 -**TL;DR**: CPU-side profiling of the vLLM/OpenAI frontend path using `pegainfer-sim` with fixed TTFT=5ms / TPOT=12ms. At 200 req / concurrency=16 / prompt=128 words / output=64 tokens the frontend adds ~150ms TTFT overhead above the 5ms simulated floor and shows no throughput bottleneck (QPS=18.2, 0 failures). Top hotspots: heap allocation (malloc/realloc ~10%), stream polling (~7.5%), clock_gettime (~2%), JSON serialization (~1%). No single frontend bottleneck dominates — the overhead is distributed across tokio runtime, IPC bridge, and HTTP framing. +**TL;DR**: CPU-side profiling of the vLLM/OpenAI frontend path using `openinfer-sim` with fixed TTFT=5ms / TPOT=12ms. At 200 req / concurrency=16 / prompt=128 words / output=64 tokens the frontend adds ~150ms TTFT overhead above the 5ms simulated floor and shows no throughput bottleneck (QPS=18.2, 0 failures). Top hotspots: heap allocation (malloc/realloc ~10%), stream polling (~7.5%), clock_gettime (~2%), JSON serialization (~1%). No single frontend bottleneck dominates — the overhead is distributed across tokio runtime, IPC bridge, and HTTP framing. 
## Reproducible Benchmark @@ -10,15 +10,15 @@ ```bash # Build sim binary (requires protoc) -cargo build --release -p pegainfer-sim +cargo build --release -p openinfer-sim ``` ### Create a tiny local model dir (avoids HF download) ```bash -mkdir -p /tmp/pegainfer-sim-model +mkdir -p /tmp/openinfer-sim-model -cat > /tmp/pegainfer-sim-model/tokenizer.json << 'EOF' +cat > /tmp/openinfer-sim-model/tokenizer.json << 'EOF' { "version": "1.0", "truncation": null, @@ -34,20 +34,20 @@ cat > /tmp/pegainfer-sim-model/tokenizer.json << 'EOF' } EOF -cat > /tmp/pegainfer-sim-model/tokenizer_config.json << 'EOF' +cat > /tmp/openinfer-sim-model/tokenizer_config.json << 'EOF' { "unk_token": "", "tokenizer_class": "PreTrainedTokenizerFast" } EOF -cat > /tmp/pegainfer-sim-model/config.json << 'EOF' -{ "model_type": "pegainfer_sim", "max_position_embeddings": 8192 } +cat > /tmp/openinfer-sim-model/config.json << 'EOF' +{ "model_type": "openinfer_sim", "max_position_embeddings": 8192 } EOF ``` ### Start server ```bash -cargo run --release -p pegainfer-sim -- \ - --model-id /tmp/pegainfer-sim-model \ +cargo run --release -p openinfer-sim -- \ + --model-id /tmp/openinfer-sim-model \ --port 8732 \ --base-ttft-ms 5 \ --tpot-ms 12 \ @@ -60,7 +60,7 @@ cargo run --release -p pegainfer-sim -- \ ```bash python3 scripts/bench_http_serving.py \ --base-url http://127.0.0.1:8732 \ - --model /tmp/pegainfer-sim-model \ + --model /tmp/openinfer-sim-model \ --num-requests 200 \ --concurrency 16 \ --prompt-words 128 \ @@ -75,12 +75,12 @@ In a separate terminal after starting the server and confirming it responds: ```bash # Summary stats (IPC, cache misses, branch mispredictions) -SIM_PID=$(pgrep -f "target/release/pegainfer-sim") +SIM_PID=$(pgrep -f "target/release/openinfer-sim") perf stat -p $SIM_PID \ -e cycles,instructions,cache-references,cache-misses,branch-misses,task-clock,context-switches,cpu-migrations \ -- timeout 15 python3 scripts/bench_http_serving.py \ --base-url http://127.0.0.1:8732 \ - 
--model /tmp/pegainfer-sim-model \ + --model /tmp/openinfer-sim-model \ --num-requests 200 \ --concurrency 16 \ --prompt-words 128 \ @@ -91,7 +91,7 @@ perf stat -p $SIM_PID \ perf record -g -p $SIM_PID -o /tmp/sim-perf.data -- \ timeout 10 python3 scripts/bench_http_serving.py \ --base-url http://127.0.0.1:8732 \ - --model /tmp/pegainfer-sim-model \ + --model /tmp/openinfer-sim-model \ --num-requests 100 \ --concurrency 16 \ --prompt-words 128 \ @@ -176,7 +176,7 @@ Each direction below is tied to the specific measured frontend overhead from the ### 1. Reduce IPC bridge hops for single-engine deployments -**Measured basis**: The data path crosses 5 mpsc channels + 1 ZMQ Unix socket between `run_simulated_request` and `completion_sse_stream` (confirmed by source trace in `pegainfer-vllm-frontend/src/lib.rs` lines 303–847). The `output_loop` serializes all requests through a single `PushSocket`. +**Measured basis**: The data path crosses 5 mpsc channels + 1 ZMQ Unix socket between `run_simulated_request` and `completion_sse_stream` (confirmed by source trace in `openinfer-vllm-frontend/src/lib.rs` lines 303–847). The `output_loop` serializes all requests through a single `PushSocket`. **Direction**: For single-engine (non-distributed) deployments, bypass ZMQ and connect `LocalEngineBridge` directly through an in-process mpsc channel. This would remove the `encode_msgpack` → ZMQ send → ZMQ recv → `decode_msgpack` round-trip and its associated allocation (`rmp_serde::Decoder::any_inner` at 0.5%). 
diff --git a/docs/subsystems/frontend/simulated-inference-engine.md b/docs/subsystems/frontend/simulated-inference-engine.md index 14dce2cc..f417f80a 100644 --- a/docs/subsystems/frontend/simulated-inference-engine.md +++ b/docs/subsystems/frontend/simulated-inference-engine.md @@ -3,18 +3,18 @@ **Created**: 2026-05-16 **Status**: ready for PR review **Last touched**: 2026-05 -**TL;DR**: `pegainfer-sim` is a CPU-only simulated model crate that serves through the existing vLLM/OpenAI frontend with configurable TTFT/TPOT and lightweight HTTP e2e coverage. It is a benchmark and frontend validation harness, not a real-model performance path. +**TL;DR**: `openinfer-sim` is a CPU-only simulated model crate that serves through the existing vLLM/OpenAI frontend with configurable TTFT/TPOT and lightweight HTTP e2e coverage. It is a benchmark and frontend validation harness, not a real-model performance path. ## Scope -Issue #125 needs a server path that can run `vllm bench serve` without GPU or model weights while still exercising the same HTTP frontend used by real pegainfer models. +Issue #125 needs a server path that can run `vllm bench serve` without GPU or model weights while still exercising the same HTTP frontend used by real openinfer models. This PR keeps that boundary narrow: -- Add `pegainfer-engine` for the lightweight `EngineHandle`, `GenerateRequest`, `TokenEvent`, and `SamplingParams` contract. -- Re-export that contract from `pegainfer-core` so existing model crates keep their current imports. -- Move the vLLM bridge into `pegainfer-vllm-frontend`, leaving `pegainfer-server/src/vllm_frontend.rs` as a compatibility re-export. -- Add `pegainfer-sim` as an independently maintained model crate with a thin CLI binary. +- Add `openinfer-engine` for the lightweight `EngineHandle`, `GenerateRequest`, `TokenEvent`, and `SamplingParams` contract. +- Re-export that contract from `openinfer-core` so existing model crates keep their current imports. 
+- Move the vLLM bridge into `openinfer-vllm-frontend`, leaving `openinfer-server/src/vllm_frontend.rs` as a compatibility re-export. +- Add `openinfer-sim` as an independently maintained model crate with a thin CLI binary. Out of scope: @@ -24,7 +24,7 @@ Out of scope: ## Behavior -`pegainfer-sim` exposes CLI knobs for model id, port, max model length, base TTFT, prefill throughput, TPOT, and fallback token id. +`openinfer-sim` exposes CLI knobs for model id, port, max model length, base TTFT, prefill throughput, TPOT, and fallback token id. The timing model is intentionally simple: TTFT is `base_ttft_ms + prompt_len / prefill_tokens_per_ms`, and TPOT is a fixed delay between generated tokens. Output token ids cycle through the prompt tokens, using the fallback id for empty prompts. @@ -32,7 +32,7 @@ The frontend still needs tokenizer/model metadata, but the simulator never loads ## Frontend Metadata Contract -`pegainfer-sim` does not load model weights, but serving it through the +`openinfer-sim` does not load model weights, but serving it through the vLLM/OpenAI frontend still constructs the normal text/chat backend. That frontend path requires enough local model metadata to initialize tokenization and detokenization. @@ -45,9 +45,9 @@ metadata, but no weight files are required. ## Implementation Details -- `pegainfer-engine` owns the shared engine contract, while `pegainfer-core` only re-exports it for existing model crates. -- `pegainfer-vllm-frontend` owns the bridge logic; `pegainfer-server/src/vllm_frontend.rs` stays as a compatibility re-export. -- `pegainfer-sim` is kept as a separate model crate so future simulation changes do not have to live inside the real model crates. +- `openinfer-engine` owns the shared engine contract, while `openinfer-core` only re-exports it for existing model crates. +- `openinfer-vllm-frontend` owns the bridge logic; `openinfer-server/src/vllm_frontend.rs` stays as a compatibility re-export. 
+- `openinfer-sim` is kept as a separate model crate so future simulation changes do not have to live inside the real model crates. ## Future Plans diff --git a/docs/subsystems/kernels/kernel-op-reports.md b/docs/subsystems/kernels/kernel-op-reports.md index 15f4b385..4a2beb5e 100644 --- a/docs/subsystems/kernels/kernel-op-reports.md +++ b/docs/subsystems/kernels/kernel-op-reports.md @@ -3,7 +3,7 @@ **Created**: 2026-05-04 **Last touched**: 2026-05 **Status**: active; prefill/report commit ready, decode tuning deferred -**TL;DR**: `qwen3_kernel_snapshot` is no longer a Cargo bench. Qwen3 has feature-gated `qwen3_kernel_report` per-op kernel tooling and `qwen3_model_report` model-level decode operator reporting. Decode now routes through an eager `BatchDecodeDag`, so the executable forward sequence is also the `KernelCall` contract source for runtime tracing; `qwen3_model_report` disables CUDA Graph, traces that DAG, then joins traced TensorSpecs against measured microbench results to emit by-op, by-call-site, coverage, schedule-preview, latency-stat, and Graphviz DOT reports. Prefill remains covered by `qwen3_kernel_report` stage reports; measured FA2 `CTA_TILE_Q=64` is the Qwen3 production prefill default on the RTX 5090 grid. The model-agnostic timing loop, latency stats, and by-op/by-call-site rollup now live in the `pegainfer-bench` crate, shared by the Kimi-K2 model report; the qwen3 attention-specific regression framework (manifest/snapshot/provenance/CUPTI/cold-L2) deliberately stays in `qwen3_kernel_report`. +**TL;DR**: `qwen3_kernel_snapshot` is no longer a Cargo bench. Qwen3 has feature-gated `qwen3_kernel_report` per-op kernel tooling and `qwen3_model_report` model-level decode operator reporting. 
Decode now routes through an eager `BatchDecodeDag`, so the executable forward sequence is also the `KernelCall` contract source for runtime tracing; `qwen3_model_report` disables CUDA Graph, traces that DAG, then joins traced TensorSpecs against measured microbench results to emit by-op, by-call-site, coverage, schedule-preview, latency-stat, and Graphviz DOT reports. Prefill remains covered by `qwen3_kernel_report` stage reports; measured FA2 `CTA_TILE_Q=64` is the Qwen3 production prefill default on the RTX 5090 grid. The model-agnostic timing loop, latency stats, and by-op/by-call-site rollup now live in the `openinfer-bench` crate, shared by the Kimi-K2 model report; the qwen3 attention-specific regression framework (manifest/snapshot/provenance/CUPTI/cold-L2) deliberately stays in `qwen3_kernel_report`. ## Preparation @@ -11,15 +11,15 @@ - `docs/index.md` - located the active benchmarking, CUPTI, kernel-boundary, and Qwen3 model-crate docs. - `docs/models/qwen3/model-crate.md` - confirmed `qwen3_kernel_snapshot` was the current Qwen3 kernel snapshot runner and already captured warm/cold-L2 latency plus default CUPTI counters. - `docs/conventions/bench-regression.md` - clarified that the existing serving benchmark remains the model-level regression artifact; this task should not mix per-op reports with E2E snapshots. - - `docs/subsystems/kernels/pegainfer-kernels-boundary.md` - confirmed kernels should become first-class measurable assets and model DAG manifests should live with model crates. - - `docs/models/qwen3/kernels-crate.md` - confirmed kernel source/build ownership now lives in `pegainfer-kernels`, while model-owned DAG metadata belongs in the Qwen3 crate. + - `docs/subsystems/kernels/openinfer-kernels-boundary.md` - confirmed kernels should become first-class measurable assets and model DAG manifests should live with model crates. 
+ - `docs/models/qwen3/kernels-crate.md` - confirmed kernel source/build ownership now lives in `openinfer-kernels`, while model-owned DAG metadata belongs in the Qwen3 crate. - `docs/playbooks/profiling-guide.md` - confirmed the diagnostic split between kernel composition/proportions and benchmark-grade latency. - **Relevant history**: - `docs/models/qwen3/model-crate.md` showed the current single-op snapshot already found the low-batch long-context decode-attention bottleneck. - **Plan**: 1. Add direct Qwen3 crate dev-dependencies for generic infrastructure (`clap` derive for CLI and `toml` for manifest parsing) instead of extending the hand-written parser. 2. Add a model-local TOML manifest for Qwen3-4B kernel reports, initially covering only op names, phases, shape sweeps, and variants. - 3. Replace `crates/pegainfer-qwen3-4b/benches/qwen3_kernel_snapshot.rs` with a manifest-driven `qwen3_kernel_report` bin; do not keep a bench wrapper. + 3. Replace `crates/openinfer-qwen3-4b/benches/qwen3_kernel_snapshot.rs` with a manifest-driven `qwen3_kernel_report` bin; do not keep a bench wrapper. 4. Add a composition command that reads per-op case results and emits a decode phase report by joining the manifest's op repeat rules with measured per-op reports. 5. Run formatting and the strongest local compile checks available; GPU execution may still require the CUDA validation host because this machine lacks local CUDA tooling. - **Risks / open questions**: @@ -29,13 +29,13 @@ ## Execution Log ### Step 1: Move from bench target to bin -- Removed the `qwen3_kernel_snapshot` bench target from `crates/pegainfer-qwen3-4b/Cargo.toml`. -- Moved the report runner to `crates/pegainfer-qwen3-4b/src/bin/qwen3_kernel_report.rs`. -- Added a `kernel-report` feature for generic tool dependencies (`clap`, `toml`, `sha2`, `hex`) and `pegainfer-cupti`; the bin requires that feature so normal Qwen3 library/server builds do not pull CUPTI into the default dependency graph. 
+- Removed the `qwen3_kernel_snapshot` bench target from `crates/openinfer-qwen3-4b/Cargo.toml`. +- Moved the report runner to `crates/openinfer-qwen3-4b/src/bin/qwen3_kernel_report.rs`. +- Added a `kernel-report` feature for generic tool dependencies (`clap`, `toml`, `sha2`, `hex`) and `openinfer-cupti`; the bin requires that feature so normal Qwen3 library/server builds do not pull CUPTI into the default dependency graph. - Removed the temporary `cargo bench` compatibility argument handling after the tool became a normal binary. ### Step 2: Add model-local manifest -- Added `crates/pegainfer-qwen3-4b/kernel_manifests/qwen3-4b.toml`. +- Added `crates/openinfer-qwen3-4b/kernel_manifests/qwen3-4b.toml`. - The first manifest now stays deliberately thin: `model`, `[[ops]]`, `phase`, per-op shape sweep fields, and variant labels. Provider-owned facts such as dtype, head counts, head dimension, page size, thresholds, and composition policy stay in Rust. ### Step 3: Refactor report schema and commands @@ -46,21 +46,21 @@ ### Step 4: Local validation - `cargo fmt --all --check` passed. - `cargo metadata --no-deps --format-version 1` passed. -- Local `cargo check --release -p pegainfer-qwen3-4b --bench qwen3_kernel_snapshot` previously failed before Rust type checking because this local host lacks a usable `nvcc`; GPU validation moved to the CUDA validation host. +- Local `cargo check --release -p openinfer-qwen3-4b --bench qwen3_kernel_snapshot` previously failed before Rust type checking because this local host lacks a usable `nvcc`; GPU validation moved to the CUDA validation host. ### Step 5: GPU minimal validation - Rebuilt the disposable validation worktree at `` from local `HEAD` commit `612850f`, then rsynced the current working tree changes over it. -- Copied initialized FlashInfer headers from `/third_party/flashinfer` into the clean worktree's `crates/pegainfer-kernels/third_party/flashinfer` directory. 
-- `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. -- `PEGAINFER_CUDA_SM=120 cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --no-cupti --iters 1 --contexts 1024 --batch-sizes 1 --variants non_partition --out $RESULT_ROOT/qwen3_kernel_op_report_min.json` passed. -- `PEGAINFER_CUDA_SM=120 cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compare --base $RESULT_ROOT/qwen3_kernel_op_report_min.json --new $RESULT_ROOT/qwen3_kernel_op_report_min.json` passed with `warnings=0 failures=0`. -- `PEGAINFER_CUDA_SM=120 cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compose --input $RESULT_ROOT/qwen3_kernel_op_report_min.json --batch-size 1 --context 1024 --out $RESULT_ROOT/qwen3_kernel_composition_min.json` passed. +- Copied initialized FlashInfer headers from `/third_party/flashinfer` into the clean worktree's `crates/openinfer-kernels/third_party/flashinfer` directory. +- `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. +- `OPENINFER_CUDA_SM=120 cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --no-cupti --iters 1 --contexts 1024 --batch-sizes 1 --variants non_partition --out $RESULT_ROOT/qwen3_kernel_op_report_min.json` passed. +- `OPENINFER_CUDA_SM=120 cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compare --base $RESULT_ROOT/qwen3_kernel_op_report_min.json --new $RESULT_ROOT/qwen3_kernel_op_report_min.json` passed with `warnings=0 failures=0`. 
+- `OPENINFER_CUDA_SM=120 cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compose --input $RESULT_ROOT/qwen3_kernel_op_report_min.json --batch-size 1 --context 1024 --out $RESULT_ROOT/qwen3_kernel_composition_min.json` passed. - CUPTI minimal validation passed with `non_partition,split_kv_256x64` at `bs=1,ctx=1024`; the report contained 2 cases, 1 selection, CUPTI metrics for both cases, and selected `split_kv_256x64`. -- Default package build without the report feature also passed: `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b`. +- Default package build without the report feature also passed: `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b`. ### Step 6: Full GPU manifest run - Ran the full manifest command on the validation worktree: - - `PEGAINFER_CUDA_SM=120 time cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --out $RESULT_ROOT/qwen3_kernel_report_full.json` + - `OPENINFER_CUDA_SM=120 time cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --out $RESULT_ROOT/qwen3_kernel_report_full.json` - Result: - `126` cases: `6` batch sizes x `7` context lengths x `3` variants. - `42` selections. @@ -69,7 +69,7 @@ - Runtime: `2:42.83 elapsed`. - Manifest hash: `62aada084b61795862c5d4dd23fa89d1`. - Self-compare passed: - - `PEGAINFER_CUDA_SM=120 cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compare --base $RESULT_ROOT/qwen3_kernel_report_full.json --new $RESULT_ROOT/qwen3_kernel_report_full.json` + - `OPENINFER_CUDA_SM=120 cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compare --base $RESULT_ROOT/qwen3_kernel_report_full.json --new $RESULT_ROOT/qwen3_kernel_report_full.json` - Output: `kernel report compare complete: warnings=0 failures=0`. 
- Representative selections: - `bs=1,ctx=1024`: `split_kv_256x64`. @@ -82,7 +82,7 @@ - `non_partition`: `15`. - `split_kv_512x64`: `7`. - Composed the full report for `bs=1,ctx=4096`: - - `PEGAINFER_CUDA_SM=120 cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compose --input $RESULT_ROOT/qwen3_kernel_report_full.json --batch-size 1 --context 4096 --out $RESULT_ROOT/qwen3_kernel_composition_full_bs1_ctx4096.json` + - `OPENINFER_CUDA_SM=120 cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compose --input $RESULT_ROOT/qwen3_kernel_report_full.json --batch-size 1 --context 4096 --out $RESULT_ROOT/qwen3_kernel_composition_full_bs1_ctx4096.json` - Output total: cold-L2 `958.473us`, `split_kv_256x64` repeated across 36 layers. - Coverage note still applies: only `paged_decode_attention` is included; linear, MLP, norm, embedding, and sampling are not covered yet. - Preserved the generated JSONs under: @@ -119,8 +119,8 @@ - removed `default_attention_kernel_specs`; - removed the old multi-launch `measure_decode_only` helper and `INNER_LAUNCHES`. - GPU validation in ``: - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b` passed. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b` passed. - Decode report, `bs=1,ctx=1024`, no CUPTI, `iters=3`: `2` cases, `0` errors, selected `split_kv_256x64`; measured `non_partition=45.739us`, `split_kv_256x64=20.480us`. - Prefill report, `bs=1,seq=128`, no CUPTI, `iters=3`: `1` case, `0` errors, `24.917us`. - Prefill report, `bs=1,seq=1024`, no CUPTI, `iters=3`: `1` case, `0` errors, `142.325us`. 
@@ -133,8 +133,8 @@ - Removed the compare-time DRAM read-amplification gate. `compare` now gates only `latency_us`; metric interpretation is intentionally outside the runner. - Bumped op report schema to `4` and composition report schema to `3`. - GPU validation: - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b` passed. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b` passed. - Decode raw-CUPTI validation: `2` cases, `0` errors, schema `4`; `case.cupti` keys were exactly the configured CUPTI metric names. - Prefill raw-CUPTI validation: `1` case, `0` errors, schema `4`; `case.cupti` keys were exactly the configured CUPTI metric names. - Decode and prefill self-compare passed with `warnings=0 failures=0`. @@ -143,18 +143,18 @@ ### Step 11: Full raw-CUPTI cold-L2 manifest run - Preserved full-run JSONs under ``. - Decode full command: - - `PEGAINFER_CUDA_SM=120 time cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --out ` + - `OPENINFER_CUDA_SM=120 time cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --out ` - Result: schema `4`, `126` cases, `42` selections, `0` errors, `126` CUPTI cases, `128` measured iterations, elapsed `2:11.77`. - Selection counts: `split_kv_256x64=22`, `non_partition=13`, `split_kv_512x64=7`. - `case.cupti` contains exactly the configured CUPTI metric names. Cases and selections do not contain `diagnosis`. 
- Prefill full command: - - `PEGAINFER_CUDA_SM=120 time cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --op paged_prefill_attention --out ` + - `OPENINFER_CUDA_SM=120 time cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- run --op paged_prefill_attention --out ` - Result: schema `4`, `7` cases, `7` selections, `0` errors, `7` CUPTI cases, `128` measured iterations, elapsed `0:07.43`. - Latency by `seq_len`: `128=24.687us`, `512=53.467us`, `1024=143.462us`, `2048=318.688us`, `4096=911.097us`, `8192=3015.025us`, `10000=4316.861us`. - `case.cupti` contains exactly the configured CUPTI metric names. Cases and selections do not contain `diagnosis`. - Both decode and prefill full JSONs passed self-compare with `warnings=0 failures=0`. - Decode composition command: - - `PEGAINFER_CUDA_SM=120 cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compose --input --batch-size 1 --context 4096 --out ` + - `OPENINFER_CUDA_SM=120 cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report -- compose --input --batch-size 1 --context 4096 --out ` - Result: schema `3`, no `diagnosis`, total decode-attention-only contribution `958.527us`. ### Step 12: Split prefill stages @@ -166,8 +166,8 @@ - Added the three stage ops to `kernel_manifests/qwen3-4b.toml`; each currently covers `batch_size=[1]` and the same `seq_len` grid as the full prefill report. - Preserved stage JSONs under ``. - GPU validation: - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b` passed. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. 
+ - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b` passed. - Full stage reports passed self-compare with `warnings=0 failures=0`. - Stage latency by `seq_len`: - `128`: full `24.632us`, QK+RoPE `8.181us`, KV scatter `5.418us`, attention core `13.148us`. @@ -202,7 +202,7 @@ - Local `cargo fmt --all --check` passed. - Local `cargo metadata --no-deps --format-version 1` passed. - Local `git diff --check` passed. - - CUDA host `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b` passed. + - CUDA host `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b` passed. ### Step 14: Add direct tensor-path CUPTI metrics - Investigated the RTX 5090 metric catalog with `/usr/local/cuda/bin/ncu --query-metrics-mode all --query-metrics --devices 0`. Non-interactive shells did not have `ncu` in `PATH`, but the binary exists under `/usr/local/cuda/bin/ncu`. @@ -242,10 +242,10 @@ ### Step 16: Inspect local FlashInfer prefill implementation - Initialized the local FlashInfer submodule: - - `git submodule update --init crates/pegainfer-kernels/third_party/flashinfer` + - `git submodule update --init crates/openinfer-kernels/third_party/flashinfer` - Checked out `779c24d1c9e6fcc51aa2359884696fbf4ac69b3b`. -- Confirmed the current PegaInfer wrapper calls FlashInfer's FA2 paged prefill path: - - `crates/pegainfer-kernels/csrc/paged_attention.cu` computes `cta_tile_q = FA2DetermineCtaTileQ(packed_qo_len, head_dim)` and dispatches `BatchPrefillWithPagedKVCacheDispatched`. +- Confirmed the current OpenInfer wrapper calls FlashInfer's FA2 paged prefill path: + - `crates/openinfer-kernels/csrc/paged_attention.cu` computes `cta_tile_q = FA2DetermineCtaTileQ(packed_qo_len, head_dim)` and dispatches `BatchPrefillWithPagedKVCacheDispatched`. - For Qwen3 `seq_len=10000`, `packed_qo_len = seq_len * (num_qo_heads / num_kv_heads) = 40000`, and FlashInfer's `FA2DetermineCtaTileQ` selects `CTA_TILE_Q=128`. 
- Rust prefill planning also calls `batch_prefill_cta_tile_q`, so any `CTA_TILE_Q` override must be plumbed into both the plan metadata and the kernel launch. - Matched the NCU kernel traits to FlashInfer source: @@ -253,7 +253,7 @@ - `NUM_MMA_KV` is not a free runtime flag in the current wrapper. FlashInfer derives it from register and shared-memory limits with `DISPATCH_NUM_MMA_KV(min(max_num_mma_kv_smem, max_num_mma_kv_reg), ...)`. - Checked backend alternatives in the same local source: - FlashInfer's Python `backend="cutlass"` path explicitly rejects SM12x/RTX 5090 and says to use `backend='fa2'`. - - `trtllm_fmha_v2_prefill` has SM120 support and supports `Q_PAGED_KV_HND`/`Q_PAGED_KV_NHD`, BF16, GQA, causal masks, and head dim 128, but it is exposed through FlashInfer's JIT/TVM FFI path (`fmha_v2_jit_binding.cu` + generated sources), not through the header-only FA2 path currently compiled by `pegainfer-kernels`. + - `trtllm_fmha_v2_prefill` has SM120 support and supports `Q_PAGED_KV_HND`/`Q_PAGED_KV_NHD`, BF16, GQA, causal masks, and head dim 128, but it is exposed through FlashInfer's JIT/TVM FFI path (`fmha_v2_jit_binding.cu` + generated sources), not through the header-only FA2 path currently compiled by `openinfer-kernels`. - Tuning conclusion: - First experiment: add report-only `prefill_attention_core` variants for `fa2_cta128`, `fa2_cta64`, and `fa2_cta16`, keeping plan metadata and launch dispatch consistent, then run the 10k CUPTI/NCU comparison. - Second experiment, only after the tile-Q sweep: consider a more invasive FMHAv2/SM120 integration or a FlashInfer Python-side benchmark to determine whether that backend beats FA2 for Qwen3's `Q_PAGED_KV` BF16 shape before wiring it into Rust/CUDA. @@ -266,7 +266,7 @@ - `block_tables` shaped `[batch_size, max_num_pages_per_seq]` for paged KV. - `Q` shaped `[total_tokens, num_qo_heads, head_dim]`. 
- Paged KV shaped `[pages, 2, num_kv_heads, page_size, head_dim]` for HND or `[pages, 2, page_size, num_kv_heads, head_dim]` for NHD. -- The wrapper currently transposes `Q_PAGED_KV_NHD` to HND with `.transpose(-3, -2).contiguous()`. PegaInfer's KV pool is page-first NHD with separate K/V offsets across layers, so the zero-copy route would need either an HND view/storage path or a lower-level wrapper that bypasses the Python NHD transpose. +- The wrapper currently transposes `Q_PAGED_KV_NHD` to HND with `.transpose(-3, -2).contiguous()`. OpenInfer's KV pool is page-first NHD with separate K/V offsets across layers, so the zero-copy route would need either an HND view/storage path or a lower-level wrapper that bypasses the Python NHD transpose. - Under the hood, FlashInfer generates FMHA v2 sources through `gen_fmha_v2_module(...)`, compiles generated kernels plus `fmha_v2_run.cu`, and exports `run` through TVM FFI (`fmha_v2_jit_binding.cu`). Direct Rust/CUDA integration is therefore not a drop-in header include like the current FA2 path. - Practical use options: - Lowest risk: benchmark it through FlashInfer's Python API on RTX 5090 with Qwen3-equivalent tensors to decide whether it beats FA2 at `seq_len=10000`. @@ -297,14 +297,14 @@ - Current contiguous single FA2 `bs=1,seq=10000`: `3953.402us`. - Current paged FA2 `bs=2,seq=10000`: `7483.966us` total, `3741.983us` per request. - Local-source FMHA v2 HND is `1.587x` slower than current paged FA2 at `bs=1`; its `bs=2` total is `1.540x` slower than current paged FA2 `bs=2`. - - NHD and HND are effectively the same latency in this direct function test (`1.003x` ratio), but the Python NHD path includes a contiguous transpose before the kernel and is therefore not a zero-copy integration path for PegaInfer. 
+ - NHD and HND are effectively the same latency in this direct function test (`1.003x` ratio), but the Python NHD path includes a contiguous transpose before the kernel and is therefore not a zero-copy integration path for OpenInfer. - Also tested the official package public wrapper: - Report: `` - `backend="fa2"`, `kv_layout="NHD"`, `bs=1`: median `4095.216us`, close to our current FA2 path. - `backend="trtllm-gen"` initially hit a Python wrapper bug when passing `max_token_per_sequence` / `max_sequence_kv`: `UnboundLocalError: qo_indptr_host`. - Retrying without those max arguments reached the underlying TRT-LLM runner and failed for all tested `trtllm-gen` cases with `Unsupported architecture` on RTX 5090: ``. -- Conclusion: there is no measured reason to wire `trtllm_fmha_v2_prefill` into PegaInfer for the Qwen3 10k BF16 prefill attention core on RTX 5090. The current FA2 path is materially faster, and the official package's public `trtllm-gen` wrapper is not usable on this GPU through the tested release package. +- Conclusion: there is no measured reason to wire `trtllm_fmha_v2_prefill` into OpenInfer for the Qwen3 10k BF16 prefill attention core on RTX 5090. The current FA2 path is materially faster, and the official package's public `trtllm-gen` wrapper is not usable on this GPU through the tested release package. ### Step 19: Tune FA2 prefill CTA tile Q - Added report-only FA2 prefill variants for `CTA_TILE_Q`: @@ -313,7 +313,7 @@ - Implementation details: - Added `batch_prefill_paged_cuda_with_cta_tile_q` and matching plan helpers so launch dispatch and `request_indices` / `qo_tile_indices` metadata use the same tile size. - Kept the original C ABI functions as auto-heuristic wrappers. - - Exposed `PrefillPagedPlan::new_with_cta_tile_q` / `new_batch_with_cta_tile_q` through `pegainfer-core`. + - Exposed `PrefillPagedPlan::new_with_cta_tile_q` / `new_batch_with_cta_tile_q` through `openinfer-core`. 
- Switched Qwen3 production prefill planning to model-local `PREFILL_ATTENTION_CTA_TILE_Q = 64`; the global FlashInfer heuristic is unchanged. - Preserved tile-sweep JSONs under: - `` @@ -348,9 +348,9 @@ - Attention core: default `7542.625us` total vs `cta_q64=7403.535us`, about `1.9%` faster. - Full paged prefill op: default `8241.567us` total vs `cta_q64=8113.489us`, about `1.6%` faster. - GPU validation: - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. - - `PEGAINFER_CUDA_SM=120 cargo build --release -p pegainfer-qwen3-4b` passed. - - `PEGAINFER_CUDA_SM=120 cargo test --release -p pegainfer-qwen3-4b` ran, but the existing `batch_decode::tests::batch_matches_sequential` test failed before exercising this change because the validation worktree has no model weights at the default model path (`No such file or directory` from `Qwen3Model::from_safetensors_with_runtime`). The earlier release builds and report runs are the validation for this kernel-level change. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_kernel_report` passed. + - `OPENINFER_CUDA_SM=120 cargo build --release -p openinfer-qwen3-4b` passed. + - `OPENINFER_CUDA_SM=120 cargo test --release -p openinfer-qwen3-4b` ran, but the existing `batch_decode::tests::batch_matches_sequential` test failed before exercising this change because the validation worktree has no model weights at the default model path (`No such file or directory` from `Qwen3Model::from_safetensors_with_runtime`). The earlier release builds and report runs are the validation for this kernel-level change. ### Step 20: Commit validation - Fixed clippy cleanup found during commit prep: @@ -358,33 +358,33 @@ - `qwen3_kernel_report` no longer uses `Option::map(...).unwrap_or_else(...)` or a redundant selector-key clone. 
- Local checks passed: - `cargo fmt --all --check` - - `cargo metadata --no-deps --format-version 1 >$RESULT_ROOT/pegainfer_metadata.json` + - `cargo metadata --no-deps --format-version 1 >$RESULT_ROOT/openinfer_metadata.json` - `git diff --check` - GPU release clippy passed on the synced validation worktree: - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p pegainfer-kernels --all-targets -- -D warnings` - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p pegainfer-core --all-targets -- -D warnings` - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p pegainfer-qwen3-4b --features kernel-report --all-targets -- -D warnings` - - `PEGAINFER_CUDA_SM=120 PEGAINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p pegainfer --bin pegainfer -- -D warnings` + - `OPENINFER_CUDA_SM=120 OPENINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p openinfer-kernels --all-targets -- -D warnings` + - `OPENINFER_CUDA_SM=120 OPENINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p openinfer-core --all-targets -- -D warnings` + - `OPENINFER_CUDA_SM=120 OPENINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p openinfer-qwen3-4b --features kernel-report --all-targets -- -D warnings` + - `OPENINFER_CUDA_SM=120 OPENINFER_TRITON_PYTHON=/bin/python cargo clippy --release -p openinfer --bin openinfer -- -D warnings` ### Step 21: Add runtime-traced Qwen3 model operator report -- Added non-enum tensor metadata vocabulary under `pegainfer-kernels/src/tensor.rs`: +- Added non-enum tensor metadata vocabulary under `openinfer-kernels/src/tensor.rs`: - marker traits for dtype, layout, and axis tags; - erased `TensorSpec`, `TensorArg`, `AttrSpec`, and `KernelCall` for schedules, reports, and future instrumentation. -- Added `pegainfer-core::ops::call_spec` builders so op TensorSpec construction lives next to op wrappers, not in the model-report CLI. 
-- Added `pegainfer-core::ops::call_trace` and traced op wrappers behind the `kernel-call-trace` feature. Normal Qwen3 builds re-export the direct kernel ops and compile out trace labels/recording. The `pegainfer-qwen3-4b` `kernel-report` feature enables `kernel-call-trace`. +- Added `openinfer-core::ops::call_spec` builders so op TensorSpec construction lives next to op wrappers, not in the model-report CLI. +- Added `openinfer-core::ops::call_trace` and traced op wrappers behind the `kernel-call-trace` feature. Normal Qwen3 builds re-export the direct kernel ops and compile out trace labels/recording. The `openinfer-qwen3-4b` `kernel-report` feature enables `kernel-call-trace`. - Wired Qwen3 batch decode labels into the actual `batch_decode` path through feature-gated macros, so label formatting and trace collection disappear when the feature is off. The trace path forces CUDA Graph off before recording. - Added `qwen3_model_report`, a feature-gated per-model CLI: - - `cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` + - `cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` - The CLI loads `models/Qwen3-4B` by default, creates KV state for the requested decode context, runs one real `batch_decode` trace, microbenches the traced KernelCalls, and emits JSON under `target/model_reports/qwen3-4b/`. - Missing bench providers fail loudly with the missing `op`, `label`, and TensorSpec; no estimated or nearest-neighbor rows are used. - Validation: - `cargo fmt --all --check` passed. - `git diff --check` passed. - - `cargo check --release -p pegainfer-qwen3-4b --lib` passed, confirming trace wrappers are not required for the normal library path. - - `cargo check --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report` passed. 
- - `cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` passed and wrote `target/model_reports/qwen3-4b/decode-bs16-kv2048.json`. + - `cargo check --release -p openinfer-qwen3-4b --lib` passed, confirming trace wrappers are not required for the normal library path. + - `cargo check --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report` passed. + - `cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` passed and wrote `target/model_reports/qwen3-4b/decode-bs16-kv2048.json`. - JSON audit confirmed `model/phase/config/schedule/by_op/by_call_site/coverage` are present, `schedule_source` is `runtime trace: Qwen3Model::batch_decode with CUDA Graph disabled`, and paged KV is represented as `bf16[page, layer, kv, pos_in_page, kv_head, head_dim] layout=paged_kv_page_first`. - - `cargo test --release -p pegainfer-qwen3-4b --lib` passed: `7 passed`. + - `cargo test --release -p openinfer-qwen3-4b --lib` passed: `7 passed`. ### Step 22: Add latency stats, readable tables, and DOT output - `qwen3_model_report` now records latency samples per unique traced `KernelCall` instead of a single mean. JSON schema `2` includes `mean_us`, sample `stddev_us`, `min_us`, `p50_us`, `p95_us`, `p99_us`, and `max_us`. @@ -394,21 +394,21 @@ - `target/model_reports/qwen3-4b/decode-bs16-kv2048.json` - `target/model_reports/qwen3-4b/decode-bs16-kv2048.dot` - Validation: - - `cargo check --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report` passed. - - `cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` passed and wrote JSON plus DOT. + - `cargo check --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report` passed. 
+ - `cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` passed and wrote JSON plus DOT. - `dot -Tsvg target/model_reports/qwen3-4b/decode-bs16-kv2048.dot -o $RESULT_ROOT/qwen3-model-report.svg` passed. - JSON audit confirmed schema `2`, default `iters=32`, and `stddev_us` / `p99_us` fields in by-op rows. - - `cargo fmt --all --check`, `git diff --check`, and `cargo check --release -p pegainfer-qwen3-4b --lib` passed. + - `cargo fmt --all --check`, `git diff --check`, and `cargo check --release -p openinfer-qwen3-4b --lib` passed. ### Step 23: Make decode tracing come from an eager DAG builder -- Added `pegainfer-qwen3-4b/src/batch_decode_dag.rs`. +- Added `openinfer-qwen3-4b/src/batch_decode_dag.rs`. - `BatchDecodeDag` is eager: each method records the op's `KernelCall` contract when `kernel-call-trace` is active and immediately launches the production kernel. The batch decode forward path now calls DAG methods instead of wrapping free-form op calls with ad hoc trace labels. - Normal builds still compile out label construction through `dag_label!`; the `kernel-report` feature enables labels and `KernelCall` recording. - `all_reduce_hidden` now has an untraced internal path so the DAG builder owns decode all-reduce trace records without double-counting. - Validation: - - `cargo check --release -p pegainfer-qwen3-4b --lib` passed. - - `cargo check --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report` passed. - - `cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` passed; by-op counts remained `gemm=109`, `gemm_rows=108`, `paged_decode_attention=36`, `all_reduce_hidden=72`. + - `cargo check --release -p openinfer-qwen3-4b --lib` passed. + - `cargo check --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report` passed. 
+ - `cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` passed; by-op counts remained `gemm=109`, `gemm_rows=108`, `paged_decode_attention=36`, `all_reduce_hidden=72`. ### Step 24: Check eager-DAG decode runtime impact - Ran current worktree decode-heavy request benchmark: @@ -418,7 +418,7 @@ - `target/release/qwen3_decode_context --model-path models/Qwen3-4B --contexts 1024 --iters 20` - Result: `p50=11.3781ms`, `avg=11.6660ms`. - Built a detached baseline worktree at `HEAD=3ffe745` and ran the same fixed-context decode probe on the same GPU/model: - - `PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python cargo run --release -p pegainfer-qwen3-4b --bin qwen3_decode_context --manifest-path $RESULT_ROOT/pegainfer-bench-baseline/Cargo.toml -- --model-path $LOCAL_PEGAINFER_DIR/models/Qwen3-4B --contexts 1024 --iters 20` + - `OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python cargo run --release -p openinfer-qwen3-4b --bin qwen3_decode_context --manifest-path $RESULT_ROOT/openinfer-bench-baseline/Cargo.toml -- --model-path $LOCAL_OPENINFER_DIR/models/Qwen3-4B --contexts 1024 --iters 20` - Result: `p50=11.3610ms`, `avg=11.6449ms`. - Interpretation: eager DAG is not measurable as a decode overhead in this probe. Current worktree vs same-machine `HEAD` baseline is `+0.0171ms` p50 (`+0.15%`), well under the benchmark-regression `2%` TPOT threshold. - The standard `bench_serving snapshot --warmup 5 --iters 20` could not complete because `prefill-heavy (10000,1)` hit CUDA OOM on this run. The existing tracked `bench_snapshots/rtx-5090/qwen3-4b.json` was restored from backup after the failed snapshot attempt. 
@@ -426,17 +426,17 @@ ### Step 25: PR readiness cleanup - Cleaned release clippy findings surfaced by the model-report work: - - `pegainfer-kernels::tensor::KernelCall` builder methods are now `#[must_use]`, and attrs are explicitly string-valued at the erased report boundary. - - `pegainfer-kernels/build.rs` and `pegainfer-core/src/cpu_topology.rs` no longer trip release clippy on `map(...).unwrap_or_else(...)`, raw pointer borrows, or redundant closures. + - `openinfer-kernels::tensor::KernelCall` builder methods are now `#[must_use]`, and attrs are explicitly string-valued at the erased report boundary. + - `openinfer-kernels/build.rs` and `openinfer-core/src/cpu_topology.rs` no longer trip release clippy on `map(...).unwrap_or_else(...)`, raw pointer borrows, or redundant closures. - Qwen3 e2e/regen tests and `qwen3_model_report` text/DOT rendering now pass the all-targets clippy style gates. - Local checks passed: - `cargo fmt --all --check` - `git diff --check` - - `cargo clippy --release -p pegainfer-qwen3-4b --features kernel-report --all-targets -- -D warnings` - - `cargo test --release -p pegainfer-qwen3-4b --lib` - - `cargo run --release -p pegainfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` + - `cargo clippy --release -p openinfer-qwen3-4b --features kernel-report --all-targets -- -D warnings` + - `cargo test --release -p openinfer-qwen3-4b --lib` + - `cargo run --release -p openinfer-qwen3-4b --features kernel-report --bin qwen3_model_report -- decode --batch-size 16 --kv-len 2048 --format text` - Qwen3-4B e2e was run with an absolute model path: - - `PEGAINFER_TEST_MODEL_PATH=$LOCAL_PEGAINFER_DIR/models/Qwen3-4B cargo test --release -p pegainfer-qwen3-4b --test e2e -- --nocapture` + - `OPENINFER_TEST_MODEL_PATH=$LOCAL_OPENINFER_DIR/models/Qwen3-4B cargo test --release -p openinfer-qwen3-4b --test e2e -- --nocapture` - Current worktree produced greedy-output mismatches on repeated 
runs. - A detached baseline worktree at `HEAD=3ffe745` reproduced the same Kanye prompt mismatch, so this e2e failure is not introduced by the eager-DAG/model-report changes. @@ -468,4 +468,4 @@ - **Follow-ups**: - Add the next provider for decode linear/MLP floor so composition reports explain more than decode attention. - Extend prefill tile-Q measurements to more batch sizes and model families before making a global kernel default. - - **Done (2026-05):** the model-agnostic layer was extracted into the `pegainfer-bench` crate — first the timing loop / latency stats / `KernelCall` accessors, then the by-op/by-call-site rollup (`RollupRow`, `CallSiteRow`, the accumulator, and row projection). Both qwen3 and Kimi-K2 model reports reuse it; row field order is preserved so the report JSON is unchanged. The *regression* framework (TOML manifest, `KernelSnapshot`, git/hardware/build provenance, `RegressionThresholds`, compare/compose, CUPTI, cold-L2) was evaluated and deliberately **kept** in `qwen3_kernel_report`: its schema is attention-domain-specific (`CaseShape` carries head/page dims, `CaseParams` carries chunk/`cta_tile_q`) with no second consumer, so sharing it would be premature abstraction. When a second model needs kernel-level regression gating, lift the generic primitives then — not before. + - **Done (2026-05):** the model-agnostic layer was extracted into the `openinfer-bench` crate — first the timing loop / latency stats / `KernelCall` accessors, then the by-op/by-call-site rollup (`RollupRow`, `CallSiteRow`, the accumulator, and row projection). Both qwen3 and Kimi-K2 model reports reuse it; row field order is preserved so the report JSON is unchanged. 
The *regression* framework (TOML manifest, `KernelSnapshot`, git/hardware/build provenance, `RegressionThresholds`, compare/compose, CUPTI, cold-L2) was evaluated and deliberately **kept** in `qwen3_kernel_report`: its schema is attention-domain-specific (`CaseShape` carries head/page dims, `CaseParams` carries chunk/`cta_tile_q`) with no second consumer, so sharing it would be premature abstraction. When a second model needs kernel-level regression gating, lift the generic primitives then — not before. diff --git a/docs/subsystems/kernels/pegainfer-kernels-boundary.md b/docs/subsystems/kernels/openinfer-kernels-boundary.md similarity index 95% rename from docs/subsystems/kernels/pegainfer-kernels-boundary.md rename to docs/subsystems/kernels/openinfer-kernels-boundary.md index ad73e67d..e645473a 100644 --- a/docs/subsystems/kernels/pegainfer-kernels-boundary.md +++ b/docs/subsystems/kernels/openinfer-kernels-boundary.md @@ -1,8 +1,8 @@ -# PegaInfer Per-Model Engine And Kernel Boundary +# OpenInfer Per-Model Engine And Kernel Boundary **Created**: 2026-05-03 **Status**: complete -**TL;DR**: pegainfer should evolve as reusable frontend/data-plane infrastructure plus per-model engines, not as one universal model abstraction. The first concrete step is extracting a kernels crate; kernels then become first-class assets through an index, ledger, simulator, and request tracing. PegaFlow remains the KV data plane instead of being folded into model internals. +**TL;DR**: openinfer should evolve as reusable frontend/data-plane infrastructure plus per-model engines, not as one universal model abstraction. The first concrete step is extracting a kernels crate; kernels then become first-class assets through an index, ledger, simulator, and request tracing. PegaFlow remains the KV data plane instead of being folded into model internals. 
## Preparation @@ -24,7 +24,7 @@ ## Decision -PegaInfer should not become a single deep abstraction that forces dense full-attention models, hybrid linear-attention models, MLA/MoE models, multimodal models, and future RL/disaggregated variants through one execution model. +OpenInfer should not become a single deep abstraction that forces dense full-attention models, hybrid linear-attention models, MLA/MoE models, multimodal models, and future RL/disaggregated variants through one execution model. The project should instead use this shape: @@ -80,7 +80,7 @@ The initial human/LLM index should live beside the kernels crate so an engineer ## Simulator -Given a model config and a selected kernel set, pegainfer should be able to build an offline performance estimate: +Given a model config and a selected kernel set, openinfer should be able to build an offline performance estimate: - prefill TTFT for fixed prompt lengths, e.g. 1k, 2k, 10k; - decode TPOT for bs1 and high-batch cases; @@ -108,7 +108,7 @@ The low-overhead path should record: The debug path can sample CUDA events or CUPTI activity for a specific request. Whole-process `nsys` remains the deep offline tool, but online traces should answer "what happened to this request" without running a heavyweight profiler continuously. -The current vLLM frontend bridge already has protocol fields for `trace_headers`, `prefill_stats`, and logprob payloads. Filling those with pegainfer scheduler/runtime data is the natural integration point. +The current vLLM frontend bridge already has protocol fields for `trace_headers`, `prefill_stats`, and logprob payloads. Filling those with openinfer scheduler/runtime data is the natural integration point. 
## PegaFlow Boundary @@ -145,14 +145,14 @@ This keeps Qwen3, Qwen3.5, and DSV3 free to use different state layouts while st ## Debrief -- **Outcome**: Captured the decision that pegainfer's next architecture should be per-model engines backed by shared frontend, runtime, kernel measurement, tracing, and PegaFlow data-plane layers. +- **Outcome**: Captured the decision that openinfer's next architecture should be per-model engines backed by shared frontend, runtime, kernel measurement, tracing, and PegaFlow data-plane layers. - **Pitfalls encountered**: - `docs/index.md` already referenced this file, but the file was absent locally. This doc fills that routing gap. - **Lessons learned**: - The existing source is already moving away from a single universal model abstraction; the documentation should make that direction explicit so future refactors do not fight the codebase. - **Follow-ups**: - Extract the Qwen3-4B model crate next. That crate should own config parsing, weight loading, state layout, and the prefill/decode/unified kernel DAG. - - If a TOML/JSON kernel manifest is still useful, put it in the Qwen3-4B model crate and make it generated or validated from the Rust DAG. Do not hand-maintain model manifests in `pegainfer-kernels`. + - If a TOML/JSON kernel manifest is still useful, put it in the Qwen3-4B model crate and make it generated or validated from the Rust DAG. Do not hand-maintain model manifests in `openinfer-kernels`. - Turn the kernel ledger into a concrete artifact format only after at least one model crate and tracing path need it. - Add request trace IDs and scheduler step spans through the vLLM frontend bridge. - Define the first PegaFlow KV block descriptor for Qwen3 paged KV. 
diff --git a/docs/subsystems/kernels/typed-forward-pipeline.md b/docs/subsystems/kernels/typed-forward-pipeline.md index f2b7952f..f3186cd6 100644 --- a/docs/subsystems/kernels/typed-forward-pipeline.md +++ b/docs/subsystems/kernels/typed-forward-pipeline.md @@ -1,6 +1,6 @@ # Typed Forward Pipeline Macro -> **TL;DR:** Build a reusable typed tensor pipeline macro in `pegainfer-kernels` so model crates can express common `typed_ops` forward chains without model-specific wrapper macros. +> **TL;DR:** Build a reusable typed tensor pipeline macro in `openinfer-kernels` so model crates can express common `typed_ops` forward chains without model-specific wrapper macros. > > **Last touched:** 2026-05 @@ -9,16 +9,16 @@ - **Read**: - `docs/index.md` - routed this task to the kernels subsystem with Kimi-K2 as the first consumer. - `docs/models/kimi-k2/bringup-history.md` - showed the Kimi decode/prefill hot paths and the CUDA Graph boundary constraints that the macro must preserve. - - `docs/subsystems/kernels/pegainfer-kernels-boundary.md` - confirmed reusable kernel/runtime helpers belong in `pegainfer-kernels`, while model-specific execution remains in model crates. - - `pegainfer-kernels/src/forward_pass.rs` - found the existing `typed_forward_pass!` DSL covering a subset of typed ops. - - `pegainfer-kernels/src/typed_ops.rs` - confirmed the typed op surface available to the macro. - - `pegainfer-kimi-k2/src/runner/worker.rs` - identified repeated typed op chains in MLA, dense MLP, and MoE paths. + - `docs/subsystems/kernels/openinfer-kernels-boundary.md` - confirmed reusable kernel/runtime helpers belong in `openinfer-kernels`, while model-specific execution remains in model crates. + - `openinfer-kernels/src/forward_pass.rs` - found the existing `typed_forward_pass!` DSL covering a subset of typed ops. + - `openinfer-kernels/src/typed_ops.rs` - confirmed the typed op surface available to the macro. 
+ - `openinfer-kimi-k2/src/runner/worker.rs` - identified repeated typed op chains in MLA, dense MLP, and MoE paths. - **Relevant history**: - `docs/models/kimi-k2/bringup-history.md` records that decode graph capture requires stable pointers and no decode-step allocation; macro expansion must not hide allocations inside decode paths. - **Plan**: - 1. Extend `pegainfer-kernels/src/forward_pass.rs` into a generic typed pipeline macro with explicit `ctx`, `eps`, optional `seq_len`, and configurable GEMM mode. + 1. Extend `openinfer-kernels/src/forward_pass.rs` into a generic typed pipeline macro with explicit `ctx`, `eps`, optional `seq_len`, and configurable GEMM mode. 2. Support reusable statements for typed tensor allocation, `rms_norm`, `gemm`, `silu_mul`, `add`, `swap`, bf16/f32 conversion, and escape hatches for arbitrary calls that return `Result`. - 3. Replace repeated typed op chains in `pegainfer-kimi-k2/src/runner/worker.rs` with the generic macro while leaving Kimi-specific kernels explicit. + 3. Replace repeated typed op chains in `openinfer-kimi-k2/src/runner/worker.rs` with the generic macro while leaving Kimi-specific kernels explicit. 4. Run formatting and the narrowest compile checks that exercise the touched crates. - **Risks / open questions**: - Macro grammar must remain readable at call sites; too much DSL would make CUDA graph and borrow ordering harder to audit. @@ -28,13 +28,13 @@ ### Step 1: Replace the local forward macro with a reusable pipeline -- Reworked `pegainfer-kernels/src/forward_pass.rs` into `typed_pipeline!`. +- Reworked `openinfer-kernels/src/forward_pass.rs` into `typed_pipeline!`. - Added pipeline statements for tensor allocation, typed GEMM/RMSNorm/add/SiLU, bf16/f32 conversion, swaps, and explicit `try`/`call` escapes for model kernels. - Kept decode graph paths allocation-free by requiring `gemm = prefill` for any `tensor` allocation statement. 
### Step 2: Make typed ops require typed weights -- Removed the untyped weight adapter layer from `pegainfer-kernels/src/typed_ops.rs`. +- Removed the untyped weight adapter layer from `openinfer-kernels/src/typed_ops.rs`. - `gemm_*_into` now accepts `GpuWeight` only. - RMSNorm helpers now accept `NormWeight` only. - Runtime-row vocab weights use `GpuTensor` so embedding/lm-head keep hidden width static while vocab rows remain runtime. @@ -42,7 +42,7 @@ ### Step 3: Move Kimi forward paths onto the generic pipeline -- Replaced repeated op chains in `pegainfer-kimi-k2/src/runner/worker/forward.rs` and `state.rs` with `typed_pipeline!`. +- Replaced repeated op chains in `openinfer-kimi-k2/src/runner/worker/forward.rs` and `state.rs` with `typed_pipeline!`. - Converted fixed Kimi decode/prefill weights to `GpuWeight`/`NormWeight` at load/package boundaries. - Converted token embedding and lm-head cache entries to `GpuTensor`. - Tightened MLA wrappers with seq_len/cache metadata validation and updated kernel-report measurement paths to call typed MLA APIs. @@ -50,9 +50,9 @@ ### Step 4: Verify -- `PEGAINFER_CUDA_SM=90a PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python3 cargo check --release -p pegainfer-kimi-k2 --features kimi-k2 --tests` passed. -- `PEGAINFER_CUDA_SM=90a PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python3 cargo check --release -p pegainfer-kimi-k2 --lib` passed after gating runtime/weights exports behind `kimi-k2`. -- `PEGAINFER_CUDA_SM=90a PEGAINFER_TRITON_PYTHON=$LOCAL_PEGAINFER_DIR/.venv/bin/python3 cargo check --release -p pegainfer-kimi-k2 --features kernel-report --bins` passed after migrating report harness inputs to typed Kimi kernel wrappers. +- `OPENINFER_CUDA_SM=90a OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python3 cargo check --release -p openinfer-kimi-k2 --features kimi-k2 --tests` passed. 
+- `OPENINFER_CUDA_SM=90a OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python3 cargo check --release -p openinfer-kimi-k2 --lib` passed after gating runtime/weights exports behind `kimi-k2`. +- `OPENINFER_CUDA_SM=90a OPENINFER_TRITON_PYTHON=$LOCAL_OPENINFER_DIR/.venv/bin/python3 cargo check --release -p openinfer-kimi-k2 --features kernel-report --bins` passed after migrating report harness inputs to typed Kimi kernel wrappers. - Final audit replaced the remaining worker load `debug_assert_eq!` rank/report checks with release `ensure!` errors and confirmed the Kimi worker/MLA typed path has no `AsGemmWeight`, `AsNormWeight`, `typed_forward`, or `TypedDecodeScratch` remnants. ## Debrief diff --git a/docs/subsystems/runtime/kv-cache-design.md b/docs/subsystems/runtime/kv-cache-design.md index 2cccb5a3..c1b451ff 100644 --- a/docs/subsystems/runtime/kv-cache-design.md +++ b/docs/subsystems/runtime/kv-cache-design.md @@ -169,7 +169,7 @@ impl BlockManager { **为什么 allocate 用 RequestId 而非返回 handle?** -handle 模式(Dynamo 的 `MutableBlock`/`CompleteBlock`)适合跨进程、多 worker 场景。pegainfer 是单进程、单 scheduler 线程模型——block ownership 在 `BlockManager` 内部用 `HashMap<RequestId, Vec<BlockId>>` 跟踪即可。返回 handle 会引入 Arc/refcount 开销但没有实际收益。 +handle 模式(Dynamo 的 `MutableBlock`/`CompleteBlock`)适合跨进程、多 worker 场景。openinfer 是单进程、单 scheduler 线程模型——block ownership 在 `BlockManager` 内部用 `HashMap<RequestId, Vec<BlockId>>` 跟踪即可。返回 handle 会引入 Arc/refcount 开销但没有实际收益。 ### RequestBlocks — per-request 视图 @@ -284,12 +284,12 @@ pub enum KernelKvMetadata { ### 组装:谁持有什么 ``` -pegainfer-core: +openinfer-core: BlockId, BlockState, BlockManager, BlockManagerStats RequestBlocks PhysicalBackend trait, KernelKvMetadata -pegainfer-core (或 pegainfer-kernels): +openinfer-core (或 openinfer-kernels): FullAttentionBackend MlaBackend @@ -422,11 +422,11 @@ bs=64, avg 16K tokens/req → 1,000 blocks/req → 64,000 blocks → 75% utiliza ## 已落地代码 -### `pegainfer-block-manager` crate +### `openinfer-block-manager` crate 独立 crate,零 GPU 依赖,纯逻辑。从 Dynamo `kvbm-logical`(Apache-2.0)的
`BlockStore` 精简而来。 -**文件**: `pegainfer-block-manager/src/lib.rs` +**文件**: `openinfer-block-manager/src/lib.rs` **核心类型**: - `BlockId(u32)` — block identity @@ -468,11 +468,11 @@ impl BlockManager { ### Step 1: ✓ BlockManager logical layer -已完成。`pegainfer-block-manager` crate 独立,零 GPU 依赖。 +已完成。`openinfer-block-manager` crate 独立,零 GPU 依赖。 ### Step 2: MlaBackend (PhysicalBackend) -在 `pegainfer-core` 中实现 `MlaBackend`:双 buffer (ckv + kpe),block_size=16,共享 BlockId 映射。对齐 K2 现有 FlashInfer MLA kernel 的 metadata 接口。 +在 `openinfer-core` 中实现 `MlaBackend`:双 buffer (ckv + kpe),block_size=16,共享 BlockId 映射。对齐 K2 现有 FlashInfer MLA kernel 的 metadata 接口。 ### Step 3: K2 迁移 diff --git a/docs/subsystems/runtime/pegaflow-offload-integration.md b/docs/subsystems/runtime/pegaflow-offload-integration.md index 2af55b94..c0653d8e 100644 --- a/docs/subsystems/runtime/pegaflow-offload-integration.md +++ b/docs/subsystems/runtime/pegaflow-offload-integration.md @@ -10,7 +10,7 @@ - **pegaflow `block_stride_bytes`**(PR #331 → novitalabs/pegaflow,`feat/inproc-load` 基于其上):解耦"块间步长"与"每块拷贝大小",让 page-first fused buffer 能注册。**已合入 master**。 - **pegaflow 进程内 load API**(PR #333,**已合入**,squash 进 #331 的 `07cac7e`):`LoadCompletion::{Shm,Channel}` + `batch_load_kv_blocks_multi_layer_inproc` → `oneshot::Receiver`,去掉 in-process 调用方对 shm `LoadState` 的依赖(Rust 进程内不需要),非阻塞 poll。 -- **`pegainfer-kv-offload::OffloadEngine`**:拥有 `PegaEngine` + 内嵌 tokio runtime;`Registration::from_buffer` 把 fused page-first buffer 映射成 per-layer 注册(**单段 `[K|V]`**:fused layout 里 K/V 本就连续 = `layer_stride` 一段,`block_stride = page_stride`,`segments=1`——不是 K/V split,那条路需要 `kv_stride > bytes_per_block`,此处不成立)。`save`(async fire-and-forget)/`save_blocking`(eviction handoff,同步捕获)/`query`(GPU+CPU hit)/`load`(oneshot)/`flush_saves`/`evict_all`。 +- **`openinfer-kv-offload::OffloadEngine`**:拥有 `PegaEngine` + 内嵌 tokio runtime;`Registration::from_buffer` 把 fused page-first buffer 映射成 per-layer 注册(**单段 `[K|V]`**:fused layout 里 K/V 本就连续 = `layer_stride` 
一段,`block_stride = page_stride`,`segments=1`——不是 K/V split,那条路需要 `kv_stride > bytes_per_block`,此处不成立)。`save`(async fire-and-forget)/`save_blocking`(eviction handoff,同步捕获)/`query`(GPU+CPU hit)/`load`(oneshot)/`flush_saves`/`evict_all`。 - **`KvBuffer::device_ptr`**(kv-cache):注册用的稳定基址。 - **kvbm↔bytes 桥**(kv-cache `RequestKv`):`prompt_block_hashes` / `assigned_block_hashes` / `prefix_matched_blocks`,`SequenceHash::as_u128()` → 16B content key。 - **`tests/cpu_roundtrip.rs`**:真实 `KvBuffer` 上写已知 pattern → save → query → load 到**另一组** block → 字节级比对 + 零块负向控制。**通过**。 @@ -27,7 +27,7 @@ pegaflow(`third_party/pegaflow`,novita,Apache-2.0)原本是 **vLLM 的 KV connector 服务端**:KV 的编排逻辑(何时 save、query 几个 block、prefix 匹配、与 scheduler 的 admission/preemption 交互)全在 vLLM 的 Python connector 那一侧,`pegaflow-core` 只是底下干 D2H/H2D + 分层存储的**肌肉**。 -pegainfer 不是 vLLM,那套 Python connector 一行用不上。接入要做的是**用 Rust 自建那颗 connector 大脑**——而 kvbm 的 logical/physical 分层正是它的骨架: +openinfer 不是 vLLM,那套 Python connector 一行用不上。接入要做的是**用 Rust 自建那颗 connector 大脑**——而 kvbm 的 logical/physical 分层正是它的骨架: ``` per-model scheduler ← 策略:哪些 block 该 resident(full 前缀 / MLA 全前缀 / 未来稀疏选择) @@ -39,7 +39,7 @@ pegaflow-core ← 机制底座:D2H/H2D、DRAM/SSD/RDMA 分层 ## 2. 战略决策:pegaflow 取代 kvbm 死代码做物理 tier -pegainfer 仓里 vendored 的 `kvbm-physical` / `kvbm-engine` 设计目标就是分层卸载,但**至今零接线、是死代码**(无任何非 kvbm crate 依赖)。同时养两套分层卸载违反项目复杂度红线。本 spec 采纳:**`kvbm-logical`(逻辑层 + 前缀匹配)保留,pegaflow-core 顶替它下面缺失的物理卸载层,砍掉 `kvbm-physical`/`kvbm-engine`**。理由:pegaflow 同组维护、已上 PyPI、有 H800 benchmark、库化干净;kvbm 那两层是纯负债。 +openinfer 仓里 vendored 的 `kvbm-physical` / `kvbm-engine` 设计目标就是分层卸载,但**至今零接线、是死代码**(无任何非 kvbm crate 依赖)。同时养两套分层卸载违反项目复杂度红线。本 spec 采纳:**`kvbm-logical`(逻辑层 + 前缀匹配)保留,pegaflow-core 顶替它下面缺失的物理卸载层,砍掉 `kvbm-physical`/`kvbm-engine`**。理由:pegaflow 同组维护、已上 PyPI、有 H800 benchmark、库化干净;kvbm 那两层是纯负债。 ## 3. 
三模型三 KV 形态 → connector 边界(实据) @@ -52,7 +52,7 @@ pegainfer 仓里 vendored 的 `kvbm-physical` / `kvbm-engine` 设计目标就是 **边界结论**:connector 只收 **block-structured、content-addressable** 的 KV(MLA latent / full-attn paged)。recurrent/SSM state 不进 connector。稀疏的 active-set gather 是独立的、未来的课题。 -证据:Kimi `pegainfer-kimi-k2/src/runner/{worker.rs:612-619, cache.rs:63-80, mla.rs:38-48}`、`scheduler.rs:16,27,146,180`、`pool.rs:123`;Qwen3.5 linear `pegainfer-qwen35-4b/src/...recurrent.rs`、`batch_decode_graph.rs:82-86`;DeepSeek `pegainfer-deepseek-v4/src/...state.rs:220, indexer.rs:609-670`、`csrc/.../deepseek_indexer.cu:470-527`。 +证据:Kimi `openinfer-kimi-k2/src/runner/{worker.rs:612-619, cache.rs:63-80, mla.rs:38-48}`、`scheduler.rs:16,27,146,180`、`pool.rs:123`;Qwen3.5 linear `openinfer-qwen35-4b/src/...recurrent.rs`、`batch_decode_graph.rs:82-86`;DeepSeek `openinfer-deepseek-v4/src/...state.rs:220, indexer.rs:609-670`、`csrc/.../deepseek_indexer.cu:470-527`。 ## 4. 路线 @@ -64,15 +64,15 @@ pegainfer 仓里 vendored 的 `kvbm-physical` / `kvbm-engine` 设计目标就是 四条承重假设由 10-agent workflow 对抗验证: -1. **✅ 进程内注册裸指针,无 IPC、无第二进程**:`register_context_layer_batch(data_ptrs: &[u64])`(`pegaflow-core/src/lib.rs:242-259`)收裸设备地址,拷贝路径直接喂给 driver API `cuMemcpyDtoHAsync_v2`(`transfer/memcpy.rs:82-89`);IPC 只在 server/Python 层,core 零 IPC 调用点。cudarc 附设备 **primary context**(与 pegainfer 同一),自建 worker stream。 +1. **✅ 进程内注册裸指针,无 IPC、无第二进程**:`register_context_layer_batch(data_ptrs: &[u64])`(`pegaflow-core/src/lib.rs:242-259`)收裸设备地址,拷贝路径直接喂给 driver API `cuMemcpyDtoHAsync_v2`(`transfer/memcpy.rs:82-89`);IPC 只在 server/Python 层,core 零 IPC 调用点。cudarc 附设备 **primary context**(与 openinfer 同一),自建 worker stream。 2. 
**✅ 依赖无致命冲突**:cudarc 单 major(0.19.3↔0.19.7 统一),cuda-12080/12090 共存(build.rs 取高版本),tokio/tonic/prost 兼容。**依赖行**(git rev pin 到上游 master `07cac7e`,含 #331+#333;`default-features=false` 砍掉 pegaflow 自带的 `cuda-12`/`rdma`,靠 workspace cudarc 提供的 `cuda-12090`+`nvrtc` 满足——pegaflow-core 无 `cfg(cuda-12)` gate): ```toml pegaflow-core = { git = "https://github.com/novitalabs/pegaflow.git", rev = "07cac7e50e8ae7be15ad1b9311401039c9ee439b", default-features = false } ``` 下次再改 pegaflow:临时换回 path dep 共同开发 → 提 PR → 合入后 re-pin rev。 - **为何 `cuda-12` 而非 `cuda-13`**(本机明明是 CUDA 13.3 toolkit / 13.0 driver):pegainfer 有意锁 `cudarc/cuda-12090`(`Cargo.toml:92-93`,issue #263——配 cudarc 0.19.5+ 的 per-symbol lazy loading,压低 binding level 以**不抬高 runtime driver floor**、保宽部署兼容;故意不用 `cuda-version-from-build-system` 自动,否则 driver floor 会跟着构建机 toolkit 走)。cudarc 在 workspace 是**单实例、feature 取并集后选最高版本**:pegaflow 用 `cuda-12` 并集后仍是 12090、不抬 floor;用 `cuda-13`(→ `cudarc/cuda-13000`)会把**整个 workspace 含 pegainfer 自己**顶到 13000、driver floor 抬到 CUDA 13,撞翻 #263。整体迁 cu13 是独立决策(须同时改 pegainfer 的 cudarc + revisit #263),本期不做。 + **为何 `cuda-12` 而非 `cuda-13`**(本机明明是 CUDA 13.3 toolkit / 13.0 driver):openinfer 有意锁 `cudarc/cuda-12090`(`Cargo.toml:92-93`,issue #263——配 cudarc 0.19.5+ 的 per-symbol lazy loading,压低 binding level 以**不抬高 runtime driver floor**、保宽部署兼容;故意不用 `cuda-version-from-build-system` 自动,否则 driver floor 会跟着构建机 toolkit 走)。cudarc 在 workspace 是**单实例、feature 取并集后选最高版本**:pegaflow 用 `cuda-12` 并集后仍是 12090、不抬 floor;用 `cuda-13`(→ `cudarc/cuda-13000`)会把**整个 workspace 含 openinfer 自己**顶到 13000、driver floor 抬到 CUDA 13,撞翻 #263。整体迁 cu13 是独立决策(须同时改 openinfer 的 cudarc + revisit #263),本期不做。 3. **⚠️ Layout**:block-hash 键直接适配(`u64→Vec`);page-first layout **不适配**(见 §5.R1);Kimi per-layer 布局**天然适配**。 -4. **✅ 流同步**:host-side 粗同步可解——save 前 pegainfer 必须 `synchronize()` compute stream(pegaflow 私有 stream 只自同步,`gpu_worker.rs:520-528`),restore 前自旋 poll `LoadState`。代价:损 compute/offload 重叠(见 §6.R3)。 +4. 
**✅ 流同步**:host-side 粗同步可解——save 前 openinfer 必须 `synchronize()` compute stream(pegaflow 私有 stream 只自同步,`gpu_worker.rs:520-528`),restore 前自旋 poll `LoadState`。代价:损 compute/offload 重叠(见 §6.R3)。 ## 6. connector 接口(dense-first,稀疏留门不展开) @@ -111,7 +111,7 @@ trait KvResidencyPolicy { | R4 | 依赖误配(裸 default-features=false / 漏 cuda-12) | minor | §5.2 依赖行已定,CI 编译验证 | | R5 | 稀疏 active-set offload 的 token-vs-block 粒度落差 | 已知开放 | 见下,不在本期 | -**稀疏(已知开放问题,不在本期)**:连 dynamo KVBM 都没解 sparse attention offloading——它的复用是 radix 前缀、offload 是 frequency/LRU、tier 是整请求异步流动,对 SWA 只在 router 透传 `kv_cache_spec_sliding_window` 做 window-aware 前缀,对 topk 零处理。没有现成抽象可继承。pegainfer 侧 DeepSeek 的 indexer 已产出显式可拦截的 active-set 信号,但 token/row 粒度 ≠ block 粒度,且 compressor 已控 footprint 当前不需 offload。机制层(内容寻址 + 可插拔 policy + 语义无关 transfer)本就不堵稀疏,真正缺的 decode-loop gather 大脑到时候结合具体模型新写更准。 +**稀疏(已知开放问题,不在本期)**:连 dynamo KVBM 都没解 sparse attention offloading——它的复用是 radix 前缀、offload 是 frequency/LRU、tier 是整请求异步流动,对 SWA 只在 router 透传 `kv_cache_spec_sliding_window` 做 window-aware 前缀,对 topk 零处理。没有现成抽象可继承。openinfer 侧 DeepSeek 的 indexer 已产出显式可拦截的 active-set 信号,但 token/row 粒度 ≠ block 粒度,且 compressor 已控 footprint 当前不需 offload。机制层(内容寻址 + 可插拔 policy + 语义无关 transfer)本就不堵稀疏,真正缺的 decode-loop gather 大脑到时候结合具体模型新写更准。 ## 8. 下一步:Kimi MLA 最小 spike diff --git a/docs/subsystems/runtime/qwen3-kvbm-integration-spec.md b/docs/subsystems/runtime/qwen3-kvbm-integration-spec.md index 8c236353..83116656 100644 --- a/docs/subsystems/runtime/qwen3-kvbm-integration-spec.md +++ b/docs/subsystems/runtime/qwen3-kvbm-integration-spec.md @@ -124,9 +124,9 @@ pub struct KvBuffer { ## 详细改动清单 -### 1. pegainfer-core: 改造 KvPool → KvBuffer +### 1. 
openinfer-core: 改造 KvPool → KvBuffer -**文件**: `pegainfer-core/src/kv_pool.rs` → 重命名为 `kv_buffer.rs` +**文件**: `openinfer-core/src/kv_pool.rs` → 重命名为 `kv_buffer.rs` - 删除 `PagePool` 依赖,删除 `KvState` - `KvPool::new()` → `KvBuffer::new()`: 分配 `num_blocks × page_stride` 的 GPU buffer @@ -134,11 +134,11 @@ pub struct KvBuffer { - 新增 `KvBuffer::padding_block_id()` → 返回 block 0(约定 block 0 是 padding) - `KvDesc` 不变:仍然接收 `&[i32]` page indices + seq_len -**文件**: `pegainfer-core/Cargo.toml` +**文件**: `openinfer-core/Cargo.toml` - 添加 `kvbm-logical = { workspace = true }` -### 2. pegainfer-qwen3-4b/src/executor.rs: SequenceStore 替换 RequestStateStore +### 2. openinfer-qwen3-4b/src/executor.rs: SequenceStore 替换 RequestStateStore **删除**: - `RequestStateStore` @@ -185,7 +185,7 @@ fn build_kv_desc<'a>( - `alloc_kv()` → 在 `SchedulableSequence::new()` 时创建 - `drop_request()` → `seq.release()` + remove from SequenceStore -### 3. pegainfer-qwen3-4b/src/scheduler.rs: Admission 改用 BlockManager +### 3. openinfer-qwen3-4b/src/scheduler.rs: Admission 改用 BlockManager **删除**: - `pages_needed()`, `max_request_tokens()`, `max_active_tokens()`, `current_active_tokens()`, `active_future_pages()` @@ -307,12 +307,12 @@ let padding_immutable = block_manager.register_block( ## 接入顺序 -1. **pegainfer-core**: 改造 `kv_pool.rs` → `kv_buffer.rs`(保留 `KvLayout`、`KvDesc`,删 `PagePool` 依赖)。保留旧 `kv_pool.rs` 不删,Qwen3.5 等其他模型仍用旧路径。 -2. **pegainfer-qwen3-4b/executor.rs**: `RequestStateStore` → `SequenceStore`,引入 `BlockManager`。 -3. **pegainfer-qwen3-4b/scheduler.rs**: Admission 改为 block 计数。 -4. **pegainfer-qwen3-4b/{prefill,batch_decode,unified_forward}.rs**: Forward path 适配 `SchedulableSequence` + `KvDesc` from block IDs。 -5. **pegainfer-qwen3-4b/batch_decode_buffers.rs**: `sync_paged_meta` 接口改造。 -6. **E2E 测试验证**: `cargo test --release -p pegainfer-qwen3-4b --test e2e` +1. **openinfer-core**: 改造 `kv_pool.rs` → `kv_buffer.rs`(保留 `KvLayout`、`KvDesc`,删 `PagePool` 依赖)。保留旧 `kv_pool.rs` 不删,Qwen3.5 等其他模型仍用旧路径。 +2. 
**openinfer-qwen3-4b/executor.rs**: `RequestStateStore` → `SequenceStore`,引入 `BlockManager`。 +3. **openinfer-qwen3-4b/scheduler.rs**: Admission 改为 block 计数。 +4. **openinfer-qwen3-4b/{prefill,batch_decode,unified_forward}.rs**: Forward path 适配 `SchedulableSequence` + `KvDesc` from block IDs。 +5. **openinfer-qwen3-4b/batch_decode_buffers.rs**: `sync_paged_meta` 接口改造。 +6. **E2E 测试验证**: `cargo test --release -p openinfer-qwen3-4b --test e2e` 7. **TPOT 回归测试**: bench 跑一遍确认无退化。 每一步编译通过 + 已有 UT 通过后再进入下一步。 @@ -322,7 +322,7 @@ let padding_immutable = block_manager.register_block( ## 验收标准 - [ ] `cargo check --workspace` 通过 -- [ ] `cargo test --release -p pegainfer-qwen3-4b --test e2e` 全绿 +- [ ] `cargo test --release -p openinfer-qwen3-4b --test e2e` 全绿 - [ ] `cargo test --release -p kvbm-logical --lib` 全绿(port 没被改坏) - [ ] 其他模型 crate 不受影响(仍然用 `KvPool`) - [ ] 单请求 TPOT bench ±3% 以内 diff --git a/docs/subsystems/runtime/runtime.md b/docs/subsystems/runtime/runtime.md index 5fb976ab..83bf05c7 100644 --- a/docs/subsystems/runtime/runtime.md +++ b/docs/subsystems/runtime/runtime.md @@ -1,6 +1,6 @@ # Runtime -> **TL;DR:** Runtime complexity grows fast as new model families come in. We control it by keeping a shared core (`pegainfer-core`) that owns the generation contract and orchestration, and pushing model-specific execution into per-model crates behind a single trait. The trait deliberately hides prefill vs decode and homogeneous vs hybrid attention from the caller. +> **TL;DR:** Runtime complexity grows fast as new model families come in. We control it by keeping a shared core (`openinfer-core`) that owns the generation contract and orchestration, and pushing model-specific execution into per-model crates behind a single trait. The trait deliberately hides prefill vs decode and homogeneous vs hybrid attention from the caller. > > **Last touched:** 2026-05. 
@@ -36,9 +36,9 @@ Design points worth keeping in mind: ## What's been done -- Shared runtime/API entry extracted into `pegainfer-core`: sampler, page/KV pools, weight loading, CUDA Graph state, shared op adapters, `ModelForward` / `GenerationState`. -- Per-model crates (`pegainfer-qwen3-4b`, `pegainfer-qwen35-4b`) own their config, weights, prefill/decode execution, scheduler, and tests. -- Generation loop unified into `pegainfer-core` against the trait — replaces ~120 lines of duplicated orchestration per model. +- Shared runtime/API entry extracted into `openinfer-core`: sampler, page/KV pools, weight loading, CUDA Graph state, shared op adapters, `ModelForward` / `GenerationState`. +- Per-model crates (`openinfer-qwen3-4b`, `openinfer-qwen35-4b`) own their config, weights, prefill/decode execution, scheduler, and tests. +- Generation loop unified into `openinfer-core` against the trait — replaces ~120 lines of duplicated orchestration per model. - Internal modules (decode buffers, KV cache, recurrent state, FFI bindings, tokenizer streaming, weight-loader helpers) pulled back behind crate-local visibility. `unreachable_pub` is meaningful again. - Trace machinery removed from the active public surface. diff --git a/docs/subsystems/scheduler/scheduler.md b/docs/subsystems/scheduler/scheduler.md index 4d02095f..ff7530fb 100644 --- a/docs/subsystems/scheduler/scheduler.md +++ b/docs/subsystems/scheduler/scheduler.md @@ -50,7 +50,7 @@ The bridge to FlashInfer's `paged_kv_t` (which expects separate `k_data`/`v_data ### QPS=2 head-to-head vs vLLM 0.18.x -| Metric | pegainfer | vLLM | Delta | +| Metric | openinfer | vLLM | Delta | |---|---|---|---| | TTFT median | 301ms | 359ms | **−16%** | | TTFT p99 | 951ms | 1245ms | **−24%** | @@ -60,11 +60,11 @@ The bridge to FlashInfer's `paged_kv_t` (which expects separate `k_data`/`v_data | failed | 9 | **0** | — | | ITL p99 | 291ms | **211ms** | +38% | -pegainfer wins 17/20 metrics. 
Std lower across the board (TTFT −20%, TPOT −22%). vLLM wins on robustness (zero failed) and ITL tail. +openinfer wins 17/20 metrics. Std lower across the board (TTFT −20%, TPOT −22%). vLLM wins on robustness (zero failed) and ITL tail. ### Decode TPOT scaling (in=1, out=128) -| Concurrency | pegainfer | vLLM | +| Concurrency | openinfer | vLLM | |---|---|---| | 1 | 10.75ms | 11.31ms | | 4 | 10.96ms | 11.52ms | diff --git a/pegainfer-bench/Cargo.toml b/openinfer-bench/Cargo.toml similarity index 76% rename from pegainfer-bench/Cargo.toml rename to openinfer-bench/Cargo.toml index afe29104..d0803d66 100644 --- a/pegainfer-bench/Cargo.toml +++ b/openinfer-bench/Cargo.toml @@ -1,12 +1,12 @@ [package] -name = "pegainfer-bench" +name = "openinfer-bench" version = "0.1.0" edition = "2024" [dependencies] anyhow = { workspace = true } cudarc = { workspace = true } -pegainfer-kernels = { workspace = true } +openinfer-kernels = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/pegainfer-bench/src/lib.rs b/openinfer-bench/src/lib.rs similarity index 99% rename from pegainfer-bench/src/lib.rs rename to openinfer-bench/src/lib.rs index 0f8c88fe..c521c7d9 100644 --- a/pegainfer-bench/src/lib.rs +++ b/openinfer-bench/src/lib.rs @@ -10,7 +10,7 @@ use anyhow::{Result, anyhow, bail}; use cudarc::driver::sys; -use pegainfer_kernels::tensor::{DeviceContext, DeviceMatrix, GpuWeight, KernelCall, TensorSpec}; +use openinfer_kernels::tensor::{DeviceContext, DeviceMatrix, GpuWeight, KernelCall, TensorSpec}; use serde::Serialize; #[derive(Clone, Debug, Serialize)] diff --git a/pegainfer-comm/Cargo.toml b/openinfer-comm/Cargo.toml similarity index 52% rename from pegainfer-comm/Cargo.toml rename to openinfer-comm/Cargo.toml index 88631554..faf95d4e 100644 --- a/pegainfer-comm/Cargo.toml +++ b/openinfer-comm/Cargo.toml @@ -1,8 +1,8 @@ [package] -name = "pegainfer-comm" +name = "openinfer-comm" edition = "2024" publish = false -description = 
"PegaInfer comm-backend public surface (EP all-to-all first; future data-movement surfaces later). Requires CUDA, RDMA Verbs, and GDRCopy on the host." +description = "OpenInfer comm-backend public surface (EP all-to-all first; future data-movement surfaces later). Requires CUDA, RDMA Verbs, and GDRCopy on the host." autobins = false autotests = false @@ -10,10 +10,10 @@ autotests = false thiserror = { workspace = true } anyhow = { workspace = true } log = { workspace = true } -pegainfer-core = { workspace = true } -p2p-all-to-all = { path = "crates/pegainfer-comm-p2p-all-to-all", package = "pegainfer-comm-p2p-all-to-all", features = ["hw-rdma"] } -fabric-lib = { path = "crates/pegainfer-comm-fabric-lib", package = "pegainfer-comm-fabric-lib", features = ["hw-rdma"] } -cuda-lib = { path = "crates/pegainfer-comm-cuda-lib", package = "pegainfer-comm-cuda-lib", features = ["hw-cuda"] } +openinfer-core = { workspace = true } +p2p-all-to-all = { path = "crates/openinfer-comm-p2p-all-to-all", package = "openinfer-comm-p2p-all-to-all", features = ["hw-rdma"] } +fabric-lib = { path = "crates/openinfer-comm-fabric-lib", package = "openinfer-comm-fabric-lib", features = ["hw-rdma"] } +cuda-lib = { path = "crates/openinfer-comm-cuda-lib", package = "openinfer-comm-cuda-lib", features = ["hw-cuda"] } cudarc = { workspace = true } half = { workspace = true } clap = { workspace = true } diff --git a/pegainfer-comm/LICENSE b/openinfer-comm/LICENSE similarity index 100% rename from pegainfer-comm/LICENSE rename to openinfer-comm/LICENSE diff --git a/pegainfer-comm/MANIFEST.in b/openinfer-comm/MANIFEST.in similarity index 100% rename from pegainfer-comm/MANIFEST.in rename to openinfer-comm/MANIFEST.in diff --git a/pegainfer-comm/NOTICE.md b/openinfer-comm/NOTICE.md similarity index 91% rename from pegainfer-comm/NOTICE.md rename to openinfer-comm/NOTICE.md index 4906ba5a..70a1bffb 100644 --- a/pegainfer-comm/NOTICE.md +++ b/openinfer-comm/NOTICE.md @@ -15,7 +15,7 @@ all-to-all 
communication library. ## Scope of this port This port narrows `pplx-garden` to a **Verbs-only** RDMA transport backend, to -be reused by PegaInfer for EP all-to-all on NVLink + InfiniBand / RoCE +be reused by OpenInfer for EP all-to-all on NVLink + InfiniBand / RoCE machines. The libfabric / non-Verbs RDMA provider is removed in its entirety. ## Modifications applied during the port @@ -69,12 +69,12 @@ the authoritative attribution record for this vendored import; they must be retained in any redistribution. The vendored Rust workspace crates have been renamed under the -`pegainfer-comm-*` prefix (for example `pegainfer-comm-p2p-all-to-all`, -`pegainfer-comm-fabric-lib`) so the workspace can coexist with PegaInfer's -existing crate namespace and so the public surface is PegaInfer-owned: +`openinfer-comm-*` prefix (for example `openinfer-comm-p2p-all-to-all`, +`openinfer-comm-fabric-lib`) so the workspace can coexist with OpenInfer's +existing crate namespace and so the public surface is OpenInfer-owned: -* **PegaInfer-owned in this tree** — the Rust workspace crate names, the - top-level `pegainfer-comm` crate, and the public PegaInfer-facing API it +* **OpenInfer-owned in this tree** — the Rust workspace crate names, the + top-level `openinfer-comm` crate, and the public OpenInfer-facing API it exposes (`EpAllToAll`, `EpBackend`, `EpBackendBuilder`, `EpTopology`, `DispatchPlan` / `CombinePlan`, `SendBuf` / `RecvBuf`, the handle and error types). These names are this tree's, not upstream's. @@ -87,7 +87,7 @@ existing crate namespace and so the public surface is PegaInfer-owned: * **Internal layout names** — internal module / file names inside the renamed Rust crates may continue to mirror upstream layout when that helps compare against the vendored tests or upstream source. These names are not - PegaInfer public API. + OpenInfer public API. 
When this tree is included in further redistributions, this NOTICE plus the original MIT `LICENSE` must be carried alongside the code per the MIT terms. diff --git a/pegainfer-comm/README.md b/openinfer-comm/README.md similarity index 91% rename from pegainfer-comm/README.md rename to openinfer-comm/README.md index 4031ffb5..8c005956 100644 --- a/pegainfer-comm/README.md +++ b/openinfer-comm/README.md @@ -1,7 +1,7 @@ -# pegainfer-comm +# openinfer-comm -Skeleton comm-backend surface for **PegaInfer**: a narrow, hardware-free -trait that PegaInfer's request scheduler will use to drive cross-rank +Skeleton comm-backend surface for **OpenInfer**: a narrow, hardware-free +trait that OpenInfer's request scheduler will use to drive cross-rank data movement (EP all-to-all first; future data-movement surfaces later). **Status: skeleton.** This crate currently exposes the *shape* of the @@ -12,7 +12,7 @@ can be reviewed before the hardware adapter is wired in. The crate is structured so that the default-feature build does not require any hardware-class system header (CUDA SDK, GDRCopy, RDMA -Verbs). This lets PegaInfer's main CI lane `cargo check -p pegainfer-comm` +Verbs). This lets OpenInfer's main CI lane `cargo check -p openinfer-comm` on a barebones development machine. Hardware backends live behind feature flags and only compile in when the matching feature is on. @@ -106,18 +106,18 @@ matures. They are written in to constrain follow-up PRs: feature modes; no caller can obtain a backend whose trait methods would panic. -## Wrapper crates are *not* PegaInfer's public API +## Wrapper crates are *not* OpenInfer's public API This repository contains several upstream-derived wrapper crates (`p2p-all-to-all`, `fabric-lib`, `cuda-lib`, `torch-lib`, `a2a-kernels`, `python-ext`, plus their `*-sys` siblings). They are hardware -implementation packages reached only through `pegainfer-comm` +implementation packages reached only through `openinfer-comm` adapters. 
Their names, types, and feature flags are **not** part of -PegaInfer's API contract and may evolve as the upstream and adapter +OpenInfer's API contract and may evolve as the upstream and adapter layers change. -PegaInfer code (and any code outside this crate) should depend on -`pegainfer-comm` only. Direct use of wrapper-crate types from outside +OpenInfer code (and any code outside this crate) should depend on +`openinfer-comm` only. Direct use of wrapper-crate types from outside this crate's `backend::*` adapter modules is unsupported. ## Usage sketch @@ -128,7 +128,7 @@ intentional — callers must dispatch on the `Result`. **Do not call window: ```rust -use pegainfer_comm::{EpBackendBuilder, EpTopology}; +use openinfer_comm::{EpBackendBuilder, EpTopology}; // `EpTopology` is `#[non_exhaustive]`; outside this crate the constructor // is the only stable way to obtain one. @@ -157,5 +157,5 @@ match EpBackendBuilder::new().topology(topology).build() { See the top-level `LICENSE` and `NOTICE.md`. The hardware backend is being adapted from upstream `pplx-garden`; this crate adds the -PegaInfer-facing public surface skeleton and the feature-gating that +OpenInfer-facing public surface skeleton and the feature-gating that keeps the default build hardware-free. 
diff --git a/pegainfer-comm/benchmarks/__init__.py b/openinfer-comm/benchmarks/__init__.py similarity index 100% rename from pegainfer-comm/benchmarks/__init__.py rename to openinfer-comm/benchmarks/__init__.py diff --git a/pegainfer-comm/benchmarks/bench_all_to_all.py b/openinfer-comm/benchmarks/bench_all_to_all.py similarity index 100% rename from pegainfer-comm/benchmarks/bench_all_to_all.py rename to openinfer-comm/benchmarks/bench_all_to_all.py diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/Cargo.toml b/openinfer-comm/crates/openinfer-comm-a2a-kernels/Cargo.toml similarity index 78% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/Cargo.toml index 5ca84bf2..ae8e0ef1 100644 --- a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-a2a-kernels/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2024" links = "a2a_kernels" -name = "pegainfer-comm-a2a-kernels" +name = "openinfer-comm-a2a-kernels" publish = false [features] @@ -17,4 +17,4 @@ cxx = { workspace = true } [build-dependencies] cc = { workspace = true, features = ["parallel"] } cxx-build = { workspace = true } -build-utils = { path = "../pegainfer-comm-build-utils", package = "pegainfer-comm-build-utils" } +build-utils = { path = "../openinfer-comm-build-utils", package = "openinfer-comm-build-utils" } diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/build.rs b/openinfer-comm/crates/openinfer-comm-a2a-kernels/build.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/build.rs rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/build.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_combine_recv.cu b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_combine_recv.cu similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_combine_recv.cu 
rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_combine_recv.cu diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_combine_send.cu b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_combine_send.cu similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_combine_send.cu rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_combine_send.cu diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_dispatch_recv.cu b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_dispatch_recv.cu similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_dispatch_recv.cu rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_dispatch_recv.cu diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_dispatch_send.cu b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_dispatch_send.cu similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_dispatch_send.cu rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_dispatch_send.cu diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_kernels.h b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_kernels.h similarity index 98% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_kernels.h rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_kernels.h index 185b4a02..ba4be145 100644 --- a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/a2a/a2a_kernels.h +++ b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/a2a/a2a_kernels.h @@ -7,7 +7,7 @@ // `ScalarType` is defined inside the cxx bridge below, under the // `a2a_kernels` namespace. The cxx-generated header provides the C++ enum // declaration that the .cu sources reference as `a2a_kernels::ScalarType`. 
-#include "pegainfer-comm-a2a-kernels/src/hw_cuda_impl.rs.h" +#include "openinfer-comm-a2a-kernels/src/hw_cuda_impl.rs.h" namespace a2a_kernels { diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/combine_utils.cuh b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/combine_utils.cuh similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/combine_utils.cuh rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/combine_utils.cuh diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/common_utils.h b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/common_utils.h similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/common_utils.h rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/common_utils.h diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/device_utils.cuh b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/device_utils.cuh similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/device_utils.cuh rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/device_utils.cuh diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/launch_utils.cuh b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/launch_utils.cuh similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/launch_utils.cuh rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/launch_utils.cuh diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/memory.cuh b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/memory.cuh similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/memory.cuh rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/memory.cuh diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/vector.cuh 
b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/vector.cuh similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/core/vector.cuh rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/core/vector.cuh diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/hw_cuda_impl.rs b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/hw_cuda_impl.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/hw_cuda_impl.rs rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/hw_cuda_impl.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/lib.rs b/openinfer-comm/crates/openinfer-comm-a2a-kernels/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-a2a-kernels/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-a2a-kernels/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-build-utils/Cargo.toml b/openinfer-comm/crates/openinfer-comm-build-utils/Cargo.toml similarity index 54% rename from pegainfer-comm/crates/pegainfer-comm-build-utils/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-build-utils/Cargo.toml index d8429795..61992136 100644 --- a/pegainfer-comm/crates/pegainfer-comm-build-utils/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-build-utils/Cargo.toml @@ -1,4 +1,4 @@ [package] edition = "2024" -name = "pegainfer-comm-build-utils" +name = "openinfer-comm-build-utils" publish = false diff --git a/pegainfer-comm/crates/pegainfer-comm-build-utils/src/lib.rs b/openinfer-comm/crates/openinfer-comm-build-utils/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-build-utils/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-build-utils/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/Cargo.toml b/openinfer-comm/crates/openinfer-comm-cuda-lib/Cargo.toml similarity index 55% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/Cargo.toml 
rename to openinfer-comm/crates/openinfer-comm-cuda-lib/Cargo.toml index 7246e569..0ed9edec 100644 --- a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-cuda-lib/Cargo.toml @@ -1,13 +1,13 @@ [package] edition = "2024" -name = "pegainfer-comm-cuda-lib" +name = "openinfer-comm-cuda-lib" publish = false [features] # Hardware backend feature: when on, the CUDA wrapper implementation is built # and FFI types are re-exported. When off (default), this crate compiles to a # near-empty shell exposing only the `HW_CUDA_ENABLED` diagnostic marker. The -# abstract PegaInfer-facing surface lives in the top-level `pegainfer-comm` +# abstract OpenInfer-facing surface lives in the top-level `openinfer-comm` # crate, NOT here. default = [] hw-cuda = [ @@ -17,13 +17,13 @@ hw-cuda = [ ] [dependencies] -cuda-sys = { path = "../pegainfer-comm-cuda-sys", package = "pegainfer-comm-cuda-sys" } -cudart-sys = { path = "../pegainfer-comm-cudart-sys", package = "pegainfer-comm-cudart-sys" } -gdrapi-sys = { path = "../pegainfer-comm-gdrapi-sys", package = "pegainfer-comm-gdrapi-sys" } +cuda-sys = { path = "../openinfer-comm-cuda-sys", package = "openinfer-comm-cuda-sys" } +cudart-sys = { path = "../openinfer-comm-cudart-sys", package = "openinfer-comm-cudart-sys" } +gdrapi-sys = { path = "../openinfer-comm-gdrapi-sys", package = "openinfer-comm-gdrapi-sys" } libc = { workspace = true } thiserror = { workspace = true } bincode = { workspace = true, features = ["derive", "alloc"] } [dev-dependencies] -proc-lib = { path = "../pegainfer-comm-proc-lib", package = "pegainfer-comm-proc-lib", features = ["hw-cuda"] } +proc-lib = { path = "../openinfer-comm-proc-lib", package = "openinfer-comm-proc-lib", features = ["hw-cuda"] } diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/cumem.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/cumem.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/cumem.rs 
rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/cumem.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/device.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/device.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/device.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/device.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/driver.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/driver.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/driver.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/driver.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/error.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/error.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/error.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/error.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/event.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/event.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/event.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/event.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/gdr.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/gdr.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/gdr.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/gdr.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/lib.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/lib.rs similarity index 92% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/lib.rs index f083d32c..7fae45ec 100644 --- a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/lib.rs +++ b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/lib.rs @@ -8,8 
+8,8 @@ //! When the feature is disabled (the default), the crate compiles to a //! near-empty shell that only exposes the `HW_CUDA_ENABLED` diagnostic //! marker. This crate is a hardware implementation layer, not a public -//! abstract API; the PegaInfer-facing trait/plan/error/handle surface lives -//! in the (future) top-level `pegainfer-comm` crate. +//! abstract API; the OpenInfer-facing trait/plan/error/handle surface lives +//! in the (future) top-level `openinfer-comm` crate. #![allow(non_snake_case)] /// Whether the `hw-cuda` feature is active in this build. Diagnostic only. diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/mem.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/mem.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/mem.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/mem.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/rt.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/rt.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/rt.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/rt.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/test_driver.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/test_driver.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/test_driver.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/test_driver.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/test_gdr.rs b/openinfer-comm/crates/openinfer-comm-cuda-lib/src/test_gdr.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-lib/src/test_gdr.rs rename to openinfer-comm/crates/openinfer-comm-cuda-lib/src/test_gdr.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-sys/Cargo.toml b/openinfer-comm/crates/openinfer-comm-cuda-sys/Cargo.toml similarity index 80% rename from 
pegainfer-comm/crates/pegainfer-comm-cuda-sys/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-cuda-sys/Cargo.toml index 207077d6..7a104a64 100644 --- a/pegainfer-comm/crates/pegainfer-comm-cuda-sys/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-cuda-sys/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2024" links = "cuda" -name = "pegainfer-comm-cuda-sys" +name = "openinfer-comm-cuda-sys" publish = false [features] @@ -14,6 +14,6 @@ default = [] system-bindings = [] [build-dependencies] -build-utils = { path = "../pegainfer-comm-build-utils", package = "pegainfer-comm-build-utils" } +build-utils = { path = "../openinfer-comm-build-utils", package = "openinfer-comm-build-utils" } bindgen = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-sys/build.rs b/openinfer-comm/crates/openinfer-comm-cuda-sys/build.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-sys/build.rs rename to openinfer-comm/crates/openinfer-comm-cuda-sys/build.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cuda-sys/src/lib.rs b/openinfer-comm/crates/openinfer-comm-cuda-sys/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cuda-sys/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-cuda-sys/src/lib.rs diff --git a/openinfer-comm/crates/openinfer-comm-cudart-sys/Cargo.toml b/openinfer-comm/crates/openinfer-comm-cudart-sys/Cargo.toml new file mode 100644 index 00000000..87a5b0ba --- /dev/null +++ b/openinfer-comm/crates/openinfer-comm-cudart-sys/Cargo.toml @@ -0,0 +1,15 @@ +[package] +edition = "2024" +links = "cudart" +name = "openinfer-comm-cudart-sys" +publish = false + +[features] +# Internal sys-crate feature. See `openinfer-comm-cuda-sys/Cargo.toml` for the rationale. 
+default = [] +system-bindings = [] + +[build-dependencies] +build-utils = { path = "../openinfer-comm-build-utils", package = "openinfer-comm-build-utils" } + +bindgen = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-cudart-sys/build.rs b/openinfer-comm/crates/openinfer-comm-cudart-sys/build.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cudart-sys/build.rs rename to openinfer-comm/crates/openinfer-comm-cudart-sys/build.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cudart-sys/src/lib.rs b/openinfer-comm/crates/openinfer-comm-cudart-sys/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cudart-sys/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-cudart-sys/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-cudart-sys/wrapper.h b/openinfer-comm/crates/openinfer-comm-cudart-sys/wrapper.h similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-cudart-sys/wrapper.h rename to openinfer-comm/crates/openinfer-comm-cudart-sys/wrapper.h diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-debug/Cargo.toml b/openinfer-comm/crates/openinfer-comm-fabric-debug/Cargo.toml similarity index 56% rename from pegainfer-comm/crates/pegainfer-comm-fabric-debug/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-fabric-debug/Cargo.toml index 4bf5bf94..883bcc55 100644 --- a/pegainfer-comm/crates/pegainfer-comm-fabric-debug/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-fabric-debug/Cargo.toml @@ -1,6 +1,6 @@ [package] edition = "2024" -name = "pegainfer-comm-fabric-debug" +name = "openinfer-comm-fabric-debug" publish = false [features] @@ -11,9 +11,9 @@ default = [] hw-rdma = ["fabric-lib/hw-rdma", "cuda-lib/hw-cuda"] [dependencies] -fabric-lib = { path = "../pegainfer-comm-fabric-lib", package = "pegainfer-comm-fabric-lib" } -cuda-lib = { path = "../pegainfer-comm-cuda-lib", package = "pegainfer-comm-cuda-lib" } -logging-lib = { path = 
"../pegainfer-comm-logging-lib", package = "pegainfer-comm-logging-lib", features = [] } +fabric-lib = { path = "../openinfer-comm-fabric-lib", package = "openinfer-comm-fabric-lib" } +cuda-lib = { path = "../openinfer-comm-cuda-lib", package = "openinfer-comm-cuda-lib" } +logging-lib = { path = "../openinfer-comm-logging-lib", package = "openinfer-comm-logging-lib", features = [] } anyhow = { workspace = true } bytes = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-debug/src/hw_rdma_impl.rs b/openinfer-comm/crates/openinfer-comm-fabric-debug/src/hw_rdma_impl.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-debug/src/hw_rdma_impl.rs rename to openinfer-comm/crates/openinfer-comm-fabric-debug/src/hw_rdma_impl.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-debug/src/main.rs b/openinfer-comm/crates/openinfer-comm-fabric-debug/src/main.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-debug/src/main.rs rename to openinfer-comm/crates/openinfer-comm-fabric-debug/src/main.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/Cargo.toml b/openinfer-comm/crates/openinfer-comm-fabric-lib/Cargo.toml similarity index 70% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-fabric-lib/Cargo.toml index 2a0116ed..ddb8ac0e 100644 --- a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-fabric-lib/Cargo.toml @@ -1,14 +1,14 @@ [package] edition = "2024" -name = "pegainfer-comm-fabric-lib" +name = "openinfer-comm-fabric-lib" publish = false [features] # Hardware backend feature: when on, the Verbs RDMA implementation is built and # the public surface (`FabricEngine`, `Worker`, `TransferEngine`, etc.) is # exposed. When off (default), this crate compiles to a near-empty shell with -# only `HW_RDMA_ENABLED`. 
The abstract PegaInfer-facing surface lives in the -# top-level `pegainfer-comm` crate, NOT here. +# only `HW_RDMA_ENABLED`. The abstract OpenInfer-facing surface lives in the +# top-level `openinfer-comm` crate, NOT here. default = [] hw-rdma = [ "libibverbs-sys/system-bindings", @@ -17,10 +17,10 @@ hw-rdma = [ tokio = ["dep:tokio", "tokio/sync", "tokio/rt-multi-thread"] [dependencies] -libibverbs-sys = { path = "../pegainfer-comm-libibverbs-sys", package = "pegainfer-comm-libibverbs-sys" } +libibverbs-sys = { path = "../openinfer-comm-libibverbs-sys", package = "openinfer-comm-libibverbs-sys" } -cuda-lib = { path = "../pegainfer-comm-cuda-lib", package = "pegainfer-comm-cuda-lib" } -thread-lib = { path = "../pegainfer-comm-thread-lib", package = "pegainfer-comm-thread-lib" } +cuda-lib = { path = "../openinfer-comm-cuda-lib", package = "openinfer-comm-cuda-lib" } +thread-lib = { path = "../openinfer-comm-thread-lib", package = "openinfer-comm-thread-lib" } anyhow = { workspace = true } bytes = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/api.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/api.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/api.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/api.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/domain_group.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/domain_group.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/domain_group.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/domain_group.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/error.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/error.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/error.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/error.rs diff --git 
a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/fabric_engine.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/fabric_engine.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/fabric_engine.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/fabric_engine.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/host_buffer.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/host_buffer.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/host_buffer.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/host_buffer.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/imm_count.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/imm_count.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/imm_count.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/imm_count.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/interface.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/interface.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/interface.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/interface.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/lib.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/mr.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/mr.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/mr.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/mr.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/provider.rs 
b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/provider.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/provider.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/provider.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/provider_dispatch.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/provider_dispatch.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/provider_dispatch.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/provider_dispatch.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/rdma_op.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/rdma_op.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/rdma_op.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/rdma_op.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/topo.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/topo.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/topo.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/topo.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/transfer_engine.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/transfer_engine.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/transfer_engine.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/transfer_engine.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/transfer_engine_builder.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/transfer_engine_builder.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/transfer_engine_builder.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/transfer_engine_builder.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/defer.rs 
b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/defer.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/defer.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/defer.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/hex.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/hex.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/hex.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/hex.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/memory.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/memory.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/memory.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/memory.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/mod.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/mod.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/mod.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/mod.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/obj_pool.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/obj_pool.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/utils/obj_pool.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/utils/obj_pool.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/mod.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/mod.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/mod.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/mod.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_address.rs 
b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_address.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_address.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_address.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_devinfo.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_devinfo.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_devinfo.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_devinfo.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_domain.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_domain.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_domain.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_domain.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_qp.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_qp.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_qp.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_qp.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_rdma_op.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_rdma_op.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/verbs/verbs_rdma_op.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/verbs/verbs_rdma_op.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/worker.rs b/openinfer-comm/crates/openinfer-comm-fabric-lib/src/worker.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-fabric-lib/src/worker.rs rename to openinfer-comm/crates/openinfer-comm-fabric-lib/src/worker.rs diff 
--git a/openinfer-comm/crates/openinfer-comm-gdrapi-sys/Cargo.toml b/openinfer-comm/crates/openinfer-comm-gdrapi-sys/Cargo.toml new file mode 100644 index 00000000..7cbe6b2d --- /dev/null +++ b/openinfer-comm/crates/openinfer-comm-gdrapi-sys/Cargo.toml @@ -0,0 +1,15 @@ +[package] +edition = "2024" +links = "gdrapi" +name = "openinfer-comm-gdrapi-sys" +publish = false + +[features] +# Internal sys-crate feature. See `openinfer-comm-cuda-sys/Cargo.toml` for the rationale. +default = [] +system-bindings = [] + +[build-dependencies] +build-utils = { path = "../openinfer-comm-build-utils", package = "openinfer-comm-build-utils" } + +bindgen = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/build.rs b/openinfer-comm/crates/openinfer-comm-gdrapi-sys/build.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/build.rs rename to openinfer-comm/crates/openinfer-comm-gdrapi-sys/build.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/src/lib.rs b/openinfer-comm/crates/openinfer-comm-gdrapi-sys/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-gdrapi-sys/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/Cargo.toml b/openinfer-comm/crates/openinfer-comm-libibverbs-sys/Cargo.toml similarity index 51% rename from pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-libibverbs-sys/Cargo.toml index 5ef5e71d..ee2dabb2 100644 --- a/pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-libibverbs-sys/Cargo.toml @@ -1,16 +1,16 @@ [package] edition = "2024" links = "ibverbs" -name = "pegainfer-comm-libibverbs-sys" +name = "openinfer-comm-libibverbs-sys" publish = false [features] -# Internal sys-crate feature. See `pegainfer-comm-cuda-sys/Cargo.toml` for the rationale. 
+# Internal sys-crate feature. See `openinfer-comm-cuda-sys/Cargo.toml` for the rationale. default = [] system-bindings = [] [build-dependencies] -build-utils = { path = "../pegainfer-comm-build-utils", package = "pegainfer-comm-build-utils" } +build-utils = { path = "../openinfer-comm-build-utils", package = "openinfer-comm-build-utils" } bindgen = { workspace = true } cc = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/build.rs b/openinfer-comm/crates/openinfer-comm-libibverbs-sys/build.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/build.rs rename to openinfer-comm/crates/openinfer-comm-libibverbs-sys/build.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/src/lib.rs b/openinfer-comm/crates/openinfer-comm-libibverbs-sys/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-libibverbs-sys/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/wrapper.h b/openinfer-comm/crates/openinfer-comm-libibverbs-sys/wrapper.h similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-libibverbs-sys/wrapper.h rename to openinfer-comm/crates/openinfer-comm-libibverbs-sys/wrapper.h diff --git a/pegainfer-comm/crates/pegainfer-comm-logging-lib/Cargo.toml b/openinfer-comm/crates/openinfer-comm-logging-lib/Cargo.toml similarity index 84% rename from pegainfer-comm/crates/pegainfer-comm-logging-lib/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-logging-lib/Cargo.toml index fc4338e2..70d96f66 100644 --- a/pegainfer-comm/crates/pegainfer-comm-logging-lib/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-logging-lib/Cargo.toml @@ -1,12 +1,12 @@ [package] edition = "2024" -name = "pegainfer-comm-logging-lib" +name = "openinfer-comm-logging-lib" publish = false [dependencies] anyhow = { workspace = true } is-terminal = { workspace = true } -# 
Inline clap rather than inherit pegainfer root's entry: this crate needs +# Inline clap rather than inherit openinfer root's entry: this crate needs # the `env` feature (#[clap(... env = "PPLX_LOG_FORMAT", ...)]) which the # root entry does not enable. Keeping the override local to this wrapper # avoids touching the root workspace.dependencies for the rest of the tree. diff --git a/pegainfer-comm/crates/pegainfer-comm-logging-lib/src/lib.rs b/openinfer-comm/crates/openinfer-comm-logging-lib/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-logging-lib/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-logging-lib/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/Cargo.toml b/openinfer-comm/crates/openinfer-comm-p2p-all-to-all/Cargo.toml similarity index 64% rename from pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-p2p-all-to-all/Cargo.toml index cb309779..22771a19 100644 --- a/pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-p2p-all-to-all/Cargo.toml @@ -1,6 +1,6 @@ [package] edition = "2024" -name = "pegainfer-comm-p2p-all-to-all" +name = "openinfer-comm-p2p-all-to-all" publish = false [features] @@ -16,10 +16,10 @@ hw-rdma = [ ] [dependencies] -a2a-kernels = { path = "../pegainfer-comm-a2a-kernels", package = "pegainfer-comm-a2a-kernels" } -cuda-lib = { path = "../pegainfer-comm-cuda-lib", package = "pegainfer-comm-cuda-lib" } -fabric-lib = { path = "../pegainfer-comm-fabric-lib", package = "pegainfer-comm-fabric-lib" } -thread-lib = { path = "../pegainfer-comm-thread-lib", package = "pegainfer-comm-thread-lib" } +a2a-kernels = { path = "../openinfer-comm-a2a-kernels", package = "openinfer-comm-a2a-kernels" } +cuda-lib = { path = "../openinfer-comm-cuda-lib", package = "openinfer-comm-cuda-lib" } +fabric-lib = { path = "../openinfer-comm-fabric-lib", package = 
"openinfer-comm-fabric-lib" } +thread-lib = { path = "../openinfer-comm-thread-lib", package = "openinfer-comm-thread-lib" } anyhow = { workspace = true } # nvtx is NVIDIA's profiling marker library and is only referenced from the diff --git a/pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/a2a_context.rs b/openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/a2a_context.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/a2a_context.rs rename to openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/a2a_context.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/a2a_handles.rs b/openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/a2a_handles.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/a2a_handles.rs rename to openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/a2a_handles.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/a2a_worker.rs b/openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/a2a_worker.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/a2a_worker.rs rename to openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/a2a_worker.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/lib.rs b/openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-p2p-all-to-all/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-p2p-all-to-all/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-proc-lib/Cargo.toml b/openinfer-comm/crates/openinfer-comm-proc-lib/Cargo.toml similarity index 80% rename from pegainfer-comm/crates/pegainfer-comm-proc-lib/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-proc-lib/Cargo.toml index 04b0fcb4..dd263852 100644 --- a/pegainfer-comm/crates/pegainfer-comm-proc-lib/Cargo.toml +++ 
b/openinfer-comm/crates/openinfer-comm-proc-lib/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-comm-proc-lib" +name = "openinfer-comm-proc-lib" edition = "2024" publish = false @@ -19,4 +19,4 @@ hw-cuda = ["dep:cudart-sys", "cudart-sys/system-bindings"] quote = "1.0" proc-macro2 = "1.0" -cudart-sys = { path = "../pegainfer-comm-cudart-sys", package = "pegainfer-comm-cudart-sys", optional = true } +cudart-sys = { path = "../openinfer-comm-cudart-sys", package = "openinfer-comm-cudart-sys", optional = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-proc-lib/src/lib.rs b/openinfer-comm/crates/openinfer-comm-proc-lib/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-proc-lib/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-proc-lib/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-python-ext/Cargo.toml b/openinfer-comm/crates/openinfer-comm-python-ext/Cargo.toml similarity index 54% rename from pegainfer-comm/crates/pegainfer-comm-python-ext/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-python-ext/Cargo.toml index ded54b06..ce5774ad 100644 --- a/pegainfer-comm/crates/pegainfer-comm-python-ext/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-python-ext/Cargo.toml @@ -1,6 +1,6 @@ [package] edition = "2024" -name = "pegainfer-comm-python-ext" +name = "openinfer-comm-python-ext" publish = false [lib] @@ -20,12 +20,12 @@ hw-rdma = [ ] [dependencies] -cuda-lib = { path = "../pegainfer-comm-cuda-lib", package = "pegainfer-comm-cuda-lib" } -fabric-lib = { path = "../pegainfer-comm-fabric-lib", package = "pegainfer-comm-fabric-lib" } -logging-lib = { path = "../pegainfer-comm-logging-lib", package = "pegainfer-comm-logging-lib" } -thread-lib = { path = "../pegainfer-comm-thread-lib", package = "pegainfer-comm-thread-lib" } -torch-lib = { path = "../pegainfer-comm-torch-lib", package = "pegainfer-comm-torch-lib" } -p2p-all-to-all = { path = "../pegainfer-comm-p2p-all-to-all", package = 
"pegainfer-comm-p2p-all-to-all" } +cuda-lib = { path = "../openinfer-comm-cuda-lib", package = "openinfer-comm-cuda-lib" } +fabric-lib = { path = "../openinfer-comm-fabric-lib", package = "openinfer-comm-fabric-lib" } +logging-lib = { path = "../openinfer-comm-logging-lib", package = "openinfer-comm-logging-lib" } +thread-lib = { path = "../openinfer-comm-thread-lib", package = "openinfer-comm-thread-lib" } +torch-lib = { path = "../openinfer-comm-torch-lib", package = "openinfer-comm-torch-lib" } +p2p-all-to-all = { path = "../openinfer-comm-p2p-all-to-all", package = "openinfer-comm-p2p-all-to-all" } bincode = { workspace = true } bytes = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-python-ext/src/lib.rs b/openinfer-comm/crates/openinfer-comm-python-ext/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-python-ext/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-python-ext/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_cumem.rs b/openinfer-comm/crates/openinfer-comm-python-ext/src/py_cumem.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_cumem.rs rename to openinfer-comm/crates/openinfer-comm-python-ext/src/py_cumem.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_device.rs b/openinfer-comm/crates/openinfer-comm-python-ext/src/py_device.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_device.rs rename to openinfer-comm/crates/openinfer-comm-python-ext/src/py_device.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_fabric_lib.rs b/openinfer-comm/crates/openinfer-comm-python-ext/src/py_fabric_lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_fabric_lib.rs rename to openinfer-comm/crates/openinfer-comm-python-ext/src/py_fabric_lib.rs diff --git 
a/pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_p2p_all_to_all.rs b/openinfer-comm/crates/openinfer-comm-python-ext/src/py_p2p_all_to_all.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-python-ext/src/py_p2p_all_to_all.rs rename to openinfer-comm/crates/openinfer-comm-python-ext/src/py_p2p_all_to_all.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-thread-lib/Cargo.toml b/openinfer-comm/crates/openinfer-comm-thread-lib/Cargo.toml similarity index 81% rename from pegainfer-comm/crates/pegainfer-comm-thread-lib/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-thread-lib/Cargo.toml index 377265a1..455169dc 100644 --- a/pegainfer-comm/crates/pegainfer-comm-thread-lib/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-thread-lib/Cargo.toml @@ -1,6 +1,6 @@ [package] edition = "2024" -name = "pegainfer-comm-thread-lib" +name = "openinfer-comm-thread-lib" publish = false [dependencies] diff --git a/pegainfer-comm/crates/pegainfer-comm-thread-lib/src/lib.rs b/openinfer-comm/crates/openinfer-comm-thread-lib/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-thread-lib/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-thread-lib/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/Cargo.toml b/openinfer-comm/crates/openinfer-comm-torch-lib/Cargo.toml similarity index 79% rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/Cargo.toml rename to openinfer-comm/crates/openinfer-comm-torch-lib/Cargo.toml index c80b76fd..6181d3a4 100644 --- a/pegainfer-comm/crates/pegainfer-comm-torch-lib/Cargo.toml +++ b/openinfer-comm/crates/openinfer-comm-torch-lib/Cargo.toml @@ -1,7 +1,7 @@ [package] edition = "2024" links = "torch-lib" -name = "pegainfer-comm-torch-lib" +name = "openinfer-comm-torch-lib" publish = false [features] @@ -14,7 +14,7 @@ hw-cuda = ["cuda-lib/hw-cuda"] [dependencies] cxx = { workspace = true } pyo3 = { workspace = true } -cuda-lib = { path = 
"../pegainfer-comm-cuda-lib", package = "pegainfer-comm-cuda-lib" } +cuda-lib = { path = "../openinfer-comm-cuda-lib", package = "openinfer-comm-cuda-lib" } [build-dependencies] cxx-build = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/build.rs b/openinfer-comm/crates/openinfer-comm-torch-lib/build.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/build.rs rename to openinfer-comm/crates/openinfer-comm-torch-lib/build.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/hw_cuda_impl.rs b/openinfer-comm/crates/openinfer-comm-torch-lib/src/hw_cuda_impl.rs similarity index 98% rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/src/hw_cuda_impl.rs rename to openinfer-comm/crates/openinfer-comm-torch-lib/src/hw_cuda_impl.rs index 9ea0e10f..dc1d33fe 100644 --- a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/hw_cuda_impl.rs +++ b/openinfer-comm/crates/openinfer-comm-torch-lib/src/hw_cuda_impl.rs @@ -49,7 +49,7 @@ mod ffi { } unsafe extern "C++" { - include!("pegainfer-comm-torch-lib/src/torch_lib.h"); + include!("openinfer-comm-torch-lib/src/torch_lib.h"); unsafe fn from_blob( data_ptr: *mut c_char, shape: &[i64], diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/lib.rs b/openinfer-comm/crates/openinfer-comm-torch-lib/src/lib.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/src/lib.rs rename to openinfer-comm/crates/openinfer-comm-torch-lib/src/lib.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/test_torch.rs b/openinfer-comm/crates/openinfer-comm-torch-lib/src/test_torch.rs similarity index 100% rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/src/test_torch.rs rename to openinfer-comm/crates/openinfer-comm-torch-lib/src/test_torch.rs diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/torch_lib.cc b/openinfer-comm/crates/openinfer-comm-torch-lib/src/torch_lib.cc similarity index 100% 
rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/src/torch_lib.cc rename to openinfer-comm/crates/openinfer-comm-torch-lib/src/torch_lib.cc diff --git a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/torch_lib.h b/openinfer-comm/crates/openinfer-comm-torch-lib/src/torch_lib.h similarity index 92% rename from pegainfer-comm/crates/pegainfer-comm-torch-lib/src/torch_lib.h rename to openinfer-comm/crates/openinfer-comm-torch-lib/src/torch_lib.h index f083a659..639d96a8 100644 --- a/pegainfer-comm/crates/pegainfer-comm-torch-lib/src/torch_lib.h +++ b/openinfer-comm/crates/openinfer-comm-torch-lib/src/torch_lib.h @@ -21,7 +21,7 @@ class TorchProfilerGuard final { } // namespace torch_lib -#include "pegainfer-comm-torch-lib/src/hw_cuda_impl.rs.h" +#include "openinfer-comm-torch-lib/src/hw_cuda_impl.rs.h" namespace torch_lib { diff --git a/pegainfer-comm/docker/dev.Dockerfile b/openinfer-comm/docker/dev.Dockerfile similarity index 100% rename from pegainfer-comm/docker/dev.Dockerfile rename to openinfer-comm/docker/dev.Dockerfile diff --git a/pegainfer-comm/pyproject.toml b/openinfer-comm/pyproject.toml similarity index 98% rename from pegainfer-comm/pyproject.toml rename to openinfer-comm/pyproject.toml index 8dd1bb12..97509020 100644 --- a/pegainfer-comm/pyproject.toml +++ b/openinfer-comm/pyproject.toml @@ -17,7 +17,7 @@ package-dir = {"" = "python"} [[tool.setuptools-rust.ext-modules]] target = "pplx_garden._rust" -path = "crates/pegainfer-comm-python-ext/Cargo.toml" +path = "crates/openinfer-comm-python-ext/Cargo.toml" binding = "PyO3" features = ["hw-rdma"] diff --git a/pegainfer-comm/python/pplx_garden/__init__.py b/openinfer-comm/python/pplx_garden/__init__.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/__init__.py rename to openinfer-comm/python/pplx_garden/__init__.py diff --git a/pegainfer-comm/python/pplx_garden/distributed/__init__.py b/openinfer-comm/python/pplx_garden/distributed/__init__.py similarity index 100% rename 
from pegainfer-comm/python/pplx_garden/distributed/__init__.py rename to openinfer-comm/python/pplx_garden/distributed/__init__.py diff --git a/pegainfer-comm/python/pplx_garden/distributed/distributed_ops.py b/openinfer-comm/python/pplx_garden/distributed/distributed_ops.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/distributed/distributed_ops.py rename to openinfer-comm/python/pplx_garden/distributed/distributed_ops.py diff --git a/pegainfer-comm/python/pplx_garden/distributed/nccl_all_reduce.py b/openinfer-comm/python/pplx_garden/distributed/nccl_all_reduce.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/distributed/nccl_all_reduce.py rename to openinfer-comm/python/pplx_garden/distributed/nccl_all_reduce.py diff --git a/pegainfer-comm/python/pplx_garden/distributed/parallel_group.py b/openinfer-comm/python/pplx_garden/distributed/parallel_group.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/distributed/parallel_group.py rename to openinfer-comm/python/pplx_garden/distributed/parallel_group.py diff --git a/pegainfer-comm/python/pplx_garden/distributed/process_group.py b/openinfer-comm/python/pplx_garden/distributed/process_group.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/distributed/process_group.py rename to openinfer-comm/python/pplx_garden/distributed/process_group.py diff --git a/pegainfer-comm/python/pplx_garden/distributed/torch_group.py b/openinfer-comm/python/pplx_garden/distributed/torch_group.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/distributed/torch_group.py rename to openinfer-comm/python/pplx_garden/distributed/torch_group.py diff --git a/pegainfer-comm/python/pplx_garden/fabric_lib.py b/openinfer-comm/python/pplx_garden/fabric_lib.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/fabric_lib.py rename to openinfer-comm/python/pplx_garden/fabric_lib.py diff --git 
a/pegainfer-comm/python/pplx_garden/fabric_lib.pyi b/openinfer-comm/python/pplx_garden/fabric_lib.pyi similarity index 100% rename from pegainfer-comm/python/pplx_garden/fabric_lib.pyi rename to openinfer-comm/python/pplx_garden/fabric_lib.pyi diff --git a/pegainfer-comm/python/pplx_garden/kernels/__init__.py b/openinfer-comm/python/pplx_garden/kernels/__init__.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/kernels/__init__.py rename to openinfer-comm/python/pplx_garden/kernels/__init__.py diff --git a/pegainfer-comm/python/pplx_garden/kernels/all_to_all.py b/openinfer-comm/python/pplx_garden/kernels/all_to_all.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/kernels/all_to_all.py rename to openinfer-comm/python/pplx_garden/kernels/all_to_all.py diff --git a/pegainfer-comm/python/pplx_garden/kernels/p2p_all_to_all.py b/openinfer-comm/python/pplx_garden/kernels/p2p_all_to_all.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/kernels/p2p_all_to_all.py rename to openinfer-comm/python/pplx_garden/kernels/p2p_all_to_all.py diff --git a/pegainfer-comm/python/pplx_garden/native/__init__.py b/openinfer-comm/python/pplx_garden/native/__init__.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/native/__init__.py rename to openinfer-comm/python/pplx_garden/native/__init__.py diff --git a/pegainfer-comm/python/pplx_garden/native/cumem.py b/openinfer-comm/python/pplx_garden/native/cumem.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/native/cumem.py rename to openinfer-comm/python/pplx_garden/native/cumem.py diff --git a/pegainfer-comm/python/pplx_garden/native/cumem.pyi b/openinfer-comm/python/pplx_garden/native/cumem.pyi similarity index 100% rename from pegainfer-comm/python/pplx_garden/native/cumem.pyi rename to openinfer-comm/python/pplx_garden/native/cumem.pyi diff --git a/pegainfer-comm/python/pplx_garden/native/p2p_all_to_all.py 
b/openinfer-comm/python/pplx_garden/native/p2p_all_to_all.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/native/p2p_all_to_all.py rename to openinfer-comm/python/pplx_garden/native/p2p_all_to_all.py diff --git a/pegainfer-comm/python/pplx_garden/native/p2p_all_to_all.pyi b/openinfer-comm/python/pplx_garden/native/p2p_all_to_all.pyi similarity index 100% rename from pegainfer-comm/python/pplx_garden/native/p2p_all_to_all.pyi rename to openinfer-comm/python/pplx_garden/native/p2p_all_to_all.pyi diff --git a/pegainfer-comm/python/pplx_garden/py.typed b/openinfer-comm/python/pplx_garden/py.typed similarity index 100% rename from pegainfer-comm/python/pplx_garden/py.typed rename to openinfer-comm/python/pplx_garden/py.typed diff --git a/pegainfer-comm/python/pplx_garden/utils/__init__.py b/openinfer-comm/python/pplx_garden/utils/__init__.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/utils/__init__.py rename to openinfer-comm/python/pplx_garden/utils/__init__.py diff --git a/pegainfer-comm/python/pplx_garden/utils/logging_utils.py b/openinfer-comm/python/pplx_garden/utils/logging_utils.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/utils/logging_utils.py rename to openinfer-comm/python/pplx_garden/utils/logging_utils.py diff --git a/pegainfer-comm/python/pplx_garden/utils/math.py b/openinfer-comm/python/pplx_garden/utils/math.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/utils/math.py rename to openinfer-comm/python/pplx_garden/utils/math.py diff --git a/pegainfer-comm/python/pplx_garden/utils/torch.py b/openinfer-comm/python/pplx_garden/utils/torch.py similarity index 100% rename from pegainfer-comm/python/pplx_garden/utils/torch.py rename to openinfer-comm/python/pplx_garden/utils/torch.py diff --git a/pegainfer-comm/rustfmt.toml b/openinfer-comm/rustfmt.toml similarity index 100% rename from pegainfer-comm/rustfmt.toml rename to openinfer-comm/rustfmt.toml diff --git 
a/pegainfer-comm/scripts/run-docker.sh b/openinfer-comm/scripts/run-docker.sh similarity index 100% rename from pegainfer-comm/scripts/run-docker.sh rename to openinfer-comm/scripts/run-docker.sh diff --git a/pegainfer-comm/src/bin/pplx_a2a_bench.rs b/openinfer-comm/src/bin/pplx_a2a_bench.rs similarity index 98% rename from pegainfer-comm/src/bin/pplx_a2a_bench.rs rename to openinfer-comm/src/bin/pplx_a2a_bench.rs index b104f0b6..15e93bde 100644 --- a/pegainfer-comm/src/bin/pplx_a2a_bench.rs +++ b/openinfer-comm/src/bin/pplx_a2a_bench.rs @@ -9,8 +9,8 @@ use anyhow::{Context, Result, ensure}; use clap::Parser; use cudarc::driver::{CudaContext, CudaSlice, CudaStream, DevicePtr, DevicePtrMut}; use half::bf16; -use pegainfer_comm::ScalarType; -use pegainfer_comm::bootstrap::{ +use openinfer_comm::ScalarType; +use openinfer_comm::bootstrap::{ EpModelShape, PplxBootstrapParams, build_intra_node_backends_for_devices, }; @@ -321,7 +321,7 @@ fn run_config(config: &BenchConfig) -> Result>> { fn run_rank( rank: usize, - mut backend: pegainfer_comm::EpBackend, + mut backend: openinfer_comm::EpBackend, config: &BenchConfig, barrier: &Barrier, ) -> Result> { @@ -442,7 +442,7 @@ fn run_rank( } fn dispatch_send( - backend: &mut pegainfer_comm::EpBackend, + backend: &mut openinfer_comm::EpBackend, num_tokens: usize, hidden: usize, topk: usize, @@ -474,7 +474,7 @@ fn dispatch_send( } fn dispatch_send_route_only( - backend: &mut pegainfer_comm::EpBackend, + backend: &mut openinfer_comm::EpBackend, num_tokens: usize, topk: usize, indices: &CudaSlice, @@ -498,7 +498,7 @@ fn dispatch_send_route_only( } fn dispatch_recv( - backend: &mut pegainfer_comm::EpBackend, + backend: &mut openinfer_comm::EpBackend, hidden: usize, recv_tokens_per_expert: &mut CudaSlice, out_x: &mut CudaSlice, @@ -521,7 +521,7 @@ fn dispatch_recv( } fn dispatch_recv_counts_only( - backend: &mut pegainfer_comm::EpBackend, + backend: &mut openinfer_comm::EpBackend, recv_tokens_per_expert: &mut CudaSlice, gpu: 
&GpuContext, ) -> Result<()> { @@ -533,7 +533,7 @@ fn dispatch_recv_counts_only( } fn combine_send( - backend: &mut pegainfer_comm::EpBackend, + backend: &mut openinfer_comm::EpBackend, hidden: usize, expert_y: &CudaSlice, gpu: &GpuContext, @@ -550,7 +550,7 @@ fn combine_send( } fn combine_recv( - backend: &mut pegainfer_comm::EpBackend, + backend: &mut openinfer_comm::EpBackend, num_tokens: usize, hidden: usize, topk: usize, diff --git a/pegainfer-comm/src/bootstrap.rs b/openinfer-comm/src/bootstrap.rs similarity index 100% rename from pegainfer-comm/src/bootstrap.rs rename to openinfer-comm/src/bootstrap.rs diff --git a/pegainfer-comm/src/bootstrap/pplx.rs b/openinfer-comm/src/bootstrap/pplx.rs similarity index 99% rename from pegainfer-comm/src/bootstrap/pplx.rs rename to openinfer-comm/src/bootstrap/pplx.rs index 69b0cc4c..a313754d 100644 --- a/pegainfer-comm/src/bootstrap/pplx.rs +++ b/openinfer-comm/src/bootstrap/pplx.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use std::thread; use anyhow::{Context, Result, bail, ensure}; -use pegainfer_core::cpu_topology::{CpuId, RankThreadPlacementPlan}; +use openinfer_core::cpu_topology::{CpuId, RankThreadPlacementPlan}; use crate::raw::cuda_lib::cumem::{ CUAllocHandle, CUMemAllocHandle, CUMemHandleKind, CUMemMapping, diff --git a/pegainfer-comm/src/ep_backend.rs b/openinfer-comm/src/ep_backend.rs similarity index 99% rename from pegainfer-comm/src/ep_backend.rs rename to openinfer-comm/src/ep_backend.rs index eba6df21..b1a526b0 100644 --- a/pegainfer-comm/src/ep_backend.rs +++ b/openinfer-comm/src/ep_backend.rs @@ -136,7 +136,7 @@ pub struct EpBackendParams { /// # Concurrency /// /// The upstream methods take `&mut self`, so [`EpBackend`] must be driven -/// from a single owning lane (PegaInfer's rank worker thread). The +/// from a single owning lane (OpenInfer's rank worker thread). The /// `AllToAllContext`'s internal worker thread handles all cross-rank /// progress; the caller does not poll. 
pub struct EpBackend { @@ -145,7 +145,7 @@ pub struct EpBackend { // `AllToAllContext` wraps an `Arc` + worker thread handle + // pre-allocated CUDA memory (all upstream types that are individually -// `Send + Sync`). The struct is moved across threads through PegaInfer's +// `Send + Sync`). The struct is moved across threads through OpenInfer's // `RankCommand::EnablePplx` channel; mark it `Send` explicitly so the // channel does not require upstream auto-traits we don't control. unsafe impl Send for EpBackend {} diff --git a/pegainfer-comm/src/error.rs b/openinfer-comm/src/error.rs similarity index 92% rename from pegainfer-comm/src/error.rs rename to openinfer-comm/src/error.rs index 9c234c91..0d4c8b38 100644 --- a/pegainfer-comm/src/error.rs +++ b/openinfer-comm/src/error.rs @@ -5,10 +5,10 @@ use std::error::Error as StdError; use thiserror::Error; -/// Result alias for `pegainfer-comm` public API. +/// Result alias for `openinfer-comm` public API. pub type Result = std::result::Result; -/// Public error type for `pegainfer-comm`. +/// Public error type for `openinfer-comm`. #[derive(Debug, Error)] pub enum Error { /// A parameter passed to a backend method was malformed or inconsistent diff --git a/pegainfer-comm/src/lib.rs b/openinfer-comm/src/lib.rs similarity index 82% rename from pegainfer-comm/src/lib.rs rename to openinfer-comm/src/lib.rs index 1de4cde3..19398449 100644 --- a/pegainfer-comm/src/lib.rs +++ b/openinfer-comm/src/lib.rs @@ -1,7 +1,7 @@ -//! PegaInfer comm-backend public surface. +//! OpenInfer comm-backend public surface. //! //! [`EpBackend`] wraps the upstream `pplx-garden` NVLink + RDMA all-to-all -//! context with a thin Rust surface tailored for PegaInfer's MoE call sites: +//! context with a thin Rust surface tailored for OpenInfer's MoE call sites: //! `dispatch_send / dispatch_recv / combine_send / combine_recv`, kept //! separate so callers can overlap host-side compute between send and recv. 
@@ -20,7 +20,7 @@ pub use ep_backend::{ pub mod bootstrap; /// Re-exports of the underlying `pplx-garden` building blocks. Available -/// so PegaInfer-side bootstrap code can build `EpBackendParams` without +/// so OpenInfer-side bootstrap code can build `EpBackendParams` without /// taking direct dependencies on the vendored crates. pub mod raw { pub use cuda_lib; diff --git a/pegainfer-comm/tests/__init__.py b/openinfer-comm/tests/__init__.py similarity index 100% rename from pegainfer-comm/tests/__init__.py rename to openinfer-comm/tests/__init__.py diff --git a/pegainfer-comm/tests/fabric.py b/openinfer-comm/tests/fabric.py similarity index 100% rename from pegainfer-comm/tests/fabric.py rename to openinfer-comm/tests/fabric.py diff --git a/pegainfer-comm/tests/fabric_lib/__init__.py b/openinfer-comm/tests/fabric_lib/__init__.py similarity index 100% rename from pegainfer-comm/tests/fabric_lib/__init__.py rename to openinfer-comm/tests/fabric_lib/__init__.py diff --git a/pegainfer-comm/tests/fabric_lib/test_handle.py b/openinfer-comm/tests/fabric_lib/test_handle.py similarity index 100% rename from pegainfer-comm/tests/fabric_lib/test_handle.py rename to openinfer-comm/tests/fabric_lib/test_handle.py diff --git a/pegainfer-comm/tests/fabric_lib/test_transfer_engine.py b/openinfer-comm/tests/fabric_lib/test_transfer_engine.py similarity index 100% rename from pegainfer-comm/tests/fabric_lib/test_transfer_engine.py rename to openinfer-comm/tests/fabric_lib/test_transfer_engine.py diff --git a/pegainfer-comm/tests/fabric_lib/test_types.py b/openinfer-comm/tests/fabric_lib/test_types.py similarity index 100% rename from pegainfer-comm/tests/fabric_lib/test_types.py rename to openinfer-comm/tests/fabric_lib/test_types.py diff --git a/pegainfer-comm/tests/markers.py b/openinfer-comm/tests/markers.py similarity index 100% rename from pegainfer-comm/tests/markers.py rename to openinfer-comm/tests/markers.py diff --git a/pegainfer-comm/tests/p2p_all_to_all/__init__.py 
b/openinfer-comm/tests/p2p_all_to_all/__init__.py similarity index 100% rename from pegainfer-comm/tests/p2p_all_to_all/__init__.py rename to openinfer-comm/tests/p2p_all_to_all/__init__.py diff --git a/pegainfer-comm/tests/p2p_all_to_all/data.py b/openinfer-comm/tests/p2p_all_to_all/data.py similarity index 100% rename from pegainfer-comm/tests/p2p_all_to_all/data.py rename to openinfer-comm/tests/p2p_all_to_all/data.py diff --git a/pegainfer-comm/tests/p2p_all_to_all/test_p2p_all_to_all.py b/openinfer-comm/tests/p2p_all_to_all/test_p2p_all_to_all.py similarity index 100% rename from pegainfer-comm/tests/p2p_all_to_all/test_p2p_all_to_all.py rename to openinfer-comm/tests/p2p_all_to_all/test_p2p_all_to_all.py diff --git a/pegainfer-comm/tests/pplx_roundtrip.rs b/openinfer-comm/tests/pplx_roundtrip.rs similarity index 99% rename from pegainfer-comm/tests/pplx_roundtrip.rs rename to openinfer-comm/tests/pplx_roundtrip.rs index 490bb881..584632f8 100644 --- a/pegainfer-comm/tests/pplx_roundtrip.rs +++ b/openinfer-comm/tests/pplx_roundtrip.rs @@ -4,7 +4,7 @@ //! slot on the destination rank, and that the combine path aggregates it back. //! //! Requires 8 GPUs with NVLink + RDMA. Run with: -//! cargo test -p pegainfer-comm --test pplx_roundtrip -- --nocapture +//! 
cargo test -p openinfer-comm --test pplx_roundtrip -- --nocapture use std::ffi::c_void; use std::ptr; @@ -13,8 +13,8 @@ use std::thread; use cudarc::driver::{CudaContext, DevicePtr, DevicePtrMut}; use half::bf16; -use pegainfer_comm::ScalarType; -use pegainfer_comm::bootstrap::{ +use openinfer_comm::ScalarType; +use openinfer_comm::bootstrap::{ EpModelShape, PplxBootstrapParams, build_intra_node_backends_for_devices, }; diff --git a/pegainfer-core/Cargo.toml b/openinfer-core/Cargo.toml similarity index 83% rename from pegainfer-core/Cargo.toml rename to openinfer-core/Cargo.toml index 2372079d..5f22f2d2 100644 --- a/pegainfer-core/Cargo.toml +++ b/openinfer-core/Cargo.toml @@ -1,11 +1,11 @@ [package] -name = "pegainfer-core" +name = "openinfer-core" version = "0.1.0" edition = "2024" [dependencies] -pegainfer-engine = { workspace = true } -pegainfer-kernels = { workspace = true } +openinfer-engine = { workspace = true } +openinfer-kernels = { workspace = true } anyhow = { workspace = true } colored = { workspace = true } cudarc = { workspace = true } diff --git a/pegainfer-core/src/cpu_topology.rs b/openinfer-core/src/cpu_topology.rs similarity index 100% rename from pegainfer-core/src/cpu_topology.rs rename to openinfer-core/src/cpu_topology.rs diff --git a/pegainfer-core/src/cuda_graph.rs b/openinfer-core/src/cuda_graph.rs similarity index 100% rename from pegainfer-core/src/cuda_graph.rs rename to openinfer-core/src/cuda_graph.rs diff --git a/openinfer-core/src/engine.rs b/openinfer-core/src/engine.rs new file mode 100644 index 00000000..4b2ddc06 --- /dev/null +++ b/openinfer-core/src/engine.rs @@ -0,0 +1 @@ +pub use openinfer_engine::engine::*; diff --git a/openinfer-core/src/ffi.rs b/openinfer-core/src/ffi.rs new file mode 100644 index 00000000..0f756fc1 --- /dev/null +++ b/openinfer-core/src/ffi.rs @@ -0,0 +1 @@ +pub use openinfer_kernels::ffi::*; diff --git a/pegainfer-core/src/kv_cache.rs b/openinfer-core/src/kv_cache.rs similarity index 100% rename from 
pegainfer-core/src/kv_cache.rs rename to openinfer-core/src/kv_cache.rs diff --git a/pegainfer-core/src/kv_pool.rs b/openinfer-core/src/kv_pool.rs similarity index 98% rename from pegainfer-core/src/kv_pool.rs rename to openinfer-core/src/kv_pool.rs index 3a3649ef..f1e170fa 100644 --- a/pegainfer-core/src/kv_pool.rs +++ b/openinfer-core/src/kv_pool.rs @@ -40,8 +40,8 @@ impl KvLayout { } } - pub fn kernel_layout(&self) -> pegainfer_kernels::paged_kv::PagedKvLayout { - pegainfer_kernels::paged_kv::PagedKvLayout { + pub fn kernel_layout(&self) -> openinfer_kernels::paged_kv::PagedKvLayout { + openinfer_kernels::paged_kv::PagedKvLayout { page_size: self.page_size, num_layers: self.num_layers, num_kv_heads: self.num_kv_heads, diff --git a/pegainfer-core/src/lib.rs b/openinfer-core/src/lib.rs similarity index 80% rename from pegainfer-core/src/lib.rs rename to openinfer-core/src/lib.rs index 7dfbd718..30045898 100644 --- a/pegainfer-core/src/lib.rs +++ b/openinfer-core/src/lib.rs @@ -1,4 +1,4 @@ -//! Shared runtime API used by pegainfer model crates. +//! Shared runtime API used by openinfer model crates. 
pub mod cpu_topology; pub mod cuda_graph; diff --git a/pegainfer-core/src/logging.rs b/openinfer-core/src/logging.rs similarity index 100% rename from pegainfer-core/src/logging.rs rename to openinfer-core/src/logging.rs diff --git a/pegainfer-core/src/ops.rs b/openinfer-core/src/ops.rs similarity index 96% rename from pegainfer-core/src/ops.rs rename to openinfer-core/src/ops.rs index 79780de7..b54379aa 100644 --- a/pegainfer-core/src/ops.rs +++ b/openinfer-core/src/ops.rs @@ -13,8 +13,7 @@ pub use attention::{ paged_attention_batch_decode_hd256_into, paged_attention_batch_decode_into, paged_attention_batch_decode_split_kv_into, prefill_attention_paged_into, }; -pub use paged_plan::PrefillPagedPlan; -pub use pegainfer_kernels::ops::{ +pub use openinfer_kernels::ops::{ LoraDecodeGroupedProjection, accumulate_bf16_token_scaled_to_f32_into, add_batch, add_batch_into, bf16_hidden_to_f32_into, embedding_decode_into, extract_vec, extract_vec_into, f32_to_bf16_hidden_into, fused_add_rms_norm_into, gather_hidden_tokens_into, gemm, @@ -26,11 +25,12 @@ pub use pegainfer_kernels::ops::{ scaled_add_rows_token_range_into, silu_mul_batch, silu_mul_batch_into, write_vec_into, }; #[cfg(not(feature = "kernel-call-trace"))] -pub use pegainfer_kernels::ops::{ +pub use openinfer_kernels::ops::{ embedding_batch, fused_add_rms_norm_batch_into, gemm_into, gemm_rows_into, gemm_token_range_into_checked, qk_norm_rope_batch_decode_into, rms_norm_batch_into, silu_mul_fused_batch_into, }; +pub use paged_plan::PrefillPagedPlan; pub use sampling::{ argmax, argmax_batch_bf16_indexed_into, argmax_batch_bf16_into, flashinfer_topk_row_states_bytes, gpu_sample, gpu_sample_into, select_batch_tokens_into, diff --git a/pegainfer-core/src/ops/attention.rs b/openinfer-core/src/ops/attention.rs similarity index 95% rename from pegainfer-core/src/ops/attention.rs rename to openinfer-core/src/ops/attention.rs index 77852cd0..9b5d795b 100644 --- a/pegainfer-core/src/ops/attention.rs +++ 
b/openinfer-core/src/ops/attention.rs @@ -30,7 +30,7 @@ pub fn prefill_attention_paged_into( head_dim: usize, rms_eps: f32, ) -> Result<()> { - pegainfer_kernels::ops::prefill_attention_paged_into( + openinfer_kernels::ops::prefill_attention_paged_into( ctx, q_batch, k_batch, @@ -85,7 +85,7 @@ pub fn paged_attention_batch_decode_into( "non_partition", )); } - pegainfer_kernels::ops::paged_attention_batch_decode_into( + openinfer_kernels::ops::paged_attention_batch_decode_into( ctx, q, k, @@ -146,7 +146,7 @@ pub fn paged_attention_batch_decode_split_kv_into( "split_kv_256x64", )); } - pegainfer_kernels::ops::paged_attention_batch_decode_split_kv_into( + openinfer_kernels::ops::paged_attention_batch_decode_split_kv_into( ctx, q, k, @@ -193,7 +193,7 @@ pub fn paged_attention_batch_decode_hd256_into( num_qo_heads: usize, batch_size: usize, ) -> Result<()> { - pegainfer_kernels::ops::paged_attention_batch_decode_hd256_into( + openinfer_kernels::ops::paged_attention_batch_decode_hd256_into( ctx, q, k, diff --git a/pegainfer-core/src/ops/call_spec.rs b/openinfer-core/src/ops/call_spec.rs similarity index 98% rename from pegainfer-core/src/ops/call_spec.rs rename to openinfer-core/src/ops/call_spec.rs index c5013176..b4a6d47a 100644 --- a/pegainfer-core/src/ops/call_spec.rs +++ b/openinfer-core/src/ops/call_spec.rs @@ -1,4 +1,4 @@ -use pegainfer_kernels::tensor::{ +use openinfer_kernels::tensor::{ AxisSpec, AxisTag, Batch, BatchPlusOne, Bf16, Contiguous1D, F32, HeadDim, Hidden, HiddenStatesLayout, I32, InDim, Inter2, Intermediate, KernelCall, Kv, KvDim, KvHead, Layer, OutDim, OutTotal, Page, PageSlot, PagedKvPageFirst, PosInPage, QDim, RopeDim, RowMajor2D, Seq, @@ -209,7 +209,7 @@ pub fn weight_matrix_total(out_total: usize, in_dim: usize) -> TensorSpec { ]) } -pub fn vector(dim: usize) -> TensorSpec { +pub fn vector(dim: usize) -> TensorSpec { TensorSpec::new::([AxisSpec::new::(dim)]) } @@ -247,5 +247,5 @@ pub fn meta_i32(size: usize) -> TensorSpec { } pub fn 
meta_u8(size: usize) -> TensorSpec { - TensorSpec::new::([AxisSpec::new::(size)]) + TensorSpec::new::([AxisSpec::new::(size)]) } diff --git a/pegainfer-core/src/ops/call_trace.rs b/openinfer-core/src/ops/call_trace.rs similarity index 98% rename from pegainfer-core/src/ops/call_trace.rs rename to openinfer-core/src/ops/call_trace.rs index 5467be3f..a3850fc1 100644 --- a/pegainfer-core/src/ops/call_trace.rs +++ b/openinfer-core/src/ops/call_trace.rs @@ -2,7 +2,7 @@ use std::cell::{Cell, RefCell}; use std::sync::{Mutex, OnceLock}; use anyhow::Result; -use pegainfer_kernels::tensor::KernelCall; +use openinfer_kernels::tensor::KernelCall; thread_local! { static TRACE: RefCell>> = const { RefCell::new(None) }; diff --git a/pegainfer-core/src/ops/paged_plan.rs b/openinfer-core/src/ops/paged_plan.rs similarity index 94% rename from pegainfer-core/src/ops/paged_plan.rs rename to openinfer-core/src/ops/paged_plan.rs index adeadb30..1d162951 100644 --- a/pegainfer-core/src/ops/paged_plan.rs +++ b/openinfer-core/src/ops/paged_plan.rs @@ -7,7 +7,7 @@ use crate::kv_pool::KvDesc; use crate::tensor::DeviceContext; pub struct PrefillPagedPlan { - inner: pegainfer_kernels::ops::PrefillPagedPlan, + inner: openinfer_kernels::ops::PrefillPagedPlan, } impl PrefillPagedPlan { @@ -49,7 +49,7 @@ impl PrefillPagedPlan { .map(|p| p.index() as i32) .collect(); Ok(Self { - inner: pegainfer_kernels::ops::PrefillPagedPlan::new_with_cta_tile_q( + inner: openinfer_kernels::ops::PrefillPagedPlan::new_with_cta_tile_q( ctx, &page_indices, desc.last_page_len(), @@ -106,7 +106,7 @@ impl PrefillPagedPlan { .collect(); let last_page_lens: Vec = descs.iter().map(KvDesc::last_page_len).collect(); Ok(Self { - inner: pegainfer_kernels::ops::PrefillPagedPlan::new_batch_with_cta_tile_q( + inner: openinfer_kernels::ops::PrefillPagedPlan::new_batch_with_cta_tile_q( ctx, &page_indices, &last_page_lens, @@ -133,7 +133,7 @@ impl PrefillPagedPlan { cta_tile_q_override: i32, ) -> Result { Ok(Self { - inner: 
pegainfer_kernels::ops::PrefillPagedPlan::new_batch_with_cta_tile_q( + inner: openinfer_kernels::ops::PrefillPagedPlan::new_batch_with_cta_tile_q( ctx, page_indices, last_page_lens, @@ -189,7 +189,7 @@ impl PrefillPagedPlan { } impl Deref for PrefillPagedPlan { - type Target = pegainfer_kernels::ops::PrefillPagedPlan; + type Target = openinfer_kernels::ops::PrefillPagedPlan; fn deref(&self) -> &Self::Target { &self.inner diff --git a/pegainfer-core/src/ops/sampling.rs b/openinfer-core/src/ops/sampling.rs similarity index 95% rename from pegainfer-core/src/ops/sampling.rs rename to openinfer-core/src/ops/sampling.rs index 31231120..c31fa677 100644 --- a/pegainfer-core/src/ops/sampling.rs +++ b/openinfer-core/src/ops/sampling.rs @@ -4,7 +4,7 @@ use cudarc::driver::CudaSlice; use crate::sampler::SamplingParams; use crate::tensor::{DeviceContext, DeviceVec, HiddenStates}; -pub use pegainfer_kernels::ops::{ +pub use openinfer_kernels::ops::{ argmax, argmax_batch_bf16_indexed_into, argmax_batch_bf16_into, flashinfer_topk_row_states_bytes, }; @@ -22,7 +22,7 @@ pub fn gpu_sample( params: &SamplingParams, random_val: f32, ) -> Result { - pegainfer_kernels::ops::gpu_sample( + openinfer_kernels::ops::gpu_sample( ctx, logits, probs_scratch, @@ -47,7 +47,7 @@ pub fn gpu_sample_into( params: &SamplingParams, random_val: f32, ) -> Result { - pegainfer_kernels::ops::gpu_sample_into( + openinfer_kernels::ops::gpu_sample_into( ctx, logits, probs_scratch, @@ -127,7 +127,7 @@ pub fn select_batch_tokens_into( if params_i.is_greedy() { continue; } - let logits_i = pegainfer_kernels::ops::extract_vec(ctx, logits, i)?; + let logits_i = openinfer_kernels::ops::extract_vec(ctx, logits, i)?; tokens[i] = gpu_sample_into( ctx, &logits_i, diff --git a/pegainfer-core/src/ops/traced.rs b/openinfer-core/src/ops/traced.rs similarity index 89% rename from pegainfer-core/src/ops/traced.rs rename to openinfer-core/src/ops/traced.rs index 23317c63..7b0384df 100644 --- a/pegainfer-core/src/ops/traced.rs 
+++ b/openinfer-core/src/ops/traced.rs @@ -7,7 +7,7 @@ use crate::ops::call_spec::{ }; use crate::ops::call_trace; use crate::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; -use pegainfer_kernels::tensor::{Hidden, InDim, OutDim}; +use openinfer_kernels::tensor::{Hidden, InDim, OutDim}; pub fn embedding_batch( ctx: &DeviceContext, @@ -24,7 +24,7 @@ pub fn embedding_batch( out.seq_len, )); } - pegainfer_kernels::ops::embedding_batch(ctx, embed, token_ids_gpu, out) + openinfer_kernels::ops::embedding_batch(ctx, embed, token_ids_gpu, out) } pub fn rms_norm_batch_into( @@ -43,7 +43,7 @@ pub fn rms_norm_batch_into( eps, )); } - pegainfer_kernels::ops::rms_norm_batch_into(ctx, x, weight, eps, out); + openinfer_kernels::ops::rms_norm_batch_into(ctx, x, weight, eps, out); } pub fn gemm_rows_into( @@ -65,7 +65,7 @@ pub fn gemm_rows_into( x.seq_len, )); } - pegainfer_kernels::ops::gemm_rows_into(ctx, weight, row_offset, num_rows, x, out); + openinfer_kernels::ops::gemm_rows_into(ctx, weight, row_offset, num_rows, x, out); } pub fn gemm_into( @@ -83,7 +83,7 @@ pub fn gemm_into( x.seq_len, )); } - pegainfer_kernels::ops::gemm_into(ctx, weight, x, out); + openinfer_kernels::ops::gemm_into(ctx, weight, x, out); } pub fn gemm_token_range_into_checked( @@ -102,7 +102,7 @@ pub fn gemm_token_range_into_checked( out.seq_len, )); } - pegainfer_kernels::ops::gemm_token_range_into_checked(ctx, weight, x, token_offset, out) + openinfer_kernels::ops::gemm_token_range_into_checked(ctx, weight, x, token_offset, out) } pub fn qk_norm_rope_batch_decode_into( @@ -134,7 +134,7 @@ pub fn qk_norm_rope_batch_decode_into( rms_eps, )); } - pegainfer_kernels::ops::qk_norm_rope_batch_decode_into( + openinfer_kernels::ops::qk_norm_rope_batch_decode_into( ctx, q, k, @@ -167,7 +167,7 @@ pub fn fused_add_rms_norm_batch_into( eps, )); } - pegainfer_kernels::ops::fused_add_rms_norm_batch_into(ctx, hidden, residual, weight, eps, out); + 
openinfer_kernels::ops::fused_add_rms_norm_batch_into(ctx, hidden, residual, weight, eps, out); } pub fn silu_mul_fused_batch_into( @@ -183,7 +183,7 @@ pub fn silu_mul_fused_batch_into( gate_up.seq_len, )); } - pegainfer_kernels::ops::silu_mul_fused_batch_into(ctx, gate_up, out); + openinfer_kernels::ops::silu_mul_fused_batch_into(ctx, gate_up, out); } pub(crate) fn paged_decode_call_spec( @@ -195,7 +195,7 @@ pub(crate) fn paged_decode_call_spec( num_q_heads: usize, batch_size: usize, variant: &'static str, -) -> pegainfer_kernels::tensor::KernelCall { +) -> openinfer_kernels::tensor::KernelCall { call_spec::paged_decode_attention_call( label, PagedDecodeCallSpec { diff --git a/pegainfer-core/src/page_pool.rs b/openinfer-core/src/page_pool.rs similarity index 100% rename from pegainfer-core/src/page_pool.rs rename to openinfer-core/src/page_pool.rs diff --git a/openinfer-core/src/parallel.rs b/openinfer-core/src/parallel.rs new file mode 100644 index 00000000..5cb939cc --- /dev/null +++ b/openinfer-core/src/parallel.rs @@ -0,0 +1 @@ +pub use openinfer_engine::parallel::*; diff --git a/openinfer-core/src/sampler.rs b/openinfer-core/src/sampler.rs new file mode 100644 index 00000000..6d659457 --- /dev/null +++ b/openinfer-core/src/sampler.rs @@ -0,0 +1 @@ +pub use openinfer_engine::sampler::*; diff --git a/openinfer-core/src/tensor.rs b/openinfer-core/src/tensor.rs new file mode 100644 index 00000000..ec673b3f --- /dev/null +++ b/openinfer-core/src/tensor.rs @@ -0,0 +1 @@ +pub use openinfer_kernels::tensor::*; diff --git a/pegainfer-core/src/weight_loader.rs b/openinfer-core/src/weight_loader.rs similarity index 100% rename from pegainfer-core/src/weight_loader.rs rename to openinfer-core/src/weight_loader.rs diff --git a/pegainfer-cupti/Cargo.toml b/openinfer-cupti/Cargo.toml similarity index 82% rename from pegainfer-cupti/Cargo.toml rename to openinfer-cupti/Cargo.toml index 19682f62..a56a1be6 100644 --- a/pegainfer-cupti/Cargo.toml +++ 
b/openinfer-cupti/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-cupti" +name = "openinfer-cupti" version = "0.1.0" edition = "2024" diff --git a/pegainfer-cupti/build.rs b/openinfer-cupti/build.rs similarity index 96% rename from pegainfer-cupti/build.rs rename to openinfer-cupti/build.rs index 397a97a9..e93375fd 100644 --- a/pegainfer-cupti/build.rs +++ b/openinfer-cupti/build.rs @@ -28,7 +28,7 @@ fn main() { add_if_exists(&mut build, &cuda_include); add_if_exists(&mut build, &cuda_target_include); add_if_exists(&mut build, &cupti_include); - build.compile("pegainfer_cupti_range_profiler"); + build.compile("openinfer_cupti_range_profiler"); let cuda_lib64 = cuda_root.join("lib64"); let cuda_target_lib = cuda_root.join("targets/x86_64-linux/lib"); diff --git a/pegainfer-cupti/csrc/range_profiler.cpp b/openinfer-cupti/csrc/range_profiler.cpp similarity index 99% rename from pegainfer-cupti/csrc/range_profiler.cpp rename to openinfer-cupti/csrc/range_profiler.cpp index 4d7a0a19..22f4affe 100644 --- a/pegainfer-cupti/csrc/range_profiler.cpp +++ b/openinfer-cupti/csrc/range_profiler.cpp @@ -355,7 +355,7 @@ void evaluate(CUpti_Profiler_Host_Object *host, std::vector &counter_da } // namespace -extern "C" int pegainfer_cupti_profile_range( +extern "C" int openinfer_cupti_profile_range( CUcontext context, size_t device_index, const char *range_name, const char **metric_names, size_t metric_count, PegaCuptiCallback prepare_fn, PegaCuptiCallback launch_fn, void *userdata, double *metric_values, diff --git a/pegainfer-cupti/src/lib.rs b/openinfer-cupti/src/lib.rs similarity index 98% rename from pegainfer-cupti/src/lib.rs rename to openinfer-cupti/src/lib.rs index 7b9e2480..507bc81d 100644 --- a/pegainfer-cupti/src/lib.rs +++ b/openinfer-cupti/src/lib.rs @@ -4,7 +4,7 @@ type CuContext = *mut c_void; type CuptiCallback = Option c_int>; unsafe extern "C" { - fn pegainfer_cupti_profile_range( + fn openinfer_cupti_profile_range( context: CuContext, device_index: usize, 
range_name: *const c_char, @@ -107,7 +107,7 @@ pub unsafe fn profile_range_with_prepare<'a>( }; let status = unsafe { - pegainfer_cupti_profile_range( + openinfer_cupti_profile_range( context, device_index, range_name.as_ptr(), diff --git a/pegainfer-deepseek-v2-lite/Cargo.toml b/openinfer-deepseek-v2-lite/Cargo.toml similarity index 87% rename from pegainfer-deepseek-v2-lite/Cargo.toml rename to openinfer-deepseek-v2-lite/Cargo.toml index a1f81b1e..d1926f23 100644 --- a/pegainfer-deepseek-v2-lite/Cargo.toml +++ b/openinfer-deepseek-v2-lite/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-deepseek-v2-lite" +name = "openinfer-deepseek-v2-lite" version = "0.1.0" edition = "2024" autobenches = false @@ -18,8 +18,8 @@ hex = { workspace = true } libloading = "0.9" memmap2 = { workspace = true } nvtx = { workspace = true } -pegainfer-core = { workspace = true } -pegainfer-engine = { workspace = true } +openinfer-core = { workspace = true } +openinfer-engine = { workspace = true } safetensors = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/pegainfer-deepseek-v2-lite/src/attribution.rs b/openinfer-deepseek-v2-lite/src/attribution.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/attribution.rs rename to openinfer-deepseek-v2-lite/src/attribution.rs index 49b00598..470587f3 100644 --- a/pegainfer-deepseek-v2-lite/src/attribution.rs +++ b/openinfer-deepseek-v2-lite/src/attribution.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Result; use cudarc::driver::sys; -use pegainfer_core::tensor::DeviceContext; +use openinfer_core::tensor::DeviceContext; use serde::Serialize; #[derive(Clone, Debug, Default)] @@ -449,7 +449,7 @@ fn micros(duration: Duration) -> u64 { } fn nvtx_enabled_from_env() -> bool { - nvtx_enabled_value(env::var("PEGAINFER_DSV2_LITE_NVTX").ok().as_deref()) + nvtx_enabled_value(env::var("OPENINFER_DSV2_LITE_NVTX").ok().as_deref()) } fn nvtx_enabled_value(value: Option<&str>) -> bool { diff 
--git a/pegainfer-deepseek-v2-lite/src/bin/dsv2_lite_ep2_decode_attribution.rs b/openinfer-deepseek-v2-lite/src/bin/dsv2_lite_ep2_decode_attribution.rs similarity index 96% rename from pegainfer-deepseek-v2-lite/src/bin/dsv2_lite_ep2_decode_attribution.rs rename to openinfer-deepseek-v2-lite/src/bin/dsv2_lite_ep2_decode_attribution.rs index 8ce75079..5734fec1 100644 --- a/pegainfer-deepseek-v2-lite/src/bin/dsv2_lite_ep2_decode_attribution.rs +++ b/openinfer-deepseek-v2-lite/src/bin/dsv2_lite_ep2_decode_attribution.rs @@ -4,8 +4,8 @@ use std::{ }; use anyhow::{Context, Result, bail, ensure}; -use pegainfer_deepseek_v2_lite::{DecodeGraphReadinessReport, DeepSeekV2LiteEp2Generator}; -use pegainfer_engine::engine::EngineLoadOptions; +use openinfer_deepseek_v2_lite::{DecodeGraphReadinessReport, DeepSeekV2LiteEp2Generator}; +use openinfer_engine::engine::EngineLoadOptions; use serde_json::{Value, json}; use sha2::{Digest, Sha256}; use vllm_text::tokenizer::{HuggingFaceTokenizer, Tokenizer}; @@ -116,8 +116,8 @@ fn main() -> Result<()> { fn single_report( tokenizer: &HuggingFaceTokenizer, prompt_tokens: &[u32], - result: &pegainfer_deepseek_v2_lite::GenerationResult, - attribution: &pegainfer_deepseek_v2_lite::DecodeAttributionProfile, + result: &openinfer_deepseek_v2_lite::GenerationResult, + attribution: &openinfer_deepseek_v2_lite::DecodeAttributionProfile, graph_readiness: &DecodeGraphReadinessReport, ) -> Result { let generated_text = tokenizer @@ -197,8 +197,8 @@ fn single_report( fn batch_report( tokenizer: &HuggingFaceTokenizer, prompt_tokens: &[u32], - result: &pegainfer_deepseek_v2_lite::BatchedGenerationResult, - attribution: &pegainfer_deepseek_v2_lite::DecodeAttributionProfile, + result: &openinfer_deepseek_v2_lite::BatchedGenerationResult, + attribution: &openinfer_deepseek_v2_lite::DecodeAttributionProfile, graph_readiness: &DecodeGraphReadinessReport, ) -> Result { ensure!( @@ -333,7 +333,7 @@ fn parse_cli() -> Result> { } "-h" | "--help" => { println!( - 
"DeepSeek-V2-Lite EP2 decode attribution gate\n\nUSAGE:\n dsv2_lite_ep2_decode_attribution [--model-path PATH] [--batch-size N] [--nccl-graph-smoke] [--out PATH]\n\nThe gate is intentionally fixed to prompt=Hello, output_len=16, with batch-size in 1..=8. Select NCCL with PEGAINFER_DSV2_LITE_EP_BACKEND=nccl. Use --nccl-graph-smoke to run a preallocated f32 NCCL all-reduce CUDA Graph capture/replay smoke after attribution." + "DeepSeek-V2-Lite EP2 decode attribution gate\n\nUSAGE:\n dsv2_lite_ep2_decode_attribution [--model-path PATH] [--batch-size N] [--nccl-graph-smoke] [--out PATH]\n\nThe gate is intentionally fixed to prompt=Hello, output_len=16, with batch-size in 1..=8. Select NCCL with OPENINFER_DSV2_LITE_EP_BACKEND=nccl. Use --nccl-graph-smoke to run a preallocated f32 NCCL all-reduce CUDA Graph capture/replay smoke after attribution." ); return Ok(None); } @@ -350,7 +350,7 @@ fn parse_cli() -> Result> { })) } -fn ep_report(stats: &pegainfer_deepseek_v2_lite::GenerationStats) -> Value { +fn ep_report(stats: &openinfer_deepseek_v2_lite::GenerationStats) -> Value { let (local_route_count, remote_route_count) = match stats.ep_backend.as_str() { "host-staged" => ( stats.host_dispatch_local_routes, @@ -381,7 +381,7 @@ fn ep_report(stats: &pegainfer_deepseek_v2_lite::GenerationStats) -> Value { }) } -fn by_op_rows(rows: &[pegainfer_deepseek_v2_lite::SectionRollup]) -> Vec { +fn by_op_rows(rows: &[openinfer_deepseek_v2_lite::SectionRollup]) -> Vec { rows.iter() .map(|row| { json!({ @@ -403,7 +403,7 @@ fn by_op_rows(rows: &[pegainfer_deepseek_v2_lite::SectionRollup]) -> Vec fn coverage_rows( backend: &str, batch_size: usize, - attribution: &pegainfer_deepseek_v2_lite::DecodeAttributionProfile, + attribution: &openinfer_deepseek_v2_lite::DecodeAttributionProfile, graph_readiness: &DecodeGraphReadinessReport, ) -> Vec { vec![ @@ -430,7 +430,7 @@ fn coverage_rows( json!({ "item": "nvtx_ranges", "status": if attribution.nvtx_enabled() { "emitted" } else { 
"available_when_enabled" }, - "source": "set PEGAINFER_DSV2_LITE_NVTX=1 to emit NVTX ranges for the same selected GPU/NCCL attribution sections", + "source": "set OPENINFER_DSV2_LITE_NVTX=1 to emit NVTX ranges for the same selected GPU/NCCL attribution sections", }), json!({ "item": "throughput_or_production_ep_readiness", @@ -450,7 +450,7 @@ fn coverage_rows( ] } -fn gpu_timing_status(attribution: &pegainfer_deepseek_v2_lite::DecodeAttributionProfile) -> &str { +fn gpu_timing_status(attribution: &openinfer_deepseek_v2_lite::DecodeAttributionProfile) -> &str { match ( attribution.gpu_sample_count(), attribution.gpu_timing_failure_count(), diff --git a/pegainfer-deepseek-v2-lite/src/config.rs b/openinfer-deepseek-v2-lite/src/config.rs similarity index 100% rename from pegainfer-deepseek-v2-lite/src/config.rs rename to openinfer-deepseek-v2-lite/src/config.rs diff --git a/pegainfer-deepseek-v2-lite/src/device.rs b/openinfer-deepseek-v2-lite/src/device.rs similarity index 93% rename from pegainfer-deepseek-v2-lite/src/device.rs rename to openinfer-deepseek-v2-lite/src/device.rs index a337716e..7b6d584a 100644 --- a/pegainfer-deepseek-v2-lite/src/device.rs +++ b/openinfer-deepseek-v2-lite/src/device.rs @@ -1,7 +1,7 @@ use std::cell::Cell; use anyhow::{Result, ensure}; -use pegainfer_core::{ffi, tensor::DeviceContext}; +use openinfer_core::{ffi, tensor::DeviceContext}; thread_local! 
{ static ACTIVE_DEVICE: Cell> = const { Cell::new(None) }; diff --git a/pegainfer-deepseek-v2-lite/src/engine.rs b/openinfer-deepseek-v2-lite/src/engine.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/engine.rs rename to openinfer-deepseek-v2-lite/src/engine.rs index b0917aa8..08360ed2 100644 --- a/pegainfer-deepseek-v2-lite/src/engine.rs +++ b/openinfer-deepseek-v2-lite/src/engine.rs @@ -4,7 +4,7 @@ use std::{ }; use anyhow::{Context, Result}; -use pegainfer_engine::engine::{ +use openinfer_engine::engine::{ EngineHandle, EngineLoadOptions, FinishReason, GenerateRequest, TokenEvent, }; use tokio::sync::mpsc; diff --git a/pegainfer-deepseek-v2-lite/src/ep.rs b/openinfer-deepseek-v2-lite/src/ep.rs similarity index 100% rename from pegainfer-deepseek-v2-lite/src/ep.rs rename to openinfer-deepseek-v2-lite/src/ep.rs diff --git a/pegainfer-deepseek-v2-lite/src/host_ops.rs b/openinfer-deepseek-v2-lite/src/host_ops.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/host_ops.rs rename to openinfer-deepseek-v2-lite/src/host_ops.rs index 2bf43a0e..d7fc434b 100644 --- a/pegainfer-deepseek-v2-lite/src/host_ops.rs +++ b/openinfer-deepseek-v2-lite/src/host_ops.rs @@ -1,6 +1,6 @@ use anyhow::{Result, ensure}; use half::bf16; -use pegainfer_core::tensor::{DeviceContext, HiddenStates}; +use openinfer_core::tensor::{DeviceContext, HiddenStates}; use crate::{Config, device::activate}; diff --git a/pegainfer-deepseek-v2-lite/src/lib.rs b/openinfer-deepseek-v2-lite/src/lib.rs similarity index 96% rename from pegainfer-deepseek-v2-lite/src/lib.rs rename to openinfer-deepseek-v2-lite/src/lib.rs index b5eeb8bd..d6d7af1f 100644 --- a/pegainfer-deepseek-v2-lite/src/lib.rs +++ b/openinfer-deepseek-v2-lite/src/lib.rs @@ -12,7 +12,7 @@ mod weights; use std::path::Path; use anyhow::Result; -use pegainfer_engine::engine::{EngineHandle, EngineLoadOptions}; +use openinfer_engine::engine::{EngineHandle, EngineLoadOptions}; pub use attribution::{CallSiteRollup, 
DecodeAttributionProfile, SectionRollup, SectionSample}; pub use config::Config; diff --git a/pegainfer-deepseek-v2-lite/src/model.rs b/openinfer-deepseek-v2-lite/src/model.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/model.rs rename to openinfer-deepseek-v2-lite/src/model.rs index 60b48212..3ad83ccf 100644 --- a/pegainfer-deepseek-v2-lite/src/model.rs +++ b/openinfer-deepseek-v2-lite/src/model.rs @@ -2,7 +2,7 @@ use std::{collections::HashMap, path::Path}; use anyhow::{Result, bail, ensure}; use half::bf16; -use pegainfer_core::{ +use openinfer_core::{ ops, tensor::{DeviceContext, DeviceMatrix, HiddenStates}, weight_loader::{deserialize_shards, load_shard_info, load_tensor_2d, mmap_shards}, diff --git a/pegainfer-deepseek-v2-lite/src/nccl_backend.rs b/openinfer-deepseek-v2-lite/src/nccl_backend.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/nccl_backend.rs rename to openinfer-deepseek-v2-lite/src/nccl_backend.rs index 4d2194de..7e2b184d 100644 --- a/pegainfer-deepseek-v2-lite/src/nccl_backend.rs +++ b/openinfer-deepseek-v2-lite/src/nccl_backend.rs @@ -17,7 +17,7 @@ use cudarc::{ }; use half::bf16; use libloading::Library; -use pegainfer_core::{ +use openinfer_core::{ ops, tensor::{DeviceContext, HiddenStates}, }; diff --git a/pegainfer-deepseek-v2-lite/src/runtime.rs b/openinfer-deepseek-v2-lite/src/runtime.rs similarity index 100% rename from pegainfer-deepseek-v2-lite/src/runtime.rs rename to openinfer-deepseek-v2-lite/src/runtime.rs diff --git a/pegainfer-deepseek-v2-lite/src/runtime/backend.rs b/openinfer-deepseek-v2-lite/src/runtime/backend.rs similarity index 95% rename from pegainfer-deepseek-v2-lite/src/runtime/backend.rs rename to openinfer-deepseek-v2-lite/src/runtime/backend.rs index 45b1da79..870e54e8 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/backend.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/backend.rs @@ -1,11 +1,11 @@ use std::env; use anyhow::{Result, bail, ensure}; -use 
pegainfer_core::tensor::DeviceContext; +use openinfer_core::tensor::DeviceContext; use crate::nccl_backend::NaiveNcclEp2Backend; -const EP_BACKEND_ENV: &str = "PEGAINFER_DSV2_LITE_EP_BACKEND"; +const EP_BACKEND_ENV: &str = "OPENINFER_DSV2_LITE_EP_BACKEND"; const HOST_STAGED_BACKEND: &str = "host-staged"; pub(super) const NCCL_BACKEND: &str = "nccl"; #[derive(Clone, Copy, Debug, Eq, PartialEq)] diff --git a/pegainfer-deepseek-v2-lite/src/runtime/generation.rs b/openinfer-deepseek-v2-lite/src/runtime/generation.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/runtime/generation.rs rename to openinfer-deepseek-v2-lite/src/runtime/generation.rs index c12be921..12f6057e 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/generation.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/generation.rs @@ -1,7 +1,7 @@ use std::{path::Path, time::Instant}; use anyhow::{Context, Result, ensure}; -use pegainfer_engine::engine::{EngineLoadOptions, FinishReason}; +use openinfer_engine::engine::{EngineLoadOptions, FinishReason}; use super::{ DeepSeekV2LiteEp2Generator, diff --git a/pegainfer-deepseek-v2-lite/src/runtime/helpers.rs b/openinfer-deepseek-v2-lite/src/runtime/helpers.rs similarity index 96% rename from pegainfer-deepseek-v2-lite/src/runtime/helpers.rs rename to openinfer-deepseek-v2-lite/src/runtime/helpers.rs index 7e7708bb..23d91714 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/helpers.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/helpers.rs @@ -1,7 +1,7 @@ use std::time::Duration; use anyhow::{Result, bail}; -use pegainfer_engine::engine::FinishReason; +use openinfer_engine::engine::FinishReason; use sha2::{Digest, Sha256}; pub(super) fn token_sha256(tokens: &[u32]) -> String { diff --git a/pegainfer-deepseek-v2-lite/src/runtime/layers.rs b/openinfer-deepseek-v2-lite/src/runtime/layers.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/runtime/layers.rs rename to openinfer-deepseek-v2-lite/src/runtime/layers.rs index 
a47e7e01..d5035cdd 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/layers.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/layers.rs @@ -1,6 +1,6 @@ use anyhow::{Context, Result, ensure}; use half::bf16; -use pegainfer_core::{ +use openinfer_core::{ ops, tensor::{DeviceVec, HiddenStates}, }; diff --git a/pegainfer-deepseek-v2-lite/src/runtime/moe.rs b/openinfer-deepseek-v2-lite/src/runtime/moe.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/runtime/moe.rs rename to openinfer-deepseek-v2-lite/src/runtime/moe.rs index 9409689e..828369a3 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/moe.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/moe.rs @@ -1,6 +1,6 @@ use anyhow::{Result, bail}; use half::bf16; -use pegainfer_core::{ops, tensor::HiddenStates}; +use openinfer_core::{ops, tensor::HiddenStates}; use super::{DeepSeekV2LiteEp2Generator, backend::EpBackendRuntime}; use crate::{ @@ -375,7 +375,7 @@ impl DeepSeekV2LiteEp2Generator { } fn expert_forward_device( - ctx: &pegainfer_core::tensor::DeviceContext, + ctx: &openinfer_core::tensor::DeviceContext, expert: &ExpertMlp, input: &HiddenStates, token_idx: usize, diff --git a/pegainfer-deepseek-v2-lite/src/runtime/readiness.rs b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/runtime/readiness.rs rename to openinfer-deepseek-v2-lite/src/runtime/readiness.rs index 710276e0..e9bb8133 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/readiness.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/readiness.rs @@ -35,7 +35,7 @@ impl DeepSeekV2LiteEp2Generator { Some(report) } EpBackendRuntime::HostStaged => bail!( - "DeepSeek-V2-Lite --nccl-graph-smoke requires PEGAINFER_DSV2_LITE_EP_BACKEND=nccl" + "DeepSeek-V2-Lite --nccl-graph-smoke requires OPENINFER_DSV2_LITE_EP_BACKEND=nccl" ), } } else { diff --git a/pegainfer-deepseek-v2-lite/src/runtime/tests.rs b/openinfer-deepseek-v2-lite/src/runtime/tests.rs similarity index 97% rename from 
pegainfer-deepseek-v2-lite/src/runtime/tests.rs rename to openinfer-deepseek-v2-lite/src/runtime/tests.rs index a2f98c75..358eee55 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/tests.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/tests.rs @@ -2,7 +2,7 @@ use super::{ backend::{EpBackendKind, parse_backend, validate_backend_and_devices}, helpers::{append_generated_token, ensure_same_prompt_batch_rows_match}, }; -use pegainfer_engine::engine::FinishReason; +use openinfer_engine::engine::FinishReason; #[test] fn append_generated_token_handles_eos_stop_vs_ignore() { diff --git a/pegainfer-deepseek-v2-lite/src/runtime/types.rs b/openinfer-deepseek-v2-lite/src/runtime/types.rs similarity index 99% rename from pegainfer-deepseek-v2-lite/src/runtime/types.rs rename to openinfer-deepseek-v2-lite/src/runtime/types.rs index 92292dbd..7333b9a5 100644 --- a/pegainfer-deepseek-v2-lite/src/runtime/types.rs +++ b/openinfer-deepseek-v2-lite/src/runtime/types.rs @@ -1,6 +1,6 @@ use std::path::PathBuf; -use pegainfer_engine::engine::FinishReason; +use openinfer_engine::engine::FinishReason; use serde::Serialize; use super::backend::{EpBackendKind, NCCL_BACKEND}; diff --git a/pegainfer-deepseek-v2-lite/src/weights.rs b/openinfer-deepseek-v2-lite/src/weights.rs similarity index 100% rename from pegainfer-deepseek-v2-lite/src/weights.rs rename to openinfer-deepseek-v2-lite/src/weights.rs diff --git a/pegainfer-deepseek-v2-lite/tests/e2e_ep2.rs b/openinfer-deepseek-v2-lite/tests/e2e_ep2.rs similarity index 96% rename from pegainfer-deepseek-v2-lite/tests/e2e_ep2.rs rename to openinfer-deepseek-v2-lite/tests/e2e_ep2.rs index fc658907..4c9bc551 100644 --- a/pegainfer-deepseek-v2-lite/tests/e2e_ep2.rs +++ b/openinfer-deepseek-v2-lite/tests/e2e_ep2.rs @@ -4,8 +4,8 @@ use std::{ }; use anyhow::{Context, Result, ensure}; -use pegainfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator; -use pegainfer_engine::engine::{EngineLoadOptions, FinishReason}; +use 
openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator; +use openinfer_engine::engine::{EngineLoadOptions, FinishReason}; use sha2::{Digest, Sha256}; use vllm_text::tokenizer::{HuggingFaceTokenizer, Tokenizer}; @@ -24,12 +24,12 @@ const EXPECTED_OUTPUT_SHA256_PAIRS: &[(&str, &str, &str)] = &[ ]; const DSV2_LITE_HIDDEN_SIZE: usize = 2048; const DSV2_LITE_MOE_LAYERS: usize = 26; -const E2E_JSON_OUT_ENV: &str = "PEGAINFER_DSV2_LITE_E2E_JSON_OUT"; +const E2E_JSON_OUT_ENV: &str = "OPENINFER_DSV2_LITE_E2E_JSON_OUT"; #[test] fn test_deepseek_v2_lite_ep2_rust_generation() -> Result<()> { - let model_path_label = env::var("PEGAINFER_TEST_MODEL_PATH") - .context("PEGAINFER_TEST_MODEL_PATH must point to DeepSeek-V2-Lite weights")?; + let model_path_label = env::var("OPENINFER_TEST_MODEL_PATH") + .context("OPENINFER_TEST_MODEL_PATH must point to DeepSeek-V2-Lite weights")?; let model_path = resolve_model_path(&model_path_label); ensure!( model_path.join("config.json").exists(), @@ -258,7 +258,7 @@ fn matched_expected_output_oracle(token_sha256: &str, text_sha256: &str) -> Opti } fn current_backend() -> String { - env::var("PEGAINFER_DSV2_LITE_EP_BACKEND").unwrap_or_else(|_| "host-staged".to_string()) + env::var("OPENINFER_DSV2_LITE_EP_BACKEND").unwrap_or_else(|_| "host-staged".to_string()) } fn resolve_model_path(raw: &str) -> PathBuf { diff --git a/pegainfer-deepseek-v4/Cargo.toml b/openinfer-deepseek-v4/Cargo.toml similarity index 76% rename from pegainfer-deepseek-v4/Cargo.toml rename to openinfer-deepseek-v4/Cargo.toml index 5d087810..c0681ea9 100644 --- a/pegainfer-deepseek-v4/Cargo.toml +++ b/openinfer-deepseek-v4/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-deepseek-v4" +name = "openinfer-deepseek-v4" version = "0.1.0" edition = "2024" autobenches = false @@ -8,18 +8,18 @@ autotests = false [features] default = [] -deepseek-v4 = ["pegainfer-kernels/deepseek-v4"] +deepseek-v4 = ["openinfer-kernels/deepseek-v4"] deepseek-v4-cutedsl-diagnostic = [ "deepseek-v4", 
- "pegainfer-kernels/deepseek-v4-cutedsl-diagnostic", + "openinfer-kernels/deepseek-v4-cutedsl-diagnostic", ] # Activate the pplx-garden NVLink + RDMA all-to-all backend for decode MoE. -# Pulls in `pegainfer-comm` and exposes `runtime::moe_pplx` for the +# Pulls in `openinfer-comm` and exposes `runtime::moe_pplx` for the # rank-worker MoE call site. Compiling with this feature requires CUDA, # RDMA Verbs, and GDRCopy on the host. pplx-ep = [ "deepseek-v4", - "dep:pegainfer-comm", + "dep:openinfer-comm", ] [dependencies] @@ -31,9 +31,9 @@ half = { workspace = true } libc = { workspace = true } log = { workspace = true } memmap2 = { workspace = true } -pegainfer-comm = { workspace = true, optional = true } -pegainfer-core = { workspace = true } -pegainfer-kernels = { workspace = true } +openinfer-comm = { workspace = true, optional = true } +openinfer-core = { workspace = true } +openinfer-kernels = { workspace = true } safetensors = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/pegainfer-deepseek-v4/src/bin/deepseek_kernel_check.rs b/openinfer-deepseek-v4/src/bin/deepseek_kernel_check.rs similarity index 99% rename from pegainfer-deepseek-v4/src/bin/deepseek_kernel_check.rs rename to openinfer-deepseek-v4/src/bin/deepseek_kernel_check.rs index 5c6d05ed..af8baa28 100644 --- a/pegainfer-deepseek-v4/src/bin/deepseek_kernel_check.rs +++ b/openinfer-deepseek-v4/src/bin/deepseek_kernel_check.rs @@ -3,12 +3,12 @@ use std::{env, fs, path::PathBuf}; use anyhow::{Context, Result, bail, ensure}; use cudarc::driver::{DevicePtr, DevicePtrMut}; use half::bf16; -use pegainfer_deepseek_v4::{ +use openinfer_deepseek_v4::{ Bf16HiddenStates, Config, HcHiddenStates, RankGpuContext, RankWeights, TensorParallelConfig, hc_head_bf16_hidden, load_rank_subset_to_gpu, rank_local_logits_from_hidden, rms_norm_bf16_hidden, score_route_bf16_hidden, }; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; use safetensors::{Dtype, SafeTensors, 
tensor::TensorView}; fn main() -> Result<()> { @@ -255,7 +255,7 @@ impl Args { } Ok(Self { model_dir: model_dir - .or_else(|| env::var_os("PEGAINFER_TEST_MODEL_PATH").map(PathBuf::from)) + .or_else(|| env::var_os("OPENINFER_TEST_MODEL_PATH").map(PathBuf::from)) .unwrap_or_else(|| PathBuf::from("models/DeepSeek-V4-Flash")), fixture_dir: fixture_dir .unwrap_or_else(|| PathBuf::from("/tmp/deepseek_kernel_fixtures")), diff --git a/pegainfer-deepseek-v4/src/bin/deepseek_v4_e2e.rs b/openinfer-deepseek-v4/src/bin/deepseek_v4_e2e.rs similarity index 96% rename from pegainfer-deepseek-v4/src/bin/deepseek_v4_e2e.rs rename to openinfer-deepseek-v4/src/bin/deepseek_v4_e2e.rs index 758d86fa..d455b2ce 100644 --- a/pegainfer-deepseek-v4/src/bin/deepseek_v4_e2e.rs +++ b/openinfer-deepseek-v4/src/bin/deepseek_v4_e2e.rs @@ -2,8 +2,8 @@ use std::path::PathBuf; use anyhow::{Result, bail}; use clap::Parser; -use pegainfer_core::logging; -use pegainfer_deepseek_v4::e2e_runner::{ +use openinfer_core::logging; +use openinfer_deepseek_v4::e2e_runner::{ self, DEFAULT_GROUND_TRUTH_PATH, DEFAULT_MAX_NEW_TOKENS, DEFAULT_MODEL_PATH, E2eOptions, }; diff --git a/pegainfer-deepseek-v4/src/config.rs b/openinfer-deepseek-v4/src/config.rs similarity index 100% rename from pegainfer-deepseek-v4/src/config.rs rename to openinfer-deepseek-v4/src/config.rs diff --git a/pegainfer-deepseek-v4/src/direct.rs b/openinfer-deepseek-v4/src/direct.rs similarity index 100% rename from pegainfer-deepseek-v4/src/direct.rs rename to openinfer-deepseek-v4/src/direct.rs diff --git a/pegainfer-deepseek-v4/src/direct/affinity.rs b/openinfer-deepseek-v4/src/direct/affinity.rs similarity index 95% rename from pegainfer-deepseek-v4/src/direct/affinity.rs rename to openinfer-deepseek-v4/src/direct/affinity.rs index 08b605f7..561c2c4f 100644 --- a/pegainfer-deepseek-v4/src/direct/affinity.rs +++ b/openinfer-deepseek-v4/src/direct/affinity.rs @@ -1,4 +1,4 @@ -pub(super) use pegainfer_core::cpu_topology::{ +pub(super) use 
openinfer_core::cpu_topology::{ CpuId, RankThreadPlacement, RankThreadPlacementPlan, pin_current_thread_to_cpu, }; diff --git a/pegainfer-deepseek-v4/src/direct/scheduler.rs b/openinfer-deepseek-v4/src/direct/scheduler.rs similarity index 99% rename from pegainfer-deepseek-v4/src/direct/scheduler.rs rename to openinfer-deepseek-v4/src/direct/scheduler.rs index 204d6278..3e3028fa 100644 --- a/pegainfer-deepseek-v4/src/direct/scheduler.rs +++ b/openinfer-deepseek-v4/src/direct/scheduler.rs @@ -9,7 +9,7 @@ use std::{ use anyhow::{Context, Result, bail, ensure}; use log::{info, warn}; -use pegainfer_core::engine::{ +use openinfer_core::engine::{ EngineHandle, EngineLoadOptions, FinishReason, GenerateRequest, TokenEvent, }; use tokio::sync::mpsc::{self, error::TryRecvError}; @@ -311,7 +311,7 @@ impl DeepSeekV4DirectGenerator { /// participates in host-side bookkeeping that is incompatible with /// graph capture/replay). #[cfg(feature = "pplx-ep")] - pub fn enable_pplx(&self, ep_backends: Vec) -> Result<()> { + pub fn enable_pplx(&self, ep_backends: Vec) -> Result<()> { self.runtime.enable_pplx(ep_backends) } @@ -1095,18 +1095,18 @@ pub fn start_engine(model_path: &Path, options: EngineLoadOptions) -> Result>(), generator.runtime.thread_placement(), - pegainfer_comm::bootstrap::PplxBootstrapParams::default(), + openinfer_comm::bootstrap::PplxBootstrapParams::default(), ) { Ok((backends, resources)) => { // Leak resources for process lifetime — bootstrap is one-shot. 
@@ -1296,7 +1296,7 @@ fn handle_request(generator: &mut DeepSeekV4DirectGenerator, req: GenerateReques } info!( - "pegainfer_http_trace {}", + "openinfer_http_trace {}", serde_json::json!({ "request_id": request_id, "queued_at_unix_s": queued_at_unix_s, @@ -1607,7 +1607,7 @@ fn handle_request_wave(generator: &mut DeepSeekV4DirectGenerator, requests: Vec< fn log_http_trace_for_active_request(active_req: &ActiveDirectRequest) { info!( - "pegainfer_http_trace {}", + "openinfer_http_trace {}", serde_json::json!({ "request_id": active_req.request_id, "queued_at_unix_s": active_req.queued_at_unix_s, @@ -1952,7 +1952,7 @@ mod tests { #[test] #[ignore = "requires 8 GPUs and DeepSeek-V4-Flash weights"] fn batch_decode_logits_match_two_single_decode_rows() -> Result<()> { - let model_path = env::var_os("PEGAINFER_TEST_MODEL_PATH") + let model_path = env::var_os("OPENINFER_TEST_MODEL_PATH") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("models/DeepSeek-V4-Flash")); let mut generator = DeepSeekV4DirectGenerator::from_model_dir(&model_path)?; @@ -1981,7 +1981,7 @@ mod tests { #[test] #[ignore = "requires 8 GPUs and DeepSeek-V4-Flash weights"] fn active_set_batch_tick_releases_each_slot() -> Result<()> { - let model_path = env::var_os("PEGAINFER_TEST_MODEL_PATH") + let model_path = env::var_os("OPENINFER_TEST_MODEL_PATH") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from("models/DeepSeek-V4-Flash")); let mut generator = DeepSeekV4DirectGenerator::from_model_dir(&model_path)?; diff --git a/pegainfer-deepseek-v4/src/direct/worker.rs b/openinfer-deepseek-v4/src/direct/worker.rs similarity index 99% rename from pegainfer-deepseek-v4/src/direct/worker.rs rename to openinfer-deepseek-v4/src/direct/worker.rs index d4ca4cc0..22177cbe 100644 --- a/pegainfer-deepseek-v4/src/direct/worker.rs +++ b/openinfer-deepseek-v4/src/direct/worker.rs @@ -85,7 +85,7 @@ enum RankCommand { /// the pplx path is desired. 
#[cfg(feature = "pplx-ep")] EnablePplx { - ep_backend: pegainfer_comm::EpBackend, + ep_backend: openinfer_comm::EpBackend, resp: channel::Sender>, }, Shutdown, @@ -132,7 +132,7 @@ struct RankDecodeScratch { /// Per-rank EP backend (worker thread + MR-registered buffers). Moved /// in via `RankCommand::EnablePplx`; absent for the NCCL path. #[cfg(feature = "pplx-ep")] - ep_backend: Option, + ep_backend: Option, attention_projection: AttentionProjectionScratch, attention_output: AttentionOutputScratch, attention_index: AttentionIndexScratch, @@ -200,7 +200,7 @@ impl RankDecodeScratch { ctx: &RankGpuContext, config: &Config, world_size: usize, - ep_backend: pegainfer_comm::EpBackend, + ep_backend: openinfer_comm::EpBackend, ) -> Result<()> { if self.moe_pplx.is_none() { self.moe_pplx = Some(crate::runtime::MoePplxScratch::new( @@ -521,7 +521,7 @@ impl RankWorker { /// worker thread. Once acknowledged, subsequent `decode` / `decode_batch` /// calls route the routed-expert step through pplx. #[cfg(feature = "pplx-ep")] - fn enable_pplx(&self, ep_backend: pegainfer_comm::EpBackend) -> Result<()> { + fn enable_pplx(&self, ep_backend: openinfer_comm::EpBackend) -> Result<()> { let (resp_tx, resp_rx) = channel::bounded(1); self.tx .send(RankCommand::EnablePplx { @@ -641,7 +641,7 @@ impl FullDirectRuntime { /// `decode` / `decode_batch` commands route the routed-expert step /// through pplx instead of NCCL AG/RS on every rank. 
#[cfg(feature = "pplx-ep")] - pub(super) fn enable_pplx(&self, ep_backends: Vec) -> Result<()> { + pub(super) fn enable_pplx(&self, ep_backends: Vec) -> Result<()> { ensure!( ep_backends.len() == self.workers.len(), "enable_pplx expected {} EP backends (one per rank), got {}", @@ -668,7 +668,7 @@ impl Drop for FullDirectRuntime { fn bind_rank_thread(ctx: &RankGpuContext) -> Result<()> { ctx.set_current()?; unsafe { - pegainfer_kernels::ffi::cublas_init(); + openinfer_kernels::ffi::cublas_init(); } Ok(()) } @@ -1620,7 +1620,7 @@ fn run_prefill_on_rank_lane( )?; if profile && rank == 0 { info!( - "pegainfer_prefill_profile {}", + "openinfer_prefill_profile {}", serde_json::json!({ "rank": rank, "prompt_tokens": seq_len, diff --git a/pegainfer-deepseek-v4/src/e2e_runner.rs b/openinfer-deepseek-v4/src/e2e_runner.rs similarity index 98% rename from pegainfer-deepseek-v4/src/e2e_runner.rs rename to openinfer-deepseek-v4/src/e2e_runner.rs index 5b28dba5..5ef424ee 100644 --- a/pegainfer-deepseek-v4/src/e2e_runner.rs +++ b/openinfer-deepseek-v4/src/e2e_runner.rs @@ -4,8 +4,8 @@ use std::time::{Duration, Instant}; use anyhow::{Context, Result, bail}; use log::info; -use pegainfer_core::engine::{EngineHandle, EngineLoadOptions, GenerateRequest, TokenEvent}; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::engine::{EngineHandle, EngineLoadOptions, GenerateRequest, TokenEvent}; +use openinfer_core::sampler::SamplingParams; use serde::Deserialize; use tokio::sync::mpsc; use vllm_text::tokenizer::{HuggingFaceTokenizer, Tokenizer}; diff --git a/pegainfer-deepseek-v4/src/lib.rs b/openinfer-deepseek-v4/src/lib.rs similarity index 100% rename from pegainfer-deepseek-v4/src/lib.rs rename to openinfer-deepseek-v4/src/lib.rs diff --git a/pegainfer-deepseek-v4/src/model.rs b/openinfer-deepseek-v4/src/model.rs similarity index 100% rename from pegainfer-deepseek-v4/src/model.rs rename to openinfer-deepseek-v4/src/model.rs diff --git 
a/pegainfer-deepseek-v4/src/runtime/attention.rs b/openinfer-deepseek-v4/src/runtime/attention.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/attention.rs rename to openinfer-deepseek-v4/src/runtime/attention.rs diff --git a/pegainfer-deepseek-v4/src/runtime/attention_base.rs b/openinfer-deepseek-v4/src/runtime/attention_base.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/attention_base.rs rename to openinfer-deepseek-v4/src/runtime/attention_base.rs diff --git a/pegainfer-deepseek-v4/src/runtime/block.rs b/openinfer-deepseek-v4/src/runtime/block.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/block.rs rename to openinfer-deepseek-v4/src/runtime/block.rs diff --git a/pegainfer-deepseek-v4/src/runtime/collectives.rs b/openinfer-deepseek-v4/src/runtime/collectives.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/collectives.rs rename to openinfer-deepseek-v4/src/runtime/collectives.rs diff --git a/pegainfer-deepseek-v4/src/runtime/compressor.rs b/openinfer-deepseek-v4/src/runtime/compressor.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/compressor.rs rename to openinfer-deepseek-v4/src/runtime/compressor.rs diff --git a/pegainfer-deepseek-v4/src/runtime/core.rs b/openinfer-deepseek-v4/src/runtime/core.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/core.rs rename to openinfer-deepseek-v4/src/runtime/core.rs diff --git a/pegainfer-deepseek-v4/src/runtime/indexer.rs b/openinfer-deepseek-v4/src/runtime/indexer.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/indexer.rs rename to openinfer-deepseek-v4/src/runtime/indexer.rs diff --git a/pegainfer-deepseek-v4/src/runtime/mod.rs b/openinfer-deepseek-v4/src/runtime/mod.rs similarity index 96% rename from pegainfer-deepseek-v4/src/runtime/mod.rs rename to openinfer-deepseek-v4/src/runtime/mod.rs index 67b38017..2d0d2d3f 100644 --- 
a/pegainfer-deepseek-v4/src/runtime/mod.rs +++ b/openinfer-deepseek-v4/src/runtime/mod.rs @@ -4,7 +4,7 @@ use anyhow::{Context, Result, ensure}; use cudarc::driver::{CudaSlice, DevicePtr, DevicePtrMut}; use cudarc::nccl::{ReduceOp, safe::Comm}; use half::bf16; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; use crate::{ config::Config, diff --git a/pegainfer-deepseek-v4/src/runtime/moe.rs b/openinfer-deepseek-v4/src/runtime/moe.rs similarity index 100% rename from pegainfer-deepseek-v4/src/runtime/moe.rs rename to openinfer-deepseek-v4/src/runtime/moe.rs diff --git a/pegainfer-deepseek-v4/src/runtime/moe_pplx.rs b/openinfer-deepseek-v4/src/runtime/moe_pplx.rs similarity index 98% rename from pegainfer-deepseek-v4/src/runtime/moe_pplx.rs rename to openinfer-deepseek-v4/src/runtime/moe_pplx.rs index 58ccad4c..1e212630 100644 --- a/pegainfer-deepseek-v4/src/runtime/moe_pplx.rs +++ b/openinfer-deepseek-v4/src/runtime/moe_pplx.rs @@ -4,7 +4,7 @@ //! same call-site contract, same routing / shared-expert / grouped-FP4 //! GEMM helpers, but cross-rank token movement uses the upstream //! four-step pipeline (`dispatch_send → dispatch_recv → combine_send → -//! combine_recv`) wrapped by [`pegainfer_comm::EpBackend`]. +//! combine_recv`) wrapped by [`openinfer_comm::EpBackend`]. //! //! # Stream layout //! 
@@ -32,8 +32,8 @@ use std::ffi::c_void; use std::ptr; use cudarc::driver::CudaStream; -use pegainfer_comm::{EpBackend, ScalarType}; -use pegainfer_kernels::ffi; +use openinfer_comm::{EpBackend, ScalarType}; +use openinfer_kernels::ffi; use super::core::shared_expert_forward_bf16_hidden_scratch; use super::moe::{ diff --git a/pegainfer-deepseek-v4/src/runtime/state.rs b/openinfer-deepseek-v4/src/runtime/state.rs similarity index 99% rename from pegainfer-deepseek-v4/src/runtime/state.rs rename to openinfer-deepseek-v4/src/runtime/state.rs index 465bc5e8..0a6db4dd 100644 --- a/pegainfer-deepseek-v4/src/runtime/state.rs +++ b/openinfer-deepseek-v4/src/runtime/state.rs @@ -136,7 +136,7 @@ pub(crate) struct MoeRunContext<'a> { /// `EpBackend` and pplx scratch. #[cfg(feature = "pplx-ep")] pub(crate) struct MoePplxRunContext<'a> { - pub(crate) ep: &'a mut pegainfer_comm::EpBackend, + pub(crate) ep: &'a mut openinfer_comm::EpBackend, pub(crate) scratch: &'a mut MoePplxScratch, } @@ -598,7 +598,7 @@ impl MoePplxScratch { // Match the upstream pplx-garden formula (p2p_all_to_all.py:105). // The packed expert buffer is addressed via padded indices, so // capacity must include `expert_padding` slop. Keep these - // parameters in sync with `pegainfer_comm::bootstrap::PplxBootstrapParams`. + // parameters in sync with `openinfer_comm::bootstrap::PplxBootstrapParams`. 
const EXPERT_PADDING: usize = 16; // Match PplxBootstrapParams::default().max_num_tokens — EpBackend // is initialized with this capacity, so the scratch buffer must diff --git a/pegainfer-deepseek-v4/src/weights.rs b/openinfer-deepseek-v4/src/weights.rs similarity index 99% rename from pegainfer-deepseek-v4/src/weights.rs rename to openinfer-deepseek-v4/src/weights.rs index 31990c36..76f5fd7d 100644 --- a/pegainfer-deepseek-v4/src/weights.rs +++ b/openinfer-deepseek-v4/src/weights.rs @@ -7,7 +7,7 @@ use std::{ use anyhow::{Context, Result, bail, ensure}; use cudarc::driver::{CudaContext, CudaSlice, CudaStream}; use memmap2::Mmap; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; use safetensors::{Dtype, SafeTensors}; use std::sync::Arc; diff --git a/pegainfer-deepseek-v4/tests/mp8_manifest.rs b/openinfer-deepseek-v4/tests/mp8_manifest.rs similarity index 99% rename from pegainfer-deepseek-v4/tests/mp8_manifest.rs rename to openinfer-deepseek-v4/tests/mp8_manifest.rs index 91dd1d1f..9485658b 100644 --- a/pegainfer-deepseek-v4/tests/mp8_manifest.rs +++ b/openinfer-deepseek-v4/tests/mp8_manifest.rs @@ -1,7 +1,7 @@ use std::path::{Path, PathBuf}; use half::bf16; -use pegainfer_deepseek_v4::{ +use openinfer_deepseek_v4::{ AttentionProjections, Bf16Cache, Bf16HiddenStates, Config, DeepSeekRankModel, DeepSeekRopeCache, GpuRawTensor, QuantLinearRef, RankGpuContext, RankWeights, TensorParallelConfig, TensorRef, apply_rope_attention_projections, @@ -15,7 +15,7 @@ use safetensors::Dtype; const DEFAULT_MODEL_PATH: &str = "models/DeepSeek-V4-Flash"; fn deepseek_model_path() -> PathBuf { - std::env::var_os("PEGAINFER_TEST_MODEL_PATH") + std::env::var_os("OPENINFER_TEST_MODEL_PATH") .map(PathBuf::from) .unwrap_or_else(|| PathBuf::from(DEFAULT_MODEL_PATH)) } @@ -717,7 +717,7 @@ fn rank0_full_gpu_load_builds_executor_owned_model() { let config = Config::from_model_dir(model_path).expect("load config"); let ctx = RankGpuContext::new(0).expect("create CUDA context"); - let 
weights = pegainfer_deepseek_v4::load_rank_to_gpu( + let weights = openinfer_deepseek_v4::load_rank_to_gpu( &ctx, model_path, &config, diff --git a/pegainfer-engine/Cargo.toml b/openinfer-engine/Cargo.toml similarity index 89% rename from pegainfer-engine/Cargo.toml rename to openinfer-engine/Cargo.toml index b9f6b3f1..2f824541 100644 --- a/pegainfer-engine/Cargo.toml +++ b/openinfer-engine/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-engine" +name = "openinfer-engine" version = "0.1.0" edition = "2024" diff --git a/pegainfer-engine/src/engine.rs b/openinfer-engine/src/engine.rs similarity index 100% rename from pegainfer-engine/src/engine.rs rename to openinfer-engine/src/engine.rs diff --git a/pegainfer-engine/src/lib.rs b/openinfer-engine/src/lib.rs similarity index 100% rename from pegainfer-engine/src/lib.rs rename to openinfer-engine/src/lib.rs diff --git a/pegainfer-engine/src/parallel.rs b/openinfer-engine/src/parallel.rs similarity index 100% rename from pegainfer-engine/src/parallel.rs rename to openinfer-engine/src/parallel.rs diff --git a/pegainfer-engine/src/sampler.rs b/openinfer-engine/src/sampler.rs similarity index 100% rename from pegainfer-engine/src/sampler.rs rename to openinfer-engine/src/sampler.rs diff --git a/pegainfer-kernels/Cargo.toml b/openinfer-kernels/Cargo.toml similarity index 92% rename from pegainfer-kernels/Cargo.toml rename to openinfer-kernels/Cargo.toml index f36b8e12..dbba2936 100644 --- a/pegainfer-kernels/Cargo.toml +++ b/openinfer-kernels/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-kernels" +name = "openinfer-kernels" version = "0.1.0" edition = "2024" diff --git a/pegainfer-kernels/KERNELS.md b/openinfer-kernels/KERNELS.md similarity index 93% rename from pegainfer-kernels/KERNELS.md rename to openinfer-kernels/KERNELS.md index c93287d4..1b1a5da8 100644 --- a/pegainfer-kernels/KERNELS.md +++ b/openinfer-kernels/KERNELS.md @@ -1,4 +1,4 @@ -# PegaInfer Kernels Index +# OpenInfer Kernels Index 
**Scope**: this crate owns CUDA/Triton build output, FFI declarations, kernel ABI tensor helpers, paged-KV layout metadata, and Rust operator wrappers. Runtime policy objects such as `KvPool`, `PagePool`, and `SamplingParams` stay outside this crate. @@ -37,9 +37,9 @@ Qwen3-4B uses bf16 dense full attention with `hidden_size=2560`, `num_attention_ ## DeepSeek V4 MP8 Path DeepSeek V4 uses the `deepseek-v4` Cargo feature. The server feature forwards -through `pegainfer-deepseek-v4/deepseek-v4` to `pegainfer-kernels/deepseek-v4`. -Runtime call sites live in `pegainfer-deepseek-v4/src/runtime/` and call these -symbols directly through `pegainfer_kernels::ffi`. +through `openinfer-deepseek-v4/deepseek-v4` to `openinfer-kernels/deepseek-v4`. +Runtime call sites live in `openinfer-deepseek-v4/src/runtime/` and call these +symbols directly through `openinfer_kernels::ffi`. | op_id | Runtime owner | FFI symbols | Source | Backend | Shape / layout notes | | --- | --- | --- | --- | --- | --- | @@ -66,9 +66,9 @@ symbols directly through `pegainfer_kernels::ffi`. ## Kimi-K2 Text TP8/EP8 Path -Kimi-K2 uses the `pegainfer-kimi-k2` model crate. The kernel-crate surface +Kimi-K2 uses the `openinfer-kimi-k2` model crate. The kernel-crate surface is text-only and targets TP8/EP8 with bs > 1 from the start. Shared BF16 ops -reuse existing PegaInfer wrappers. Kimi-specific MoE router and routed INT4 +reuse existing OpenInfer wrappers. Kimi-specific MoE router and routed INT4 expert entry points live under model-specific ops modules. Kimi router uses the existing graph-safe GEMM path plus a device-side top8 selector. Routed experts run on the vLLM Marlin WNA16 backend; the earlier CUTLASS example69 @@ -84,20 +84,20 @@ AG/RS. 
| op_id | Runtime owner | Rust wrapper | FFI symbols | Source | Backend | Shape / layout notes | | --- | --- | --- | --- | --- | --- | --- | -| `kimi_k2.norm.rms_batch` | `pegainfer-kimi-k2` | `ops::rms_norm_batch_into` | `rms_norm_batched_cuda` | `csrc/shared/flashinfer_norm.cu` | FlashInfer CUDA | BF16 hidden states, one row per token; Kimi hidden `7168`, q LoRA `1536`, and kv LoRA `512` all use the parameterized wrapper. This is not a fallback path. | -| `kimi_k2.norm.rms_vec` | `pegainfer-kimi-k2` | `ops::rms_norm_into` | `rms_norm_cuda` | `csrc/shared/flashinfer_norm.cu` | FlashInfer CUDA | BF16 single vector path; exposed in `pegainfer-kimi-k2` headers as `RmsNormBackend::FlashInferVec`. | -| `kimi_k2.norm.fused_add_rms` | `pegainfer-kimi-k2` | `ops::fused_add_rms_norm_batch_into` | `fused_add_rms_norm_batched_cuda` | `csrc/shared/flashinfer_norm.cu` | FlashInfer CUDA | Residual add plus RMSNorm over bs > 1 token batches. | -| `kimi_k2.linear.dense_bf16` | `pegainfer-kimi-k2` | `ops::gemm_into` / `ops::gemm_rows_into` | `gemm_cuda` | `csrc/shared/linear.cu` | cuBLAS | BF16 attention, dense MLP, shared expert, router gate, and lm_head shard projections. | -| `kimi_k2.attn.mla_fused_qkv_a` | `pegainfer-kimi-k2` | `ops::gemm_graphsafe_into_checked` | `gemm_graphsafe_cuda` | `csrc/shared/linear.cu` | graph-safe cuBLAS GEMM | Load-time `DeviceMatrix::vstack(q_a_proj, kv_a_proj_with_mqa)` creates weight `[2112,7168]`; decode writes `qkv_a [B,2112]` without D2H or step-time allocation. | -| `kimi_k2.attn.mla_split_qkv_a` | `pegainfer-kimi-k2` | `ops::kimi_mla_split_qkv_a` | `kimi_mla_split_qkv_a_cuda` | `csrc/kimi_k2/kimi_mla.cu` | CUDA | Splits fused `qkv_a [B,2112]` into `q_a [B,1536]`, compressed KV `[B,512]`, and raw `k_rope [B,64]`. This replaces the old separate `kv_a` split path. 
| -| `kimi_k2.attn.mla_rope_split_decode` | `pegainfer-kimi-k2` | `ops::kimi_mla_rope_split_decode_rt` | `kimi_mla_rope_split_decode_cuda` | `csrc/kimi_k2/kimi_mla.cu` | CUDA | Decode-step split+RoPE prep: `q_proj [B,8,192]` and current `k_rope [B,64]` plus device positions produce `q_nope [B,8,128]`, `q_pe [B,8,64]`, and `append_kpe [B,64]` in Kimi split-half RoPE layout. | -| `kimi_k2.attn.mla_absorb_q` | `pegainfer-kimi-k2` | `ops::kimi_mla_absorb_q_nope_rt` | `kimi_mla_absorb_q_nope_cuda` | `csrc/kimi_k2/kimi_mla.cu` | graph-safe cuBLAS strided-batched GEMM | Uses the `W_UK` slice inside `kv_b_proj [8,256,512]` directly: `q_nope [B,8,128] -> q_abs_nope [B,8,512]`, one cuBLAS batch per local head, no weight repack. | -| `kimi_k2.attn.mla_paged_append` | `pegainfer-kimi-k2` | `ops::kimi_mla_paged_kv_append` | `kimi_mla_paged_kv_append_cuda` | `csrc/kimi_k2/kimi_mla.cu` | FlashInfer MLA page helper | Appends compressed MLA KV step tensors into paged cache: `append_ckv [nnz,512]`, `append_kpe [nnz,64]`, device `batch_indices/positions`, page table CSR, and explicit ckv/kpe strides. Runtime may use separate ckv/kpe buffers or strided views into concat storage. | -| `kimi_k2.attn.mla_decode_paged` | `pegainfer-kimi-k2` | `ops::kimi_flashinfer_batch_decode_mla_rt` | `kimi_flashinfer_batch_decode_mla_cuda` | `csrc/kimi_k2/kimi_mla.cu` | FlashInfer BatchDecode MLA | Consumes absorbed `q_abs_nope [B,8,512]`, `q_pe [B,8,64]`, paged compressed KV, and decode plan arrays; writes latent attention output `[B,8,512]`. `W_UK_T [H,128,512]` absorption and `W_UV [H,512,128]` v-up stay model-side. | -| `kimi_k2.attn.mla_v_up` | `pegainfer-kimi-k2` | `ops::kimi_mla_v_up_rt` | `kimi_mla_v_up_cuda` | `csrc/kimi_k2/kimi_mla.cu` | graph-safe cuBLAS strided-batched GEMM | Uses the `W_UV` slice inside `kv_b_proj [8,256,512]` directly: FlashInfer latent `[B,8,512] -> attn_out [B,8,128]`, one cuBLAS batch per local head, no D2H. 
| -| `kimi_k2.moe.router_noaux_tc` | `pegainfer-kimi-k2` | `ops::kimi_router_noaux_tc_launch` | `kimi_k2_router_noaux_tc_cuda` | `csrc/kimi_k2/kimi_router.cu` | graph-safe GEMM + CUDA selector | BF16 hidden `[padded_tokens,7168]`, gate `[384,7168]`, correction bias `[384]`, output top8 route weights/indices for active tokens; logits projection uses library GEMM, selection stays device-resident. H20 rank0 gate covers real K2.5 layer1 gate/bias. | -| `kimi_k2.moe.marlin_align_block_size` | `pegainfer-kimi-k2` | `ops::kimi_moe_marlin_align_block_size` | `kimi_moe_marlin_align_block_size_cuda` | `csrc/kimi_k2/kimi_experts.cu` | CUDA routing metadata | Device-resident vLLM Marlin/WNA16 alignment: `sorted_token_ids`, `expert_ids`, and `num_tokens_post_padded` for local EP experts. It ignores non-local experts like vLLM `ignore_invalid_experts=True`, pads each local expert to block size `8/16/32/48/64`, uses sentinel `active_tokens * topk`, and performs no D2H or allocation in the decode step. | -| `kimi_k2.moe.int4_marlin_package` | `pegainfer-kimi-k2` | `ops::kimi_marlin_int4_reorder_weight`, `ops::kimi_marlin_int4_reorder_scale`, `ops::kimi_marlin_int4_fuse_w13` | `kimi_marlin_int4_reorder_weight_cuda`, `kimi_marlin_int4_reorder_scale_cuda`, `kimi_marlin_int4_fuse_w13_cuda` | `csrc/kimi_k2/kimi_marlin_int4.cu` | CUDA load-time package helpers | Weight package preserves vLLM `uint4b8` bias=8 nibbles. Single projections repack checkpoint `[expert,out,K/8] int32` into Marlin no-actorder `[expert,K/16,N*2] int32`; scale package converts checkpoint `[expert,out,K/32]` into vLLM Marlin group-major+perm64 `[expert,K/32,out]`. Final runtime package fuses gate/up into W13 `[expert,K/16,4096*2]` and W13 scale `[expert,K/32,4096]`; W2 remains `[expert,2048/16,7168*2]` and `[expert,2048/32,7168]`. These are load/package helpers, not decode hot-path kernels. 
| +| `kimi_k2.norm.rms_batch` | `openinfer-kimi-k2` | `ops::rms_norm_batch_into` | `rms_norm_batched_cuda` | `csrc/shared/flashinfer_norm.cu` | FlashInfer CUDA | BF16 hidden states, one row per token; Kimi hidden `7168`, q LoRA `1536`, and kv LoRA `512` all use the parameterized wrapper. This is not a fallback path. | +| `kimi_k2.norm.rms_vec` | `openinfer-kimi-k2` | `ops::rms_norm_into` | `rms_norm_cuda` | `csrc/shared/flashinfer_norm.cu` | FlashInfer CUDA | BF16 single vector path; exposed in `openinfer-kimi-k2` headers as `RmsNormBackend::FlashInferVec`. | +| `kimi_k2.norm.fused_add_rms` | `openinfer-kimi-k2` | `ops::fused_add_rms_norm_batch_into` | `fused_add_rms_norm_batched_cuda` | `csrc/shared/flashinfer_norm.cu` | FlashInfer CUDA | Residual add plus RMSNorm over bs > 1 token batches. | +| `kimi_k2.linear.dense_bf16` | `openinfer-kimi-k2` | `ops::gemm_into` / `ops::gemm_rows_into` | `gemm_cuda` | `csrc/shared/linear.cu` | cuBLAS | BF16 attention, dense MLP, shared expert, router gate, and lm_head shard projections. | +| `kimi_k2.attn.mla_fused_qkv_a` | `openinfer-kimi-k2` | `ops::gemm_graphsafe_into_checked` | `gemm_graphsafe_cuda` | `csrc/shared/linear.cu` | graph-safe cuBLAS GEMM | Load-time `DeviceMatrix::vstack(q_a_proj, kv_a_proj_with_mqa)` creates weight `[2112,7168]`; decode writes `qkv_a [B,2112]` without D2H or step-time allocation. | +| `kimi_k2.attn.mla_split_qkv_a` | `openinfer-kimi-k2` | `ops::kimi_mla_split_qkv_a` | `kimi_mla_split_qkv_a_cuda` | `csrc/kimi_k2/kimi_mla.cu` | CUDA | Splits fused `qkv_a [B,2112]` into `q_a [B,1536]`, compressed KV `[B,512]`, and raw `k_rope [B,64]`. This replaces the old separate `kv_a` split path. 
| +| `kimi_k2.attn.mla_rope_split_decode` | `openinfer-kimi-k2` | `ops::kimi_mla_rope_split_decode_rt` | `kimi_mla_rope_split_decode_cuda` | `csrc/kimi_k2/kimi_mla.cu` | CUDA | Decode-step split+RoPE prep: `q_proj [B,8,192]` and current `k_rope [B,64]` plus device positions produce `q_nope [B,8,128]`, `q_pe [B,8,64]`, and `append_kpe [B,64]` in Kimi split-half RoPE layout. | +| `kimi_k2.attn.mla_absorb_q` | `openinfer-kimi-k2` | `ops::kimi_mla_absorb_q_nope_rt` | `kimi_mla_absorb_q_nope_cuda` | `csrc/kimi_k2/kimi_mla.cu` | graph-safe cuBLAS strided-batched GEMM | Uses the `W_UK` slice inside `kv_b_proj [8,256,512]` directly: `q_nope [B,8,128] -> q_abs_nope [B,8,512]`, one cuBLAS batch per local head, no weight repack. | +| `kimi_k2.attn.mla_paged_append` | `openinfer-kimi-k2` | `ops::kimi_mla_paged_kv_append` | `kimi_mla_paged_kv_append_cuda` | `csrc/kimi_k2/kimi_mla.cu` | FlashInfer MLA page helper | Appends compressed MLA KV step tensors into paged cache: `append_ckv [nnz,512]`, `append_kpe [nnz,64]`, device `batch_indices/positions`, page table CSR, and explicit ckv/kpe strides. Runtime may use separate ckv/kpe buffers or strided views into concat storage. | +| `kimi_k2.attn.mla_decode_paged` | `openinfer-kimi-k2` | `ops::kimi_flashinfer_batch_decode_mla_rt` | `kimi_flashinfer_batch_decode_mla_cuda` | `csrc/kimi_k2/kimi_mla.cu` | FlashInfer BatchDecode MLA | Consumes absorbed `q_abs_nope [B,8,512]`, `q_pe [B,8,64]`, paged compressed KV, and decode plan arrays; writes latent attention output `[B,8,512]`. `W_UK_T [H,128,512]` absorption and `W_UV [H,512,128]` v-up stay model-side. | +| `kimi_k2.attn.mla_v_up` | `openinfer-kimi-k2` | `ops::kimi_mla_v_up_rt` | `kimi_mla_v_up_cuda` | `csrc/kimi_k2/kimi_mla.cu` | graph-safe cuBLAS strided-batched GEMM | Uses the `W_UV` slice inside `kv_b_proj [8,256,512]` directly: FlashInfer latent `[B,8,512] -> attn_out [B,8,128]`, one cuBLAS batch per local head, no D2H. 
| +| `kimi_k2.moe.router_noaux_tc` | `openinfer-kimi-k2` | `ops::kimi_router_noaux_tc_launch` | `kimi_k2_router_noaux_tc_cuda` | `csrc/kimi_k2/kimi_router.cu` | graph-safe GEMM + CUDA selector | BF16 hidden `[padded_tokens,7168]`, gate `[384,7168]`, correction bias `[384]`, output top8 route weights/indices for active tokens; logits projection uses library GEMM, selection stays device-resident. H20 rank0 gate covers real K2.5 layer1 gate/bias. | +| `kimi_k2.moe.marlin_align_block_size` | `openinfer-kimi-k2` | `ops::kimi_moe_marlin_align_block_size` | `kimi_moe_marlin_align_block_size_cuda` | `csrc/kimi_k2/kimi_experts.cu` | CUDA routing metadata | Device-resident vLLM Marlin/WNA16 alignment: `sorted_token_ids`, `expert_ids`, and `num_tokens_post_padded` for local EP experts. It ignores non-local experts like vLLM `ignore_invalid_experts=True`, pads each local expert to block size `8/16/32/48/64`, uses sentinel `active_tokens * topk`, and performs no D2H or allocation in the decode step. | +| `kimi_k2.moe.int4_marlin_package` | `openinfer-kimi-k2` | `ops::kimi_marlin_int4_reorder_weight`, `ops::kimi_marlin_int4_reorder_scale`, `ops::kimi_marlin_int4_fuse_w13` | `kimi_marlin_int4_reorder_weight_cuda`, `kimi_marlin_int4_reorder_scale_cuda`, `kimi_marlin_int4_fuse_w13_cuda` | `csrc/kimi_k2/kimi_marlin_int4.cu` | CUDA load-time package helpers | Weight package preserves vLLM `uint4b8` bias=8 nibbles. Single projections repack checkpoint `[expert,out,K/8] int32` into Marlin no-actorder `[expert,K/16,N*2] int32`; scale package converts checkpoint `[expert,out,K/32]` into vLLM Marlin group-major+perm64 `[expert,K/32,out]`. Final runtime package fuses gate/up into W13 `[expert,K/16,4096*2]` and W13 scale `[expert,K/32,4096]`; W2 remains `[expert,2048/16,7168*2]` and `[expert,2048/32,7168]`. These are load/package helpers, not decode hot-path kernels. 
| ## Non-Qwen3 Compatibility diff --git a/pegainfer-kernels/build.rs b/openinfer-kernels/build.rs similarity index 95% rename from pegainfer-kernels/build.rs rename to openinfer-kernels/build.rs index 84895917..91fdabf9 100644 --- a/pegainfer-kernels/build.rs +++ b/openinfer-kernels/build.rs @@ -58,7 +58,7 @@ fn crate_root() -> PathBuf { } fn build_timing_enabled() -> bool { - std::env::var("PEGAINFER_BUILD_TIMING").is_ok_and(|value| { + std::env::var("OPENINFER_BUILD_TIMING").is_ok_and(|value| { let value = value.trim().to_ascii_lowercase(); !(value.is_empty() || value == "0" || value == "false" || value == "off") }) @@ -91,7 +91,7 @@ fn parse_job_count_env(name: &str) -> Option { } fn nvcc_job_count() -> usize { - if let Some(jobs) = parse_job_count_env("PEGAINFER_NVCC_JOBS") { + if let Some(jobs) = parse_job_count_env("OPENINFER_NVCC_JOBS") { return jobs; } @@ -215,7 +215,7 @@ fn sm_targets_from_nvidia_smi() -> Option> { } fn detect_sm_targets() -> Vec { - if let Ok(env) = std::env::var("PEGAINFER_CUDA_SM").or_else(|_| std::env::var("CUDA_SM")) { + if let Ok(env) = std::env::var("OPENINFER_CUDA_SM").or_else(|_| std::env::var("CUDA_SM")) { let mut sms = Vec::new(); for token in env.split(',') { if let Some(sm) = parse_sm_token(token) { @@ -241,7 +241,7 @@ fn detect_sm_targets() -> Vec { } print!( - "cargo:warning=Failed to detect GPU SMs via nvidia-smi. Set PEGAINFER_CUDA_SM/CUDA_SM environment variable to override." + "cargo:warning=Failed to detect GPU SMs via nvidia-smi. Set OPENINFER_CUDA_SM/CUDA_SM environment variable to override." ); panic!("GPU detection failed"); } @@ -312,16 +312,16 @@ fn is_deepep_source(csrc_dir: &Path, path: &Path) -> bool { /// NCCL >= 2.30.4 root (include/nccl.h + lib/libnccl.so.2) for the DeepEP /// shim's device API (ncclDevComm / windows / GIN). 
cudarc dlopens whatever /// libnccl.so.2 it finds at runtime, so build and runtime must point at the -/// same install: set PEGAINFER_NCCL_ROOT and put `$PEGAINFER_NCCL_ROOT/lib` +/// same install: set OPENINFER_NCCL_ROOT and put `$OPENINFER_NCCL_ROOT/lib` /// on LD_LIBRARY_PATH. The nvidia-nccl-cu13 wheel layout works directly: /// pip download 'nvidia-nccl-cu13>=2.30.4' --no-deps -d /tmp/nccl \ /// && unzip /tmp/nccl/*.whl 'nvidia/nccl/*' -d /tmp/nccl \ -/// && export PEGAINFER_NCCL_ROOT=/tmp/nccl/nvidia/nccl +/// && export OPENINFER_NCCL_ROOT=/tmp/nccl/nvidia/nccl fn deepep_nccl_root() -> PathBuf { - let Ok(root) = std::env::var("PEGAINFER_NCCL_ROOT").map(PathBuf::from) else { + let Ok(root) = std::env::var("OPENINFER_NCCL_ROOT").map(PathBuf::from) else { panic!( "The kimi-k2 feature builds the DeepEP shim, which needs NCCL >= 2.30.4. \ - Set PEGAINFER_NCCL_ROOT to an install with include/nccl.h and lib/libnccl.so.2 \ + Set OPENINFER_NCCL_ROOT to an install with include/nccl.h and lib/libnccl.so.2 \ (e.g. the unpacked nvidia-nccl-cu13 wheel)." 
) }; @@ -329,7 +329,7 @@ fn deepep_nccl_root() -> PathBuf { let header = root.join("include/nccl.h"); let contents = fs::read_to_string(&header).unwrap_or_else(|err| { panic!( - "PEGAINFER_NCCL_ROOT: cannot read {}: {err}", + "OPENINFER_NCCL_ROOT: cannot read {}: {err}", header.display() ) }); @@ -345,14 +345,14 @@ fn deepep_nccl_root() -> PathBuf { + version_component("NCCL_PATCH"); assert!( version >= 23004, - "PEGAINFER_NCCL_ROOT points at NCCL {version} (< 2.30.4); the DeepEP shim needs the \ + "OPENINFER_NCCL_ROOT points at NCCL {version} (< 2.30.4); the DeepEP shim needs the \ NCCL device API" ); let lib = root.join("lib/libnccl.so.2"); assert!( lib.is_file(), - "PEGAINFER_NCCL_ROOT: {} not found", + "OPENINFER_NCCL_ROOT: {} not found", lib.display() ); root @@ -411,16 +411,16 @@ fn probe_triton_python(candidate: &str) -> Result { } fn find_triton_python() -> Result { - if let Ok(candidate) = std::env::var("PEGAINFER_TRITON_PYTHON") { + if let Ok(candidate) = std::env::var("OPENINFER_TRITON_PYTHON") { let candidate = candidate.trim(); if candidate.is_empty() { return Err( - "PEGAINFER_TRITON_PYTHON is set but empty. See pegainfer-kernels/tools/triton/README.md.".to_string(), + "OPENINFER_TRITON_PYTHON is set but empty. See openinfer-kernels/tools/triton/README.md.".to_string(), ); } return probe_triton_python(candidate).map_err(|message| { format!( - "PEGAINFER_TRITON_PYTHON=`{candidate}` could not import Triton. {message}. See pegainfer-kernels/tools/triton/README.md." + "OPENINFER_TRITON_PYTHON=`{candidate}` could not import Triton. {message}. See openinfer-kernels/tools/triton/README.md." ) }); } @@ -441,7 +441,7 @@ fn find_triton_python() -> Result { } Err(format!( - "Could not find a Python interpreter with Triton installed. Set PEGAINFER_TRITON_PYTHON, bootstrap .venv, or ensure `python3 -c 'import triton'` works. Probe results: {}.", + "Could not find a Python interpreter with Triton installed. 
Set OPENINFER_TRITON_PYTHON, bootstrap .venv, or ensure `python3 -c 'import triton'` works. Probe results: {}.", diagnostics.join(" | ") )) } @@ -463,13 +463,13 @@ fn probe_tilelang_python(candidate: &str) -> Result { } fn find_tilelang_python() -> Result { - if let Ok(candidate) = std::env::var("PEGAINFER_TILELANG_PYTHON") { + if let Ok(candidate) = std::env::var("OPENINFER_TILELANG_PYTHON") { let candidate = candidate.trim(); if candidate.is_empty() { - return Err("PEGAINFER_TILELANG_PYTHON is set but empty.".to_string()); + return Err("OPENINFER_TILELANG_PYTHON is set but empty.".to_string()); } return probe_tilelang_python(candidate).map_err(|message| { - format!("PEGAINFER_TILELANG_PYTHON=`{candidate}` could not import TileLang: {message}") + format!("OPENINFER_TILELANG_PYTHON=`{candidate}` could not import TileLang: {message}") }); } @@ -515,13 +515,13 @@ fn probe_cutedsl_python(candidate: &str) -> Result { } fn find_cutedsl_python() -> Result { - if let Ok(candidate) = std::env::var("PEGAINFER_CUTEDSL_PYTHON") { + if let Ok(candidate) = std::env::var("OPENINFER_CUTEDSL_PYTHON") { let candidate = candidate.trim(); if candidate.is_empty() { - return Err("PEGAINFER_CUTEDSL_PYTHON is set but empty.".to_string()); + return Err("OPENINFER_CUTEDSL_PYTHON is set but empty.".to_string()); } return probe_cutedsl_python(candidate).map_err(|message| { - format!("PEGAINFER_CUTEDSL_PYTHON=`{candidate}` could not import CuTe DSL: {message}") + format!("OPENINFER_CUTEDSL_PYTHON=`{candidate}` could not import CuTe DSL: {message}") }); } @@ -545,7 +545,7 @@ fn find_cutedsl_python() -> Result { } Err(format!( - "Could not find a Python interpreter with CuTe DSL installed. Set PEGAINFER_CUTEDSL_PYTHON. Probe results: {}.", + "Could not find a Python interpreter with CuTe DSL installed. Set OPENINFER_CUTEDSL_PYTHON. 
Probe results: {}.", diagnostics.join(" | ") )) } @@ -604,7 +604,7 @@ fn generate_deepseek_tilelang_artifacts(out_dir: &Path) -> TileLangArtifacts { cu_path.display() ); println!("cargo:rerun-if-changed={}", generator_path.display()); - println!("cargo:rerun-if-env-changed=PEGAINFER_TILELANG_PYTHON"); + println!("cargo:rerun-if-env-changed=OPENINFER_TILELANG_PYTHON"); TileLangArtifacts { cu_files: vec![cu_path], @@ -635,7 +635,7 @@ fn generate_deepseek_cutedsl_artifacts(out_dir: &Path) -> CuTeDslArtifacts { .arg(&artifact_dir) .arg("--repo-root") .arg(&repo_root); - if let Ok(cutlass_root) = std::env::var("PEGAINFER_CUTEDSL_CUTLASS_ROOT") { + if let Ok(cutlass_root) = std::env::var("OPENINFER_CUTEDSL_CUTLASS_ROOT") { command.arg("--cutlass-root").arg(cutlass_root); } @@ -686,8 +686,8 @@ fn generate_deepseek_cutedsl_artifacts(out_dir: &Path) -> CuTeDslArtifacts { artifact_dir.display() ); println!("cargo:rerun-if-changed={}", generator_path.display()); - println!("cargo:rerun-if-env-changed=PEGAINFER_CUTEDSL_PYTHON"); - println!("cargo:rerun-if-env-changed=PEGAINFER_CUTEDSL_CUTLASS_ROOT"); + println!("cargo:rerun-if-env-changed=OPENINFER_CUTEDSL_PYTHON"); + println!("cargo:rerun-if-env-changed=OPENINFER_CUTEDSL_CUTLASS_ROOT"); CuTeDslArtifacts { obj_files, @@ -709,13 +709,13 @@ fn flashinfer_includes() -> FlashInferIncludes { let crate_root = crate_root(); let root = workspace_root(); - if let Ok(path) = std::env::var("PEGAINFER_FLASHINFER_INCLUDE") { + if let Ok(path) = std::env::var("OPENINFER_FLASHINFER_INCLUDE") { let path = PathBuf::from(path); if path.join("flashinfer/sampling.cuh").exists() { return flashinfer_includes_from_include(path); } println!( - "cargo:warning=PEGAINFER_FLASHINFER_INCLUDE={} does not contain flashinfer/sampling.cuh; falling back.", + "cargo:warning=OPENINFER_FLASHINFER_INCLUDE={} does not contain flashinfer/sampling.cuh; falling back.", path.display() ); } @@ -802,7 +802,7 @@ fn triton_target(sm_targets: &[String]) -> String { if 
sm_targets.len() > 1 { println!( - "cargo:warning=Triton AOT currently emits one cubin per kernel spec; using highest detected target sm_{max_sm}. Set PEGAINFER_CUDA_SM to pin one target explicitly." + "cargo:warning=Triton AOT currently emits one cubin per kernel spec; using highest detected target sm_{max_sm}. Set OPENINFER_CUDA_SM to pin one target explicitly." ); } @@ -1127,7 +1127,7 @@ fn compile_triton_aot_kernels(cuda_path: &str, out_dir: &Path, sm_targets: &[Str "cargo:rerun-if-changed={}", root.join("tools/triton/gen_triton_aot.py").display() ); - println!("cargo:rerun-if-env-changed=PEGAINFER_TRITON_PYTHON"); + println!("cargo:rerun-if-env-changed=OPENINFER_TRITON_PYTHON"); } fn main() { @@ -1349,12 +1349,12 @@ fn main() { if !deepseek_enabled { println!( - "cargo:warning=DeepSeek V4 CUDA/TileLang kernels disabled; enable the pegainfer-kernels `deepseek-v4` feature to build them" + "cargo:warning=DeepSeek V4 CUDA/TileLang kernels disabled; enable the openinfer-kernels `deepseek-v4` feature to build them" ); } if !kimi_k2_enabled { println!( - "cargo:warning=Kimi-K2 CUDA kernels disabled; enable the pegainfer-kernels `kimi-k2` feature to build them" + "cargo:warning=Kimi-K2 CUDA kernels disabled; enable the openinfer-kernels `kimi-k2` feature to build them" ); } @@ -1526,12 +1526,12 @@ fn main() { println!("cargo:rerun-if-changed=build.rs"); println!("cargo:rerun-if-env-changed=CUDA_HOME"); println!("cargo:rerun-if-env-changed=CUDA_PATH"); - println!("cargo:rerun-if-env-changed=PEGAINFER_CUDA_SM"); + println!("cargo:rerun-if-env-changed=OPENINFER_CUDA_SM"); println!("cargo:rerun-if-env-changed=CUDA_SM"); - println!("cargo:rerun-if-env-changed=PEGAINFER_FLASHINFER_INCLUDE"); - println!("cargo:rerun-if-env-changed=PEGAINFER_BUILD_TIMING"); - println!("cargo:rerun-if-env-changed=PEGAINFER_NVCC_JOBS"); - println!("cargo:rerun-if-env-changed=PEGAINFER_NCCL_ROOT"); + println!("cargo:rerun-if-env-changed=OPENINFER_FLASHINFER_INCLUDE"); + 
println!("cargo:rerun-if-env-changed=OPENINFER_BUILD_TIMING"); + println!("cargo:rerun-if-env-changed=OPENINFER_NVCC_JOBS"); + println!("cargo:rerun-if-env-changed=OPENINFER_NCCL_ROOT"); println!( "cargo:rerun-if-changed={}", root.join("third_party/DeepEP/deep_ep/include").display() diff --git a/pegainfer-kernels/csrc/common.cuh b/openinfer-kernels/csrc/common.cuh similarity index 100% rename from pegainfer-kernels/csrc/common.cuh rename to openinfer-kernels/csrc/common.cuh diff --git a/pegainfer-kernels/csrc/deepep/deepep.h b/openinfer-kernels/csrc/deepep/deepep.h similarity index 100% rename from pegainfer-kernels/csrc/deepep/deepep.h rename to openinfer-kernels/csrc/deepep/deepep.h diff --git a/pegainfer-kernels/csrc/deepep/deepep_config.cuh b/openinfer-kernels/csrc/deepep/deepep_config.cuh similarity index 100% rename from pegainfer-kernels/csrc/deepep/deepep_config.cuh rename to openinfer-kernels/csrc/deepep/deepep_config.cuh diff --git a/pegainfer-kernels/csrc/deepep/deepep_shim.cu b/openinfer-kernels/csrc/deepep/deepep_shim.cu similarity index 100% rename from pegainfer-kernels/csrc/deepep/deepep_shim.cu rename to openinfer-kernels/csrc/deepep/deepep_shim.cu diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_attention.cu b/openinfer-kernels/csrc/deepseek_v4/deepseek_attention.cu similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_attention.cu rename to openinfer-kernels/csrc/deepseek_v4/deepseek_attention.cu diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_common.cuh b/openinfer-kernels/csrc/deepseek_v4/deepseek_common.cuh similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_common.cuh rename to openinfer-kernels/csrc/deepseek_v4/deepseek_common.cuh diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_compressor.cu b/openinfer-kernels/csrc/deepseek_v4/deepseek_compressor.cu similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_compressor.cu rename to 
openinfer-kernels/csrc/deepseek_v4/deepseek_compressor.cu diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_hc.cu b/openinfer-kernels/csrc/deepseek_v4/deepseek_hc.cu similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_hc.cu rename to openinfer-kernels/csrc/deepseek_v4/deepseek_hc.cu diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_indexer.cu b/openinfer-kernels/csrc/deepseek_v4/deepseek_indexer.cu similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_indexer.cu rename to openinfer-kernels/csrc/deepseek_v4/deepseek_indexer.cu diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_moe.cu b/openinfer-kernels/csrc/deepseek_v4/deepseek_moe.cu similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_moe.cu rename to openinfer-kernels/csrc/deepseek_v4/deepseek_moe.cu diff --git a/pegainfer-kernels/csrc/deepseek_v4/deepseek_quant.cu b/openinfer-kernels/csrc/deepseek_v4/deepseek_quant.cu similarity index 100% rename from pegainfer-kernels/csrc/deepseek_v4/deepseek_quant.cu rename to openinfer-kernels/csrc/deepseek_v4/deepseek_quant.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_experts.cu b/openinfer-kernels/csrc/kimi_k2/kimi_experts.cu similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/kimi_experts.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_experts.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_marlin_int4.cu b/openinfer-kernels/csrc/kimi_k2/kimi_marlin_int4.cu similarity index 98% rename from pegainfer-kernels/csrc/kimi_k2/kimi_marlin_int4.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_marlin_int4.cu index c9cb1e7a..5d063114 100644 --- a/pegainfer-kernels/csrc/kimi_k2/kimi_marlin_int4.cu +++ b/openinfer-kernels/csrc/kimi_k2/kimi_marlin_int4.cu @@ -3,7 +3,7 @@ #include #include -namespace pegainfer_kimi_marlin_int4 { +namespace openinfer_kimi_marlin_int4 { constexpr int kKimiLocalExperts = 48; constexpr int kKimiInt4GroupSize = 32; @@ -162,9 +162,9 @@ 
__global__ void kimi_marlin_fuse_w13_scale_kernel( } } -} // namespace pegainfer_kimi_marlin_int4 +} // namespace openinfer_kimi_marlin_int4 -using namespace pegainfer_kimi_marlin_int4; +using namespace openinfer_kimi_marlin_int4; extern "C" { diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu b/openinfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu similarity index 96% rename from pegainfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu index 7521b606..a329eccc 100644 --- a/pegainfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu +++ b/openinfer-kernels/csrc/kimi_k2/kimi_marlin_wna16.cu @@ -4,11 +4,11 @@ #include #include -#define MARLIN_NAMESPACE_NAME pegainfer_kimi_marlin_moe_wna16 +#define MARLIN_NAMESPACE_NAME openinfer_kimi_marlin_moe_wna16 #include "vllm_marlin/moe/marlin_moe_wna16/kernel.h" #include "vllm_marlin/moe/marlin_moe_wna16/marlin_template.h" -namespace pegainfer_kimi_marlin_moe_wna16 { +namespace openinfer_kimi_marlin_moe_wna16 { __global__ void MarlinDefault(MARLIN_KERNEL_PARAMS) {} @@ -407,7 +407,7 @@ __global__ void sum_topk_rows_kernel( out[idx] = acc; } -} // namespace pegainfer_kimi_marlin_moe_wna16 +} // namespace openinfer_kimi_marlin_moe_wna16 extern "C" { @@ -434,7 +434,7 @@ CUresult kimi_marlin_wna16_gemm_cuda( int group_size, int sm_count, cudaStream_t stream) { - return pegainfer_kimi_marlin_moe_wna16::launch_marlin_gemm( + return openinfer_kimi_marlin_moe_wna16::launch_marlin_gemm( input, output, c_tmp, b_qweight, b_scales, workspace, sorted_token_ids, expert_ids, num_tokens_post_padded, topk_weights, workspace_len, sorted_token_ids_len, moe_block_size, top_k, mul_topk_weights, size_m, @@ -454,9 +454,9 @@ CUresult kimi_marlin_w13_swiglu_cuda( constexpr int threads = 256; int total = rows * intermediate_dim; int blocks = (total + threads - 1) / threads; - pegainfer_kimi_marlin_moe_wna16::swiglu_w13_kernel<<>>( + openinfer_kimi_marlin_moe_wna16::swiglu_w13_kernel<<>>( w13, 
out, rows, intermediate_dim); - return pegainfer_kimi_marlin_moe_wna16::last_error_to_cu(cudaPeekAtLastError()); + return openinfer_kimi_marlin_moe_wna16::last_error_to_cu(cudaPeekAtLastError()); } CUresult kimi_marlin_w13_swiglu_expanded_cuda( @@ -487,9 +487,9 @@ CUresult kimi_marlin_w13_swiglu_expanded_cuda( int max_blocks = (max_rows * intermediate_dim + threads - 1) / threads; int blocks = sm_count * 8; if (blocks > max_blocks) blocks = max_blocks; - pegainfer_kimi_marlin_moe_wna16::swiglu_w13_expanded_kernel<<>>( + openinfer_kimi_marlin_moe_wna16::swiglu_w13_expanded_kernel<<>>( w13, out, num_tokens_post_padded, max_rows, intermediate_dim); - return pegainfer_kimi_marlin_moe_wna16::last_error_to_cu(cudaPeekAtLastError()); + return openinfer_kimi_marlin_moe_wna16::last_error_to_cu(cudaPeekAtLastError()); } CUresult kimi_marlin_sum_topk_rows_f32_cuda( @@ -507,9 +507,9 @@ CUresult kimi_marlin_sum_topk_rows_f32_cuda( constexpr int threads = 256; int total = active_tokens * hidden_dim; int blocks = (total + threads - 1) / threads; - pegainfer_kimi_marlin_moe_wna16::sum_topk_rows_kernel<<>>( + openinfer_kimi_marlin_moe_wna16::sum_topk_rows_kernel<<>>( route_output, out, active_tokens, topk, hidden_dim); - return pegainfer_kimi_marlin_moe_wna16::last_error_to_cu(cudaPeekAtLastError()); + return openinfer_kimi_marlin_moe_wna16::last_error_to_cu(cudaPeekAtLastError()); } } // extern "C" diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_mla.cu b/openinfer-kernels/csrc/kimi_k2/kimi_mla.cu similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/kimi_mla.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_mla.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_mla_cublaslt.cu b/openinfer-kernels/csrc/kimi_k2/kimi_mla_cublaslt.cu similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/kimi_mla_cublaslt.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_mla_cublaslt.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_o_proj.cu 
b/openinfer-kernels/csrc/kimi_k2/kimi_o_proj.cu similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/kimi_o_proj.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_o_proj.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_router.cu b/openinfer-kernels/csrc/kimi_k2/kimi_router.cu similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/kimi_router.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_router.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/kimi_shared_gate_up.cu b/openinfer-kernels/csrc/kimi_k2/kimi_shared_gate_up.cu similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/kimi_shared_gate_up.cu rename to openinfer-kernels/csrc/kimi_k2/kimi_shared_gate_up.cu diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/core/scalar_type.hpp b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/core/scalar_type.hpp similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/core/scalar_type.hpp rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/core/scalar_type.hpp diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/kernel.h b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/kernel.h similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/kernel.h rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/kernel.h diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/marlin_template.h b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/marlin_template.h similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/marlin_template.h rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/moe/marlin_moe_wna16/marlin_template.h diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/dequant.h b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/dequant.h similarity index 100% rename from 
pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/dequant.h rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/dequant.h diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin.cuh b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin.cuh similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin.cuh rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin.cuh diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin_dtypes.cuh b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin_dtypes.cuh similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin_dtypes.cuh rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/gptq_marlin/marlin_dtypes.cuh diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/dequant.h b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/dequant.h similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/dequant.h rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/dequant.h diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin.cuh b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin.cuh similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin.cuh rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin.cuh diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_dtypes.cuh b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_dtypes.cuh similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_dtypes.cuh rename to 
openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_dtypes.cuh diff --git a/pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_mma.h b/openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_mma.h similarity index 100% rename from pegainfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_mma.h rename to openinfer-kernels/csrc/kimi_k2/vllm_marlin/quantization/marlin/marlin_mma.h diff --git a/pegainfer-kernels/csrc/lora_fused.cu b/openinfer-kernels/csrc/lora_fused.cu similarity index 100% rename from pegainfer-kernels/csrc/lora_fused.cu rename to openinfer-kernels/csrc/lora_fused.cu diff --git a/pegainfer-kernels/csrc/qwen35/conv1d.cu b/openinfer-kernels/csrc/qwen35/conv1d.cu similarity index 100% rename from pegainfer-kernels/csrc/qwen35/conv1d.cu rename to openinfer-kernels/csrc/qwen35/conv1d.cu diff --git a/pegainfer-kernels/csrc/qwen35/gated_delta_rule.cu b/openinfer-kernels/csrc/qwen35/gated_delta_rule.cu similarity index 100% rename from pegainfer-kernels/csrc/qwen35/gated_delta_rule.cu rename to openinfer-kernels/csrc/qwen35/gated_delta_rule.cu diff --git a/pegainfer-kernels/csrc/qwen35/prefill_attention_hd256.cu b/openinfer-kernels/csrc/qwen35/prefill_attention_hd256.cu similarity index 100% rename from pegainfer-kernels/csrc/qwen35/prefill_attention_hd256.cu rename to openinfer-kernels/csrc/qwen35/prefill_attention_hd256.cu diff --git a/pegainfer-kernels/csrc/shared/argmax.cu b/openinfer-kernels/csrc/shared/argmax.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/argmax.cu rename to openinfer-kernels/csrc/shared/argmax.cu diff --git a/pegainfer-kernels/csrc/shared/elementwise.cu b/openinfer-kernels/csrc/shared/elementwise.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/elementwise.cu rename to openinfer-kernels/csrc/shared/elementwise.cu diff --git a/pegainfer-kernels/csrc/shared/flashinfer_norm.cu b/openinfer-kernels/csrc/shared/flashinfer_norm.cu similarity 
index 99% rename from pegainfer-kernels/csrc/shared/flashinfer_norm.cu rename to openinfer-kernels/csrc/shared/flashinfer_norm.cu index 5a8c2bf3..b9e4f6e7 100644 --- a/pegainfer-kernels/csrc/shared/flashinfer_norm.cu +++ b/openinfer-kernels/csrc/shared/flashinfer_norm.cu @@ -25,7 +25,7 @@ using DType = __nv_bfloat16; -namespace pegainfer { +namespace openinfer { namespace norm { // Exact-preserving variant for the decode pattern: @@ -165,7 +165,7 @@ cudaError_t FusedAddRMSNormRound(T* hidden, const T* residual, T* weight, T* out } } // namespace norm -} // namespace pegainfer +} // namespace openinfer __global__ void rms_norm_batched_serial_kernel(const DType *x, const DType *weight, DType *out, int hidden_dim, int seq_len, float eps) { @@ -253,7 +253,7 @@ CUresult fused_add_rms_norm_round_batched_cuda(DType *hidden, const DType *resid const DType *weight, DType *out, int hidden_dim, int batch_size, float eps, cudaStream_t stream) { - cudaError_t err = pegainfer::norm::FusedAddRMSNormRound( + cudaError_t err = openinfer::norm::FusedAddRMSNormRound( hidden, residual, const_cast(weight), out, /*batch_size=*/static_cast(batch_size), /*d=*/static_cast(hidden_dim), diff --git a/pegainfer-kernels/csrc/shared/flashinfer_sampling.cu b/openinfer-kernels/csrc/shared/flashinfer_sampling.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/flashinfer_sampling.cu rename to openinfer-kernels/csrc/shared/flashinfer_sampling.cu diff --git a/pegainfer-kernels/csrc/shared/flashinfer_top1.cu b/openinfer-kernels/csrc/shared/flashinfer_top1.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/flashinfer_top1.cu rename to openinfer-kernels/csrc/shared/flashinfer_top1.cu diff --git a/pegainfer-kernels/csrc/shared/fused_proj.cu b/openinfer-kernels/csrc/shared/fused_proj.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/fused_proj.cu rename to openinfer-kernels/csrc/shared/fused_proj.cu diff --git a/pegainfer-kernels/csrc/shared/linear.cu 
b/openinfer-kernels/csrc/shared/linear.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/linear.cu rename to openinfer-kernels/csrc/shared/linear.cu diff --git a/pegainfer-kernels/csrc/shared/norm.cu b/openinfer-kernels/csrc/shared/norm.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/norm.cu rename to openinfer-kernels/csrc/shared/norm.cu diff --git a/pegainfer-kernels/csrc/shared/paged_attention.cu b/openinfer-kernels/csrc/shared/paged_attention.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/paged_attention.cu rename to openinfer-kernels/csrc/shared/paged_attention.cu diff --git a/pegainfer-kernels/csrc/shared/prefill_attention.cu b/openinfer-kernels/csrc/shared/prefill_attention.cu similarity index 100% rename from pegainfer-kernels/csrc/shared/prefill_attention.cu rename to openinfer-kernels/csrc/shared/prefill_attention.cu diff --git a/pegainfer-kernels/src/ffi.rs b/openinfer-kernels/src/ffi.rs similarity index 89% rename from pegainfer-kernels/src/ffi.rs rename to openinfer-kernels/src/ffi.rs index c72cda6c..e6302865 100644 --- a/pegainfer-kernels/src/ffi.rs +++ b/openinfer-kernels/src/ffi.rs @@ -1,5 +1,5 @@ // FFI surface for CUDA/cuBLAS/FlashInfer kernels, split by owning model. -// Public paths are unchanged: `pegainfer_kernels::ffi::` resolves via the re-exports below. +// Public paths are unchanged: `openinfer_kernels::ffi::` resolves via the re-exports below. // Half type (16-bit float) - same layout as CUDA half. Shared ABI type used by all submodules. 
pub type Half = u16; diff --git a/pegainfer-kernels/src/ffi/deepep.rs b/openinfer-kernels/src/ffi/deepep.rs similarity index 100% rename from pegainfer-kernels/src/ffi/deepep.rs rename to openinfer-kernels/src/ffi/deepep.rs diff --git a/pegainfer-kernels/src/ffi/deepseek.rs b/openinfer-kernels/src/ffi/deepseek.rs similarity index 100% rename from pegainfer-kernels/src/ffi/deepseek.rs rename to openinfer-kernels/src/ffi/deepseek.rs diff --git a/pegainfer-kernels/src/ffi/kimi.rs b/openinfer-kernels/src/ffi/kimi.rs similarity index 100% rename from pegainfer-kernels/src/ffi/kimi.rs rename to openinfer-kernels/src/ffi/kimi.rs diff --git a/pegainfer-kernels/src/ffi/lora.rs b/openinfer-kernels/src/ffi/lora.rs similarity index 100% rename from pegainfer-kernels/src/ffi/lora.rs rename to openinfer-kernels/src/ffi/lora.rs diff --git a/pegainfer-kernels/src/ffi/qwen35.rs b/openinfer-kernels/src/ffi/qwen35.rs similarity index 100% rename from pegainfer-kernels/src/ffi/qwen35.rs rename to openinfer-kernels/src/ffi/qwen35.rs diff --git a/pegainfer-kernels/src/ffi/shared.rs b/openinfer-kernels/src/ffi/shared.rs similarity index 100% rename from pegainfer-kernels/src/ffi/shared.rs rename to openinfer-kernels/src/ffi/shared.rs diff --git a/pegainfer-kernels/src/forward_pass.rs b/openinfer-kernels/src/forward_pass.rs similarity index 100% rename from pegainfer-kernels/src/forward_pass.rs rename to openinfer-kernels/src/forward_pass.rs diff --git a/pegainfer-kernels/src/gpu_buffers.rs b/openinfer-kernels/src/gpu_buffers.rs similarity index 100% rename from pegainfer-kernels/src/gpu_buffers.rs rename to openinfer-kernels/src/gpu_buffers.rs diff --git a/pegainfer-kernels/src/lib.rs b/openinfer-kernels/src/lib.rs similarity index 100% rename from pegainfer-kernels/src/lib.rs rename to openinfer-kernels/src/lib.rs diff --git a/pegainfer-kernels/src/ops.rs b/openinfer-kernels/src/ops.rs similarity index 100% rename from pegainfer-kernels/src/ops.rs rename to openinfer-kernels/src/ops.rs 
diff --git a/pegainfer-kernels/src/ops/attention.rs b/openinfer-kernels/src/ops/attention.rs similarity index 100% rename from pegainfer-kernels/src/ops/attention.rs rename to openinfer-kernels/src/ops/attention.rs diff --git a/pegainfer-kernels/src/ops/deepep.rs b/openinfer-kernels/src/ops/deepep.rs similarity index 100% rename from pegainfer-kernels/src/ops/deepep.rs rename to openinfer-kernels/src/ops/deepep.rs diff --git a/pegainfer-kernels/src/ops/elementwise.rs b/openinfer-kernels/src/ops/elementwise.rs similarity index 100% rename from pegainfer-kernels/src/ops/elementwise.rs rename to openinfer-kernels/src/ops/elementwise.rs diff --git a/pegainfer-kernels/src/ops/embedding.rs b/openinfer-kernels/src/ops/embedding.rs similarity index 100% rename from pegainfer-kernels/src/ops/embedding.rs rename to openinfer-kernels/src/ops/embedding.rs diff --git a/pegainfer-kernels/src/ops/kimi_k2.rs b/openinfer-kernels/src/ops/kimi_k2.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2.rs rename to openinfer-kernels/src/ops/kimi_k2.rs diff --git a/pegainfer-kernels/src/ops/kimi_k2/experts.rs b/openinfer-kernels/src/ops/kimi_k2/experts.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2/experts.rs rename to openinfer-kernels/src/ops/kimi_k2/experts.rs diff --git a/pegainfer-kernels/src/ops/kimi_k2/mla.rs b/openinfer-kernels/src/ops/kimi_k2/mla.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2/mla.rs rename to openinfer-kernels/src/ops/kimi_k2/mla.rs diff --git a/pegainfer-kernels/src/ops/kimi_k2/mla_rt.rs b/openinfer-kernels/src/ops/kimi_k2/mla_rt.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2/mla_rt.rs rename to openinfer-kernels/src/ops/kimi_k2/mla_rt.rs diff --git a/pegainfer-kernels/src/ops/kimi_k2/o_proj.rs b/openinfer-kernels/src/ops/kimi_k2/o_proj.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2/o_proj.rs rename to openinfer-kernels/src/ops/kimi_k2/o_proj.rs diff 
--git a/pegainfer-kernels/src/ops/kimi_k2/router.rs b/openinfer-kernels/src/ops/kimi_k2/router.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2/router.rs rename to openinfer-kernels/src/ops/kimi_k2/router.rs diff --git a/pegainfer-kernels/src/ops/kimi_k2/shared_gate_up.rs b/openinfer-kernels/src/ops/kimi_k2/shared_gate_up.rs similarity index 100% rename from pegainfer-kernels/src/ops/kimi_k2/shared_gate_up.rs rename to openinfer-kernels/src/ops/kimi_k2/shared_gate_up.rs diff --git a/pegainfer-kernels/src/ops/linear.rs b/openinfer-kernels/src/ops/linear.rs similarity index 100% rename from pegainfer-kernels/src/ops/linear.rs rename to openinfer-kernels/src/ops/linear.rs diff --git a/pegainfer-kernels/src/ops/lora.rs b/openinfer-kernels/src/ops/lora.rs similarity index 100% rename from pegainfer-kernels/src/ops/lora.rs rename to openinfer-kernels/src/ops/lora.rs diff --git a/pegainfer-kernels/src/ops/norm.rs b/openinfer-kernels/src/ops/norm.rs similarity index 100% rename from pegainfer-kernels/src/ops/norm.rs rename to openinfer-kernels/src/ops/norm.rs diff --git a/pegainfer-kernels/src/ops/sampling.rs b/openinfer-kernels/src/ops/sampling.rs similarity index 100% rename from pegainfer-kernels/src/ops/sampling.rs rename to openinfer-kernels/src/ops/sampling.rs diff --git a/pegainfer-kernels/src/paged_kv.rs b/openinfer-kernels/src/paged_kv.rs similarity index 100% rename from pegainfer-kernels/src/paged_kv.rs rename to openinfer-kernels/src/paged_kv.rs diff --git a/pegainfer-kernels/src/tensor.rs b/openinfer-kernels/src/tensor.rs similarity index 100% rename from pegainfer-kernels/src/tensor.rs rename to openinfer-kernels/src/tensor.rs diff --git a/pegainfer-kernels/src/typed_ops.rs b/openinfer-kernels/src/typed_ops.rs similarity index 100% rename from pegainfer-kernels/src/typed_ops.rs rename to openinfer-kernels/src/typed_ops.rs diff --git a/pegainfer-kernels/tests/deepseek_compressor_nonoverlap.rs 
b/openinfer-kernels/tests/deepseek_compressor_nonoverlap.rs similarity index 99% rename from pegainfer-kernels/tests/deepseek_compressor_nonoverlap.rs rename to openinfer-kernels/tests/deepseek_compressor_nonoverlap.rs index 92a49492..105e7092 100644 --- a/pegainfer-kernels/tests/deepseek_compressor_nonoverlap.rs +++ b/openinfer-kernels/tests/deepseek_compressor_nonoverlap.rs @@ -7,7 +7,7 @@ use std::ptr; use anyhow::{Context, Result, ensure}; use cudarc::driver::sys::CUstream; use half::bf16; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; const CUDA_MEMCPY_HOST_TO_DEVICE: i32 = 1; const CUDA_MEMCPY_DEVICE_TO_HOST: i32 = 2; diff --git a/pegainfer-kernels/tests/deepseek_compressor_overlap.rs b/openinfer-kernels/tests/deepseek_compressor_overlap.rs similarity index 99% rename from pegainfer-kernels/tests/deepseek_compressor_overlap.rs rename to openinfer-kernels/tests/deepseek_compressor_overlap.rs index 8305cfe8..268e56d7 100644 --- a/pegainfer-kernels/tests/deepseek_compressor_overlap.rs +++ b/openinfer-kernels/tests/deepseek_compressor_overlap.rs @@ -7,7 +7,7 @@ use std::ptr; use anyhow::{Context, Result, ensure}; use cudarc::driver::sys::CUstream; use half::bf16; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; const CUDA_MEMCPY_HOST_TO_DEVICE: i32 = 1; const CUDA_MEMCPY_DEVICE_TO_HOST: i32 = 2; diff --git a/pegainfer-kernels/tests/deepseek_cutedsl_indexer.rs b/openinfer-kernels/tests/deepseek_cutedsl_indexer.rs similarity index 99% rename from pegainfer-kernels/tests/deepseek_cutedsl_indexer.rs rename to openinfer-kernels/tests/deepseek_cutedsl_indexer.rs index c5a347e1..ddd3bdfe 100644 --- a/pegainfer-kernels/tests/deepseek_cutedsl_indexer.rs +++ b/openinfer-kernels/tests/deepseek_cutedsl_indexer.rs @@ -7,7 +7,7 @@ use std::ptr; use anyhow::{Result, ensure}; use cudarc::driver::sys::{CUresult, CUstream}; use half::bf16; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; const HEAD_DIM: usize = 128; const CUDA_MEMCPY_HOST_TO_DEVICE: 
i32 = 1; diff --git a/pegainfer-kernels/tests/deepseek_indexer_topk.rs b/openinfer-kernels/tests/deepseek_indexer_topk.rs similarity index 99% rename from pegainfer-kernels/tests/deepseek_indexer_topk.rs rename to openinfer-kernels/tests/deepseek_indexer_topk.rs index 51293f11..beee2474 100644 --- a/pegainfer-kernels/tests/deepseek_indexer_topk.rs +++ b/openinfer-kernels/tests/deepseek_indexer_topk.rs @@ -6,7 +6,7 @@ use std::ptr; use anyhow::{Result, ensure}; use cudarc::driver::sys::CUstream; -use pegainfer_kernels::ffi; +use openinfer_kernels::ffi; const CUDA_MEMCPY_HOST_TO_DEVICE: i32 = 1; const CUDA_MEMCPY_DEVICE_TO_HOST: i32 = 2; diff --git a/pegainfer-kernels/third_party/DeepEP b/openinfer-kernels/third_party/DeepEP similarity index 100% rename from pegainfer-kernels/third_party/DeepEP rename to openinfer-kernels/third_party/DeepEP diff --git a/pegainfer-kernels/third_party/flashinfer b/openinfer-kernels/third_party/flashinfer similarity index 100% rename from pegainfer-kernels/third_party/flashinfer rename to openinfer-kernels/third_party/flashinfer diff --git a/pegainfer-kernels/tools/cutedsl/deepseek_v4/generate.py b/openinfer-kernels/tools/cutedsl/deepseek_v4/generate.py similarity index 98% rename from pegainfer-kernels/tools/cutedsl/deepseek_v4/generate.py rename to openinfer-kernels/tools/cutedsl/deepseek_v4/generate.py index 0af2fb93..f9ce3650 100644 --- a/pegainfer-kernels/tools/cutedsl/deepseek_v4/generate.py +++ b/openinfer-kernels/tools/cutedsl/deepseek_v4/generate.py @@ -30,7 +30,7 @@ def load_sm120_gemm_class(cutlass_root: Path): gemm_path = find_sm120_dense_gemm_path(cutlass_root) - spec = importlib.util.spec_from_file_location("pegainfer_cutedsl_sm120_dense_gemm", gemm_path) + spec = importlib.util.spec_from_file_location("openinfer_cutedsl_sm120_dense_gemm", gemm_path) module = importlib.util.module_from_spec(spec) assert spec.loader is not None spec.loader.exec_module(module) @@ -59,7 +59,7 @@ def find_cutlass_root(repo_root: Path, 
explicit: str | None) -> Path: [ repo_root / "../cutlass-upstream", repo_root / "cutlass-upstream", - repo_root / "pegainfer-kernels/third_party/flashinfer/3rdparty/cutlass", + repo_root / "openinfer-kernels/third_party/flashinfer/3rdparty/cutlass", ] ) for candidate in candidates: @@ -74,7 +74,7 @@ def find_cutlass_root(repo_root: Path, explicit: str | None) -> Path: ): return candidate raise FileNotFoundError( - "Could not find CUTLASS CuTe DSL examples. Set PEGAINFER_CUTEDSL_CUTLASS_ROOT." + "Could not find CUTLASS CuTe DSL examples. Set OPENINFER_CUTEDSL_CUTLASS_ROOT." ) diff --git a/pegainfer-kernels/tools/deepseek_v4/score_select_bench.cu b/openinfer-kernels/tools/deepseek_v4/score_select_bench.cu similarity index 100% rename from pegainfer-kernels/tools/deepseek_v4/score_select_bench.cu rename to openinfer-kernels/tools/deepseek_v4/score_select_bench.cu diff --git a/pegainfer-kernels/tools/deepseek_v4/swiglu_quant_bench.cu b/openinfer-kernels/tools/deepseek_v4/swiglu_quant_bench.cu similarity index 100% rename from pegainfer-kernels/tools/deepseek_v4/swiglu_quant_bench.cu rename to openinfer-kernels/tools/deepseek_v4/swiglu_quant_bench.cu diff --git a/pegainfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu b/openinfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu similarity index 100% rename from pegainfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu rename to openinfer-kernels/tools/deepseek_v4/w13_grouped_fp4_bench.cu diff --git a/pegainfer-kernels/tools/kimi_k2/compare_logits_fixture.py b/openinfer-kernels/tools/kimi_k2/compare_logits_fixture.py similarity index 98% rename from pegainfer-kernels/tools/kimi_k2/compare_logits_fixture.py rename to openinfer-kernels/tools/kimi_k2/compare_logits_fixture.py index 4459452b..e5aa12d3 100644 --- a/pegainfer-kernels/tools/kimi_k2/compare_logits_fixture.py +++ b/openinfer-kernels/tools/kimi_k2/compare_logits_fixture.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Compare PegaInfer Kimi-K2 logits 
against an external HF logits fixture. +"""Compare OpenInfer Kimi-K2 logits against an external HF logits fixture. The reference directory must be produced by hf_logits_reference.py. The candidate safetensors file must contain a full-vocab FP32 logits tensor, by diff --git a/pegainfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py b/openinfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py similarity index 99% rename from pegainfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py rename to openinfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py index dd62863b..a48657f3 100644 --- a/pegainfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py +++ b/openinfer-kernels/tools/kimi_k2/compare_vllm_topk_fixture.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Compare PegaInfer Kimi-K2 full-vocab logits with a vLLM top-logprobs fixture.""" +"""Compare OpenInfer Kimi-K2 full-vocab logits with a vLLM top-logprobs fixture.""" from __future__ import annotations diff --git a/pegainfer-kernels/tools/kimi_k2/hf_logits_reference.py b/openinfer-kernels/tools/kimi_k2/hf_logits_reference.py similarity index 98% rename from pegainfer-kernels/tools/kimi_k2/hf_logits_reference.py rename to openinfer-kernels/tools/kimi_k2/hf_logits_reference.py index c7f099fc..ee4a6cef 100644 --- a/pegainfer-kernels/tools/kimi_k2/hf_logits_reference.py +++ b/openinfer-kernels/tools/kimi_k2/hf_logits_reference.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Dump a full-model Kimi-K2 one-token logits fixture from HF remote code. -This is the parity source for PegaInfer full-model logits gates. It intentionally +This is the parity source for OpenInfer full-model logits gates. It intentionally loads the model from an existing local model directory and writes the rendered prompt, input ids, raw last-token logits, top-k logits, and top-k logprobs. 
""" @@ -131,7 +131,7 @@ def patch_kimi_remote_code_init_weights(model_load_path: Path) -> str: module_name = f"transformers_modules.{model_load_path.name}.modeling_deepseek" module = importlib.import_module(module_name) base_cls = module.DeepseekV3PreTrainedModel - if getattr(base_cls, "_pegainfer_compressed_init_patch", False): + if getattr(base_cls, "_openinfer_compressed_init_patch", False): return module_name original_init_weights = base_cls._init_weights @@ -145,7 +145,7 @@ def patched_init_weights(self: Any, layer: torch.nn.Module) -> None: original_init_weights(self, layer) base_cls._init_weights = patched_init_weights - base_cls._pegainfer_compressed_init_patch = True + base_cls._openinfer_compressed_init_patch = True return module_name diff --git a/pegainfer-kernels/tools/kimi_k2/kimi_k25_parity_prompts.json b/openinfer-kernels/tools/kimi_k2/kimi_k25_parity_prompts.json similarity index 100% rename from pegainfer-kernels/tools/kimi_k2/kimi_k25_parity_prompts.json rename to openinfer-kernels/tools/kimi_k2/kimi_k25_parity_prompts.json diff --git a/pegainfer-kernels/tools/kimi_k2/torch_reference.py b/openinfer-kernels/tools/kimi_k2/torch_reference.py similarity index 100% rename from pegainfer-kernels/tools/kimi_k2/torch_reference.py rename to openinfer-kernels/tools/kimi_k2/torch_reference.py diff --git a/pegainfer-kernels/tools/kimi_k2/vllm_logits_reference.py b/openinfer-kernels/tools/kimi_k2/vllm_logits_reference.py similarity index 99% rename from pegainfer-kernels/tools/kimi_k2/vllm_logits_reference.py rename to openinfer-kernels/tools/kimi_k2/vllm_logits_reference.py index 34dd171c..91808ff8 100644 --- a/pegainfer-kernels/tools/kimi_k2/vllm_logits_reference.py +++ b/openinfer-kernels/tools/kimi_k2/vllm_logits_reference.py @@ -3,7 +3,7 @@ vLLM's public generation API exposes generated tokens and top logprobs rather than stable raw logits. 
Use this fixture to cross-check temperature-0 greedy -serving behavior against PegaInfer full-vocab candidate logits. +serving behavior against OpenInfer full-vocab candidate logits. """ from __future__ import annotations diff --git a/pegainfer-kernels/tools/kimi_k2/vllm_marlin_wna16_reference.py b/openinfer-kernels/tools/kimi_k2/vllm_marlin_wna16_reference.py similarity index 100% rename from pegainfer-kernels/tools/kimi_k2/vllm_marlin_wna16_reference.py rename to openinfer-kernels/tools/kimi_k2/vllm_marlin_wna16_reference.py diff --git a/pegainfer-kernels/tools/tilelang/README.md b/openinfer-kernels/tools/tilelang/README.md similarity index 95% rename from pegainfer-kernels/tools/tilelang/README.md rename to openinfer-kernels/tools/tilelang/README.md index 0e80ebb8..987541d4 100644 --- a/pegainfer-kernels/tools/tilelang/README.md +++ b/openinfer-kernels/tools/tilelang/README.md @@ -1,7 +1,7 @@ # TileLang Generators This directory owns TileLang-based CUDA source generators used by -`pegainfer-kernels`. +`openinfer-kernels`. Keep the technology boundary here and put model- or shape-family-specific programs in subdirectories: diff --git a/pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py b/openinfer-kernels/tools/tilelang/deepseek_v4/generate.py similarity index 99% rename from pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py rename to openinfer-kernels/tools/tilelang/deepseek_v4/generate.py index 43f9c6cd..3227d965 100644 --- a/pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py +++ b/openinfer-kernels/tools/tilelang/deepseek_v4/generate.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Generate DeepSeek V4 TileLang CUDA kernels for pegainfer. +"""Generate DeepSeek V4 TileLang CUDA kernels for openinfer. The TileLang programs below are adapted from DeepSeek-AI's official `DeepSeek-V4-Flash/inference/kernel.py` kernels. 
The upstream model repository @@ -974,7 +974,7 @@ def main() -> None: ) out_path.write_text( - "// Generated by pegainfer-kernels/tools/tilelang/deepseek_v4/generate.py\n" + "// Generated by openinfer-kernels/tools/tilelang/deepseek_v4/generate.py\n" "#include \n" "#include \n" "\n" diff --git a/pegainfer-kernels/tools/triton/README.md b/openinfer-kernels/tools/triton/README.md similarity index 76% rename from pegainfer-kernels/tools/triton/README.md rename to openinfer-kernels/tools/triton/README.md index 56b06b4d..93b38065 100644 --- a/pegainfer-kernels/tools/triton/README.md +++ b/openinfer-kernels/tools/triton/README.md @@ -1,6 +1,6 @@ # Triton AOT Integration -`pegainfer` currently uses Triton AOT for the Qwen3.5 HD256 prefill kernel and the +`openinfer` currently uses Triton AOT for the Qwen3.5 HD256 prefill kernel and the Qwen3.5 GDR chunkwise prefill kernels. ## What this covers @@ -30,7 +30,7 @@ uv pip install -p .venv/bin/python triton Then either point the build to that interpreter explicitly: ```bash -export PEGAINFER_TRITON_PYTHON=$PWD/.venv/bin/python +export OPENINFER_TRITON_PYTHON=$PWD/.venv/bin/python ``` or let `build.rs` auto-probe `.venv/bin/python` before trying `python3` / `python`. @@ -38,10 +38,10 @@ or let `build.rs` auto-probe `.venv/bin/python` before trying `python3` / `pytho If `nvidia-smi` is unavailable where you build, also set the target SM manually. ```bash -export PEGAINFER_CUDA_SM=120 +export OPENINFER_CUDA_SM=120 ``` -`PEGAINFER_CUDA_SM` also drives the explicit Triton AOT compile target, so it is the default escape hatch when the build environment cannot query a live GPU. +`OPENINFER_CUDA_SM` also drives the explicit Triton AOT compile target, so it is the default escape hatch when the build environment cannot query a live GPU. ### Windows @@ -50,7 +50,7 @@ Official Triton does not ship Windows wheels. 
Use [`triton-windows`](https://git ```powershell uv venv .venv --python 3.12 uv pip install "triton-windows<3.7" -$env:PEGAINFER_TRITON_PYTHON = ".venv\Scripts\python.exe" +$env:OPENINFER_TRITON_PYTHON = ".venv\Scripts\python.exe" $env:CUDA_PATH = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.x" ``` @@ -65,7 +65,7 @@ cargo build --release Generated Triton artifacts are written to Cargo `OUT_DIR`, typically under: ```text -target/release/build/pegainfer-kernels-*/out/triton_aot/ +target/release/build/openinfer-kernels-*/out/triton_aot/ ``` ## Validation @@ -73,17 +73,17 @@ target/release/build/pegainfer-kernels-*/out/triton_aot/ Run the focused GPU tests for the active Triton-backed paths: ```bash -cargo test --release -p pegainfer-qwen35-4b recurrent::tests::conv1d_prefill_handoff_matches_single_prefill -- --nocapture -PEGAINFER_TEST_MODEL_PATH=/path/to/Qwen3.5-4B cargo test --release -p pegainfer-qwen35-4b --test e2e_scheduler -- --nocapture +cargo test --release -p openinfer-qwen35-4b recurrent::tests::conv1d_prefill_handoff_matches_single_prefill -- --nocapture +OPENINFER_TEST_MODEL_PATH=/path/to/Qwen3.5-4B cargo test --release -p openinfer-qwen35-4b --test e2e_scheduler -- --nocapture ``` ## Common failures - `Could not find a Python interpreter with Triton installed` - - Set `PEGAINFER_TRITON_PYTHON`, or bootstrap `.venv` with `uv`. + - Set `OPENINFER_TRITON_PYTHON`, or bootstrap `.venv` with `uv`. - `GPU detection failed` - - Set `PEGAINFER_CUDA_SM` explicitly if `nvidia-smi` is not available during build. + - Set `OPENINFER_CUDA_SM` explicitly if `nvidia-smi` is not available during build. - `Triton AOT generator failed` - - Re-run the build and inspect the generator stderr printed by `build.rs`; the generator accepts an explicit `cuda::32` target derived from `PEGAINFER_CUDA_SM`. + - Re-run the build and inspect the generator stderr printed by `build.rs`; the generator accepts an explicit `cuda::32` target derived from `OPENINFER_CUDA_SM`. 
- `CUDA_ERROR_NO_BINARY_FOR_GPU` or similar runtime load errors - Rebuild on the target GPU environment; the generated Triton cubin is target-specific. diff --git a/pegainfer-kernels/tools/triton/flash_attention_prefill_hd256_kernel.py b/openinfer-kernels/tools/triton/flash_attention_prefill_hd256_kernel.py similarity index 100% rename from pegainfer-kernels/tools/triton/flash_attention_prefill_hd256_kernel.py rename to openinfer-kernels/tools/triton/flash_attention_prefill_hd256_kernel.py diff --git a/pegainfer-kernels/tools/triton/gated_delta_rule_chunkwise_kernels.py b/openinfer-kernels/tools/triton/gated_delta_rule_chunkwise_kernels.py similarity index 99% rename from pegainfer-kernels/tools/triton/gated_delta_rule_chunkwise_kernels.py rename to openinfer-kernels/tools/triton/gated_delta_rule_chunkwise_kernels.py index 04647b88..e2227750 100644 --- a/pegainfer-kernels/tools/triton/gated_delta_rule_chunkwise_kernels.py +++ b/openinfer-kernels/tools/triton/gated_delta_rule_chunkwise_kernels.py @@ -11,7 +11,7 @@ # - fla/ops/common/chunk_o.py # - fla/ops/gated_delta_rule/wy_fast.py # -# pegainfer-specific changes: +# openinfer-specific changes: # - fixed Qwen3.5 shapes (batch=1, H=32, K=128, V=128, chunk_size=64) # - Triton AOT-friendly surface and wrapper contracts # - no backward / varlen / generic autotune surface @@ -218,7 +218,7 @@ def gdr_solve_tril_64_qwen35_kernel( """Fixed-size BT=64 solve_tril for batch=1, fixed-length chunk-wise GDR. Adapted from FLA's chunk-wise WY / solve_tril path, specialized down to the - Qwen3.5 runtime shape and Triton AOT constraints used by pegainfer. + Qwen3.5 runtime shape and Triton AOT constraints used by openinfer. """ chunk_idx = tl.program_id(0) v_head = tl.program_id(1) @@ -446,7 +446,7 @@ def gdr_chunk_state_qwen35_kernel( """Chunk-wise recurrent-state update for Qwen3.5 GDR prefill. 
Adapted from FLA's chunk-state / delta-h recurrence kernels, then reshaped - for pegainfer's fixed Qwen3.5 runtime and decode-state contract. + for openinfer's fixed Qwen3.5 runtime and decode-state contract. This stage assumes `g`, `w`, and `u` are already prepared. It stores one per-chunk state snapshot in `[K, V]` scratch layout, writes token-level @@ -625,7 +625,7 @@ def gdr_chunk_o_qwen35_kernel( """Chunk-wise output stage for Qwen3.5 GDR prefill. Adapted from FLA's chunk-wise output kernel, then specialized for the - fixed-shape Qwen3.5 runtime path used by pegainfer. + fixed-shape Qwen3.5 runtime path used by openinfer. This stage consumes normalized q/k, token-level `v_new`, and per-chunk state snapshots to produce the final token-major output. diff --git a/pegainfer-kernels/tools/triton/gen_triton_aot.py b/openinfer-kernels/tools/triton/gen_triton_aot.py similarity index 100% rename from pegainfer-kernels/tools/triton/gen_triton_aot.py rename to openinfer-kernels/tools/triton/gen_triton_aot.py diff --git a/pegainfer-kimi-k2/Cargo.toml b/openinfer-kimi-k2/Cargo.toml similarity index 78% rename from pegainfer-kimi-k2/Cargo.toml rename to openinfer-kimi-k2/Cargo.toml index 926d1e13..fe75ea52 100644 --- a/pegainfer-kimi-k2/Cargo.toml +++ b/openinfer-kimi-k2/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-kimi-k2" +name = "openinfer-kimi-k2" version = "0.1.0" edition = "2024" autobenches = false @@ -15,10 +15,10 @@ cudarc = { workspace = true, features = ["nccl"] } half = { workspace = true } log = { workspace = true } memmap2 = { workspace = true } -pegainfer-bench = { workspace = true } -pegainfer-core = { workspace = true } -pegainfer-kernels = { workspace = true } -pegainfer-kv-cache = { workspace = true } +openinfer-bench = { workspace = true } +openinfer-core = { workspace = true } +openinfer-kernels = { workspace = true } +openinfer-kv-cache = { workspace = true } rand = { workspace = true } safetensors = { workspace = true } serde = { workspace 
= true } @@ -27,8 +27,8 @@ tokio = { workspace = true } [features] default = [] -kimi-k2 = ["pegainfer-kernels/kimi-k2"] -kernel-call-trace = ["pegainfer-core/kernel-call-trace"] +kimi-k2 = ["openinfer-kernels/kimi-k2"] +kernel-call-trace = ["openinfer-core/kernel-call-trace"] kernel-report = [ "dep:clap", "kimi-k2", diff --git a/pegainfer-kimi-k2/src/batch_decode_trace.rs b/openinfer-kimi-k2/src/batch_decode_trace.rs similarity index 99% rename from pegainfer-kimi-k2/src/batch_decode_trace.rs rename to openinfer-kimi-k2/src/batch_decode_trace.rs index f5473815..670c8dcb 100644 --- a/pegainfer-kimi-k2/src/batch_decode_trace.rs +++ b/openinfer-kimi-k2/src/batch_decode_trace.rs @@ -1,11 +1,11 @@ use anyhow::{Result, ensure}; #[cfg(feature = "kernel-call-trace")] -use pegainfer_core::{ +use openinfer_core::{ engine::{EngineLoadOptions, GenerateRequest, TokenEvent}, ops::call_trace, sampler::SamplingParams, }; -use pegainfer_kernels::tensor::{ +use openinfer_kernels::tensor::{ AxisSpec, Bf16, Contiguous1D, F32, HiddenStatesLayout, I32, KernelCall, RowMajor2D, TensorSpec, U32, }; @@ -16,7 +16,7 @@ use crate::config::{ KIMI_K2_DENSE_INTERMEDIATE, KIMI_K2_HIDDEN, KIMI_K2_LAYERS, KIMI_K2_MOE_LAYERS, KIMI_K2_Q_LORA_RANK, KIMI_K2_ROUTED_EXPERTS, KIMI_K2_TOPK, KIMI_K2_VOCAB, }; -use pegainfer_kernels::ops::{ +use openinfer_kernels::ops::{ KIMI_K2_EXPERT_INTERMEDIATE, KIMI_K2_MLA_ABS_Q_LOCAL_OUT_TP8, KIMI_K2_MLA_KV_B_LOCAL_OUT_TP8, KIMI_K2_MLA_KV_LORA_RANK, KIMI_K2_MLA_O_LOCAL_IN_TP8, KIMI_K2_MLA_Q_LOCAL_OUT_TP8, KIMI_K2_MLA_Q_PE_LOCAL_OUT_TP8, KIMI_K2_MLA_QKV_A_OUT, KIMI_K2_MLA_ROPE_DIM, diff --git a/pegainfer-kimi-k2/src/bin/kimi_kernel_report.rs b/openinfer-kimi-k2/src/bin/kimi_kernel_report.rs similarity index 96% rename from pegainfer-kimi-k2/src/bin/kimi_kernel_report.rs rename to openinfer-kimi-k2/src/bin/kimi_kernel_report.rs index 299c7111..5626d28f 100644 --- a/pegainfer-kimi-k2/src/bin/kimi_kernel_report.rs +++ b/openinfer-kimi-k2/src/bin/kimi_kernel_report.rs @@ 
-4,11 +4,11 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail}; use clap::{Parser, Subcommand, ValueEnum}; use log::info; -use pegainfer_kernels::tensor::KernelCall; -use pegainfer_kimi_k2::batch_decode_trace::{ +use openinfer_kernels::tensor::KernelCall; +use openinfer_kimi_k2::batch_decode_trace::{ MODEL, trace_decode_kernel_calls, trace_runtime_decode_kernel_calls, }; -use pegainfer_kimi_k2::kernel_report::{MeasuredCall, bench_key, measure_call}; +use openinfer_kimi_k2::kernel_report::{MeasuredCall, bench_key, measure_call}; use serde::Serialize; const DEFAULT_ITERS: u64 = 32; @@ -82,7 +82,7 @@ struct KernelReport { } fn main() -> Result<()> { - pegainfer_core::logging::init_default(); + openinfer_core::logging::init_default(); match Cli::parse().command { Command::Run(args) => run(args), Command::Trace(args) => trace(args), diff --git a/pegainfer-kimi-k2/src/bin/kimi_model_report.rs b/openinfer-kimi-k2/src/bin/kimi_model_report.rs similarity index 97% rename from pegainfer-kimi-k2/src/bin/kimi_model_report.rs rename to openinfer-kimi-k2/src/bin/kimi_model_report.rs index 3acf3c68..4778dc98 100644 --- a/pegainfer-kimi-k2/src/bin/kimi_model_report.rs +++ b/openinfer-kimi-k2/src/bin/kimi_model_report.rs @@ -4,15 +4,15 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail}; use clap::{Parser, ValueEnum}; -use pegainfer_bench::{Accum, CallSiteRow, RollupRow, accumulate, call_site_row, rollup_row}; -use pegainfer_kernels::ops::KIMI_K2_EP_WORLD; -use pegainfer_kernels::tensor::KernelCall; -use pegainfer_kimi_k2::KIMI_K2_LAYERS; -use pegainfer_kimi_k2::batch_decode_trace::{ +use openinfer_bench::{Accum, CallSiteRow, RollupRow, accumulate, call_site_row, rollup_row}; +use openinfer_kernels::ops::KIMI_K2_EP_WORLD; +use openinfer_kernels::tensor::KernelCall; +use openinfer_kimi_k2::KIMI_K2_LAYERS; +use openinfer_kimi_k2::batch_decode_trace::{ MODEL, PHASE_DECODE, TP_WORLD_SIZE, normalize_call_site, trace_decode_kernel_calls, 
trace_runtime_decode_kernel_calls, }; -use pegainfer_kimi_k2::kernel_report::{LatencyStats, MeasuredCall, bench_key, measure_call}; +use openinfer_kimi_k2::kernel_report::{LatencyStats, MeasuredCall, bench_key, measure_call}; use serde::Serialize; const DEFAULT_ITERS: u64 = 16; diff --git a/pegainfer-kimi-k2/src/config.rs b/openinfer-kimi-k2/src/config.rs similarity index 99% rename from pegainfer-kimi-k2/src/config.rs rename to openinfer-kimi-k2/src/config.rs index 61f2a331..5a2b53fb 100644 --- a/pegainfer-kimi-k2/src/config.rs +++ b/openinfer-kimi-k2/src/config.rs @@ -399,8 +399,8 @@ impl KimiK2ParallelShape { } #[must_use] - pub(crate) fn parallel_config(&self) -> pegainfer_core::parallel::ParallelConfig { - pegainfer_core::parallel::ParallelConfig::new(self.tp_world, self.dp_world) + pub(crate) fn parallel_config(&self) -> openinfer_core::parallel::ParallelConfig { + openinfer_core::parallel::ParallelConfig::new(self.tp_world, self.dp_world) } } diff --git a/pegainfer-kimi-k2/src/kernel_report.rs b/openinfer-kimi-k2/src/kernel_report.rs similarity index 99% rename from pegainfer-kimi-k2/src/kernel_report.rs rename to openinfer-kimi-k2/src/kernel_report.rs index 0c8a7503..f6b3197c 100644 --- a/pegainfer-kimi-k2/src/kernel_report.rs +++ b/openinfer-kimi-k2/src/kernel_report.rs @@ -2,12 +2,12 @@ use anyhow::{Result, bail, ensure}; use cudarc::driver::CudaSlice; use half::bf16; // The model-agnostic harness — timing loop, latency stats, `KernelCall` accessors — -// lives in `pegainfer-bench`. Re-export the types the report bins consume so their -// `pegainfer_kimi_k2::kernel_report::{LatencyStats, MeasuredCall, bench_key}` imports +// lives in `openinfer-bench`. Re-export the types the report bins consume so their +// `openinfer_kimi_k2::kernel_report::{LatencyStats, MeasuredCall, bench_key}` imports // keep resolving here; only Kimi's `measure_*` providers and `measure_call` are local. 
-pub use pegainfer_bench::{LatencyStats, MeasuredCall, bench_key}; -use pegainfer_bench::{attr_usize, axis, input, measure_loop, output, zero_matrix, zero_weight}; -use pegainfer_kernels::{ +pub use openinfer_bench::{LatencyStats, MeasuredCall, bench_key}; +use openinfer_bench::{attr_usize, axis, input, measure_loop, output, zero_matrix, zero_weight}; +use openinfer_kernels::{ ops::{ KIMI_K2_EXPERT_INTERMEDIATE, KIMI_K2_HIDDEN, KIMI_K2_INT4_GROUP_SIZE, KIMI_K2_LOCAL_EXPERTS, KIMI_K2_MLA_ABS_Q_LOCAL_OUT_TP8, KIMI_K2_MLA_KV_B_LOCAL_OUT_TP8, diff --git a/pegainfer-kimi-k2/src/lib.rs b/openinfer-kimi-k2/src/lib.rs similarity index 95% rename from pegainfer-kimi-k2/src/lib.rs rename to openinfer-kimi-k2/src/lib.rs index f7466cee..3f6ca134 100644 --- a/pegainfer-kimi-k2/src/lib.rs +++ b/openinfer-kimi-k2/src/lib.rs @@ -14,7 +14,7 @@ use std::path::Path; use anyhow::Result; -use pegainfer_core::engine::{EngineHandle, EngineLoadOptions}; +use openinfer_core::engine::{EngineHandle, EngineLoadOptions}; #[cfg(feature = "kimi-k2")] pub mod batch_decode_trace; diff --git a/pegainfer-kimi-k2/src/runner.rs b/openinfer-kimi-k2/src/runner.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner.rs rename to openinfer-kimi-k2/src/runner.rs diff --git a/pegainfer-kimi-k2/src/runner/affinity.rs b/openinfer-kimi-k2/src/runner/affinity.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/affinity.rs rename to openinfer-kimi-k2/src/runner/affinity.rs index 85eeb655..c72d3a33 100644 --- a/pegainfer-kimi-k2/src/runner/affinity.rs +++ b/openinfer-kimi-k2/src/runner/affinity.rs @@ -1,7 +1,7 @@ use std::collections::BTreeSet; use anyhow::{Context, Result, ensure}; -use pegainfer_core::cpu_topology::{ +use openinfer_core::cpu_topology::{ CpuId, RankCpuSlice, RankNumaNode, cuda_device_numa_node, current_allowed_cpus, pin_current_thread_to_cpu, read_numa_cpu_pool, split_rank_cpu_slices, }; diff --git a/pegainfer-kimi-k2/src/runner/bringup.rs 
b/openinfer-kimi-k2/src/runner/bringup.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/bringup.rs rename to openinfer-kimi-k2/src/runner/bringup.rs index 5f078aa5..e77909b1 100644 --- a/pegainfer-kimi-k2/src/runner/bringup.rs +++ b/openinfer-kimi-k2/src/runner/bringup.rs @@ -10,7 +10,7 @@ use anyhow::{Context, Result, bail, ensure}; use bytesize::ByteSize; use crossbeam_channel::bounded; use log::{debug, info}; -use pegainfer_core::{ +use openinfer_core::{ engine::{EngineHandle, EngineLoadOptions, EpBackend, GenerateRequest}, parallel::ParallelConfig, }; @@ -28,7 +28,7 @@ use crate::{ }, weights::{KimiRankGpuContext, KimiRankSlicedLoadPlan, ensure_text_only_model_index}, }; -use pegainfer_kv_cache::BlockPool; +use openinfer_kv_cache::BlockPool; /// TP8 replicates the KV pool on every rank: 8192 pages × 16 tokens × /// (576 ckv + 64 kpe) bf16 ≈ 9.2 GiB per rank — the same footprint as the @@ -378,7 +378,7 @@ fn init_tp_nccl(workers: &[KimiRankWorker]) -> Result<()> { fn install_deepep_backends(workers: &[KimiRankWorker]) -> Result<()> { let started = Instant::now(); info!("start install DeepEP EP backend: ranks={}", workers.len()); - let unique_id = pegainfer_kernels::ops::deepep_unique_id()?; + let unique_id = openinfer_kernels::ops::deepep_unique_id()?; let receivers = workers .iter() .map(|worker| worker.enable_deepep_async(unique_id, workers.len())) diff --git a/pegainfer-kimi-k2/src/runner/config.rs b/openinfer-kimi-k2/src/runner/config.rs similarity index 95% rename from pegainfer-kimi-k2/src/runner/config.rs rename to openinfer-kimi-k2/src/runner/config.rs index fa0b4bc0..91b99816 100644 --- a/pegainfer-kimi-k2/src/runner/config.rs +++ b/openinfer-kimi-k2/src/runner/config.rs @@ -1,6 +1,6 @@ use std::path::PathBuf; -use pegainfer_core::parallel::ParallelConfig; +use openinfer_core::parallel::ParallelConfig; use crate::runner::affinity::KimiRankThreadPlacementPlan; use crate::runner::worker::KimiK2RankPlacement; diff --git 
a/pegainfer-kimi-k2/src/runner/executor.rs b/openinfer-kimi-k2/src/runner/executor.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner/executor.rs rename to openinfer-kimi-k2/src/runner/executor.rs diff --git a/pegainfer-kimi-k2/src/runner/executor/tp1_dp8.rs b/openinfer-kimi-k2/src/runner/executor/tp1_dp8.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner/executor/tp1_dp8.rs rename to openinfer-kimi-k2/src/runner/executor/tp1_dp8.rs diff --git a/pegainfer-kimi-k2/src/runner/executor/tp8_dp1.rs b/openinfer-kimi-k2/src/runner/executor/tp8_dp1.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/executor/tp8_dp1.rs rename to openinfer-kimi-k2/src/runner/executor/tp8_dp1.rs index a8e170f6..15473198 100644 --- a/pegainfer-kimi-k2/src/runner/executor/tp8_dp1.rs +++ b/openinfer-kimi-k2/src/runner/executor/tp8_dp1.rs @@ -215,7 +215,7 @@ fn ensure_no_logprobs_tp8(requested: bool) -> Result<()> { Ok(()) } -fn ensure_greedy_tp8(sampling: &pegainfer_core::sampler::SamplingParams) -> Result<()> { +fn ensure_greedy_tp8(sampling: &openinfer_core::sampler::SamplingParams) -> Result<()> { ensure!( sampling.is_greedy(), "Kimi TP8 path does not support sampling yet: each rank holds a vocab \ diff --git a/pegainfer-kimi-k2/src/runner/load_balancer.rs b/openinfer-kimi-k2/src/runner/load_balancer.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner/load_balancer.rs rename to openinfer-kimi-k2/src/runner/load_balancer.rs diff --git a/pegainfer-kimi-k2/src/runner/moe_deepep.rs b/openinfer-kimi-k2/src/runner/moe_deepep.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/moe_deepep.rs rename to openinfer-kimi-k2/src/runner/moe_deepep.rs index d415bbf3..e10694f4 100644 --- a/pegainfer-kimi-k2/src/runner/moe_deepep.rs +++ b/openinfer-kimi-k2/src/runner/moe_deepep.rs @@ -25,7 +25,7 @@ use anyhow::{Context, Result, ensure}; use cudarc::driver::CudaSlice; -use pegainfer_kernels::{ +use openinfer_kernels::{ ops::{ DeepEp, 
DeepEpDispatchScratch, KIMI_K2_EP_WORLD, KIMI_K2_LOCAL_EXPERTS, KIMI_K2_ROUTER_SCALE, KIMI_K2_SHARED_GATE_UP, KimiMarlinInt4ExpertWeights, diff --git a/pegainfer-kimi-k2/src/runner/moe_nccl.rs b/openinfer-kimi-k2/src/runner/moe_nccl.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/moe_nccl.rs rename to openinfer-kimi-k2/src/runner/moe_nccl.rs index 06a0be17..bd45c65e 100644 --- a/pegainfer-kimi-k2/src/runner/moe_nccl.rs +++ b/openinfer-kimi-k2/src/runner/moe_nccl.rs @@ -24,7 +24,7 @@ use anyhow::{Context, Result}; use cudarc::nccl::{ReduceOp, safe::Comm}; -use pegainfer_kernels::{ +use openinfer_kernels::{ ops::{ KIMI_K2_EP_WORLD, KIMI_K2_ROUTER_SCALE, KimiMarlinRouteWorkspace, KimiMarlinWna16Workspace, KimiRouterBatch, KimiRouterConfig, KimiRouterOutput, kimi_add_f32_bf16_to_bf16, @@ -352,7 +352,7 @@ pub(super) fn forward_moe_layer_batch_into( seq_len * KIMI_K2_HIDDEN, KIMI_K2_ROUTER_SCALE, )?; - pegainfer_kernels::typed_pipeline! { + openinfer_kernels::typed_pipeline! { ctx = ctx, eps = KIMI_K2_RMS_NORM_EPS; add(hidden, &shared_out => next_hidden); } diff --git a/pegainfer-kimi-k2/src/runner/scheduler.rs b/openinfer-kimi-k2/src/runner/scheduler.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/scheduler.rs rename to openinfer-kimi-k2/src/runner/scheduler.rs index 08a9a195..41db27e3 100644 --- a/pegainfer-kimi-k2/src/runner/scheduler.rs +++ b/openinfer-kimi-k2/src/runner/scheduler.rs @@ -12,8 +12,8 @@ use crate::runner::worker::{KimiKvStepPages, KimiRowOptions}; use anyhow::{Context, Result}; use lifecycle::{preflight_prefill_candidate, send_scheduled, validate_kv_capacity}; use log::error; -use pegainfer_core::engine::{FinishReason, GenerateRequest, TokenEvent}; -use pegainfer_kv_cache::{BlockPool, RequestKv}; +use openinfer_core::engine::{FinishReason, GenerateRequest, TokenEvent}; +use openinfer_kv_cache::{BlockPool, RequestKv}; use tokio::sync::mpsc; const KIMI_RUNNER_MAX_BATCH: usize = 64; @@ -614,7 +614,7 @@ impl 
KimiK2Scheduler { mod tests { use std::sync::{Arc, Mutex}; - use pegainfer_core::sampler::SamplingParams; + use openinfer_core::sampler::SamplingParams; use crate::runner::worker::KimiOneTokenForwardReport; diff --git a/pegainfer-kimi-k2/src/runner/scheduler/dp.rs b/openinfer-kimi-k2/src/runner/scheduler/dp.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/scheduler/dp.rs rename to openinfer-kimi-k2/src/runner/scheduler/dp.rs index 75565221..3a8344af 100644 --- a/pegainfer-kimi-k2/src/runner/scheduler/dp.rs +++ b/openinfer-kimi-k2/src/runner/scheduler/dp.rs @@ -1,8 +1,8 @@ use anyhow::Result; use crossbeam_channel::{Receiver, Sender, bounded}; use log::error; -use pegainfer_core::engine::{FinishReason, GenerateRequest, TokenEvent}; -use pegainfer_kv_cache::{BlockPool, RequestKv}; +use openinfer_core::engine::{FinishReason, GenerateRequest, TokenEvent}; +use openinfer_kv_cache::{BlockPool, RequestKv}; use rand::rngs::StdRng; use tokio::sync::mpsc; @@ -1087,7 +1087,7 @@ fn rank_forward_loop( #[cfg(test)] mod tests { - use pegainfer_core::sampler::SamplingParams; + use openinfer_core::sampler::SamplingParams; use super::*; diff --git a/pegainfer-kimi-k2/src/runner/scheduler/lifecycle.rs b/openinfer-kimi-k2/src/runner/scheduler/lifecycle.rs similarity index 98% rename from pegainfer-kimi-k2/src/runner/scheduler/lifecycle.rs rename to openinfer-kimi-k2/src/runner/scheduler/lifecycle.rs index 328fba27..12ef7fed 100644 --- a/pegainfer-kimi-k2/src/runner/scheduler/lifecycle.rs +++ b/openinfer-kimi-k2/src/runner/scheduler/lifecycle.rs @@ -1,6 +1,6 @@ use std::time::{SystemTime, UNIX_EPOCH}; -use pegainfer_core::engine::{FinishReason, GenerateRequest, TokenEvent}; +use openinfer_core::engine::{FinishReason, GenerateRequest, TokenEvent}; use crate::runner::worker::KIMI_MAX_REQUEST_TOKENS; diff --git a/pegainfer-kimi-k2/src/runner/worker.rs b/openinfer-kimi-k2/src/runner/worker.rs similarity index 97% rename from pegainfer-kimi-k2/src/runner/worker.rs rename to 
openinfer-kimi-k2/src/runner/worker.rs index 2e77f219..65cea85f 100644 --- a/pegainfer-kimi-k2/src/runner/worker.rs +++ b/openinfer-kimi-k2/src/runner/worker.rs @@ -14,11 +14,11 @@ use cudarc::nccl::{ safe::{Comm, Id}, }; use log::debug; -use pegainfer_core::cuda_graph::CudaGraphState; -use pegainfer_core::engine::TokenLogprob; +use openinfer_core::cuda_graph::CudaGraphState; +use openinfer_core::engine::TokenLogprob; #[cfg(feature = "kernel-call-trace")] -use pegainfer_core::ops::call_trace; -use pegainfer_kernels::{ +use openinfer_core::ops::call_trace; +use openinfer_kernels::{ ops::{ KIMI_K2_LOCAL_EXPERTS, KIMI_K2_MLA_KV_A_OUT, KIMI_K2_MLA_KV_LORA_RANK, KIMI_K2_MLA_Q_HEAD_DIM, KIMI_K2_MLA_QKV_A_OUT, KIMI_K2_MLA_ROPE_DIM, @@ -153,7 +153,7 @@ pub(super) struct KimiRankWeightLoadReport { #[derive(Clone, Copy, Debug, Default)] pub(crate) struct KimiRowOptions { pub(crate) logprobs: usize, - pub(crate) sampling: pegainfer_core::sampler::SamplingParams, + pub(crate) sampling: openinfer_core::sampler::SamplingParams, } #[derive(Clone, Debug, PartialEq)] @@ -623,10 +623,10 @@ struct KimiCublasThreadGuard; impl Drop for KimiCublasThreadGuard { fn drop(&mut self) { unsafe { - pegainfer_kernels::ffi::kimi_mla_cublaslt_destroy_cuda(); - pegainfer_kernels::ffi::kimi_o_proj_cublaslt_destroy_cuda(); - pegainfer_kernels::ffi::kimi_shared_gate_up_cublaslt_destroy_cuda(); - pegainfer_kernels::ffi::cublas_destroy(); + openinfer_kernels::ffi::kimi_mla_cublaslt_destroy_cuda(); + openinfer_kernels::ffi::kimi_o_proj_cublaslt_destroy_cuda(); + openinfer_kernels::ffi::kimi_shared_gate_up_cublaslt_destroy_cuda(); + openinfer_kernels::ffi::cublas_destroy(); } } } @@ -643,8 +643,8 @@ fn bind_rank_thread( ctx.set_current()?; let decode_aux_ctx = ctx.auxiliary_device_context("decode aux")?; unsafe { - pegainfer_kernels::ffi::cublas_init(); - let status = pegainfer_kernels::ffi::kimi_shared_gate_up_cublaslt_init_cuda(); + openinfer_kernels::ffi::cublas_init(); + let status = 
openinfer_kernels::ffi::kimi_shared_gate_up_cublaslt_init_cuda(); if status != 0 { if status >= 100_000 { anyhow::bail!( @@ -657,7 +657,7 @@ fn bind_rank_thread( status ); } - let status = pegainfer_kernels::ffi::kimi_mla_cublaslt_init_cuda(); + let status = openinfer_kernels::ffi::kimi_mla_cublaslt_init_cuda(); if status != 0 { if status >= 100_000 { anyhow::bail!( @@ -668,7 +668,7 @@ fn bind_rank_thread( anyhow::bail!("Kimi MLA cuBLASLt init failed: cuda_status={}", status); } if local_dims.o_proj_in == KIMI_O_PROJ_CUBLASLT_INPUT { - let status = pegainfer_kernels::ffi::kimi_o_proj_cublaslt_init_cuda(); + let status = openinfer_kernels::ffi::kimi_o_proj_cublaslt_init_cuda(); if status != 0 { if status >= 100_000 { anyhow::bail!( diff --git a/pegainfer-kimi-k2/src/runner/worker/cache.rs b/openinfer-kimi-k2/src/runner/worker/cache.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner/worker/cache.rs rename to openinfer-kimi-k2/src/runner/worker/cache.rs diff --git a/pegainfer-kimi-k2/src/runner/worker/forward.rs b/openinfer-kimi-k2/src/runner/worker/forward.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner/worker/forward.rs rename to openinfer-kimi-k2/src/runner/worker/forward.rs diff --git a/pegainfer-kimi-k2/src/runner/worker/load.rs b/openinfer-kimi-k2/src/runner/worker/load.rs similarity index 100% rename from pegainfer-kimi-k2/src/runner/worker/load.rs rename to openinfer-kimi-k2/src/runner/worker/load.rs diff --git a/pegainfer-kimi-k2/src/runner/worker/runtime.rs b/openinfer-kimi-k2/src/runner/worker/runtime.rs similarity index 99% rename from pegainfer-kimi-k2/src/runner/worker/runtime.rs rename to openinfer-kimi-k2/src/runner/worker/runtime.rs index f06fc052..2ced8fae 100644 --- a/pegainfer-kimi-k2/src/runner/worker/runtime.rs +++ b/openinfer-kimi-k2/src/runner/worker/runtime.rs @@ -1,6 +1,6 @@ use super::*; -use pegainfer_core::engine::TokenLogprob; -use pegainfer_kernels::ffi; +use openinfer_core::engine::TokenLogprob; +use 
openinfer_kernels::ffi; pub(in crate::runner) fn all_reduce_hidden_via_f32_in_place( ctx: &DeviceContext, @@ -323,7 +323,7 @@ pub(super) fn launch_local_top1_batch( out.len(), active_rows ); - let partials = pegainfer_kernels::ops::argmax_batch_bf16_split_partials_len( + let partials = openinfer_kernels::ops::argmax_batch_bf16_split_partials_len( active_rows, logits.hidden_dim, ); diff --git a/pegainfer-kimi-k2/src/runner/worker/state.rs b/openinfer-kimi-k2/src/runner/worker/state.rs similarity index 98% rename from pegainfer-kimi-k2/src/runner/worker/state.rs rename to openinfer-kimi-k2/src/runner/worker/state.rs index 6681484f..c56686df 100644 --- a/pegainfer-kimi-k2/src/runner/worker/state.rs +++ b/openinfer-kimi-k2/src/runner/worker/state.rs @@ -349,11 +349,11 @@ impl KimiRankThreadState { // Non-greedy rows: one batched FlashInfer sampling pass over the // logits arena (its own sync, in addition to the argmax read below). // All-greedy batches skip this entirely — the greedy path is unchanged. 
- let sampling_rows: Vec = rows + let sampling_rows: Vec = rows .iter() .enumerate() .filter(|(_, r)| !r.sampling.is_greedy()) - .map(|(i, r)| pegainfer_kernels::ops::BatchSamplingRow { + .map(|(i, r)| openinfer_kernels::ops::BatchSamplingRow { row: i, temperature: r.sampling.temperature, top_k: r.sampling.top_k, @@ -369,7 +369,7 @@ impl KimiRankThreadState { cannot sample the global distribution (#237, #226)" ); let scratch = decode_arena.scratch.sampling.batch_sampling(&device_ctx)?; - pegainfer_kernels::ops::gpu_sample_batch_into( + openinfer_kernels::ops::gpu_sample_batch_into( &device_ctx, decode_arena.logits.as_ref(), &sampling_rows, @@ -626,16 +626,16 @@ impl KimiRankThreadState { "Kimi sampling requires an unsharded vocab (TP1); a vocab shard \ cannot sample the global distribution (#237, #226)" ); - let sampling_rows = [pegainfer_kernels::ops::BatchSamplingRow { + let sampling_rows = [openinfer_kernels::ops::BatchSamplingRow { row: 0, temperature: row.sampling.temperature, top_k: row.sampling.top_k, top_p: row.sampling.top_p, }]; let scratch = decode_arena.scratch.sampling.batch_sampling(&device_ctx)?; - let sampled = pegainfer_kernels::ops::gpu_sample_batch_into( + let sampled = openinfer_kernels::ops::gpu_sample_batch_into( &device_ctx, - pegainfer_kernels::tensor::HiddenStatesRef { + openinfer_kernels::tensor::HiddenStatesRef { data: &logits.data, hidden_dim: logits.len, seq_len: 1, @@ -704,7 +704,7 @@ impl KimiRankThreadState { let kv_len = cached_tokens + seq_len; let q_proj_out = local_heads * KIMI_K2_MLA_Q_HEAD_DIM; let kv_b_out = attention.kv_b_proj.rows; - pegainfer_kernels::typed_pipeline! { + openinfer_kernels::typed_pipeline! 
{ ctx = ctx, eps = KIMI_K2_RMS_NORM_EPS, seq_len = seq_len, gemm = prefill; tensor qkv_a: KIMI_K2_MLA_QKV_A_OUT; tensor q_a: KIMI_K2_Q_LORA_RANK; diff --git a/pegainfer-kimi-k2/src/typed_scratch.rs b/openinfer-kimi-k2/src/typed_scratch.rs similarity index 96% rename from pegainfer-kimi-k2/src/typed_scratch.rs rename to openinfer-kimi-k2/src/typed_scratch.rs index bdb350eb..7dc0b382 100644 --- a/pegainfer-kimi-k2/src/typed_scratch.rs +++ b/openinfer-kimi-k2/src/typed_scratch.rs @@ -2,14 +2,14 @@ use anyhow::{Result, ensure}; use cudarc::driver::CudaSlice; -use pegainfer_kernels::gpu_buffers; -use pegainfer_kernels::tensor::{DeviceContext, GpuTensor, HiddenStates}; +use openinfer_kernels::gpu_buffers; +use openinfer_kernels::tensor::{DeviceContext, GpuTensor, HiddenStates}; use crate::config::{ KIMI_K2_EXPERT_INTERMEDIATE, KIMI_K2_HIDDEN, KIMI_K2_Q_LORA_RANK, KIMI_K2_ROUTED_EXPERTS, KIMI_K2_TOPK, KIMI_K2_VOCAB, KimiLocalDims, }; -use pegainfer_kernels::ops::{ +use openinfer_kernels::ops::{ KIMI_K2_EP_WORLD, KIMI_K2_MLA_KV_LORA_RANK, KIMI_K2_MLA_QKV_A_OUT, KIMI_K2_MLA_ROPE_DIM, KimiMarlinRouteWorkspace, KimiMarlinWna16Workspace, argmax_batch_bf16_split_partials_len, }; @@ -149,7 +149,7 @@ pub(crate) struct SamplingScratch { /// Buffers for non-greedy rows (f32 probs are batch x vocab, ~42 MB at /// batch 64) — allocated on the first sampling request so greedy-only /// serving pays nothing. 
- batch_sampling: Option, + batch_sampling: Option, batch_size: usize, } @@ -169,9 +169,9 @@ impl SamplingScratch { pub(crate) fn batch_sampling( &mut self, ctx: &DeviceContext, - ) -> Result<&mut pegainfer_kernels::ops::BatchSamplingScratch> { + ) -> Result<&mut openinfer_kernels::ops::BatchSamplingScratch> { if self.batch_sampling.is_none() { - self.batch_sampling = Some(pegainfer_kernels::ops::BatchSamplingScratch::new( + self.batch_sampling = Some(openinfer_kernels::ops::BatchSamplingScratch::new( ctx, self.batch_size, KIMI_K2_VOCAB, diff --git a/pegainfer-kimi-k2/src/weights.rs b/openinfer-kimi-k2/src/weights.rs similarity index 93% rename from pegainfer-kimi-k2/src/weights.rs rename to openinfer-kimi-k2/src/weights.rs index 68e94650..41152af9 100644 --- a/pegainfer-kimi-k2/src/weights.rs +++ b/openinfer-kimi-k2/src/weights.rs @@ -21,13 +21,13 @@ use cudarc::driver::{ use half::bf16; use log::debug; use memmap2::Mmap; -use pegainfer_kernels::ffi; -use pegainfer_kernels::ops::{ +use openinfer_kernels::ffi; +use openinfer_kernels::ops::{ KimiInt4ExpertRole, KimiInt4NibbleOrder, KimiInt4WeightManifest, KimiMarlinFusedW13Int4Weight, KimiMarlinInt4ExpertWeights, KimiMarlinInt4Weight, kimi_marlin_int4_fuse_w13, kimi_marlin_int4_reorder_scale, kimi_marlin_int4_reorder_weight, }; -use pegainfer_kernels::tensor::{DeviceContext, DeviceMatrix, DeviceVec, GpuWeight}; +use openinfer_kernels::tensor::{DeviceContext, DeviceMatrix, DeviceVec, GpuWeight}; use safetensors::{Dtype, SafeTensors}; use serde_json::Value; diff --git a/pegainfer-kimi-k2/src/weights/context.rs b/openinfer-kimi-k2/src/weights/context.rs similarity index 100% rename from pegainfer-kimi-k2/src/weights/context.rs rename to openinfer-kimi-k2/src/weights/context.rs diff --git a/pegainfer-kimi-k2/src/weights/load.rs b/openinfer-kimi-k2/src/weights/load.rs similarity index 100% rename from pegainfer-kimi-k2/src/weights/load.rs rename to openinfer-kimi-k2/src/weights/load.rs diff --git 
a/pegainfer-kimi-k2/src/weights/manifest.rs b/openinfer-kimi-k2/src/weights/manifest.rs similarity index 100% rename from pegainfer-kimi-k2/src/weights/manifest.rs rename to openinfer-kimi-k2/src/weights/manifest.rs diff --git a/pegainfer-kimi-k2/src/weights/package.rs b/openinfer-kimi-k2/src/weights/package.rs similarity index 100% rename from pegainfer-kimi-k2/src/weights/package.rs rename to openinfer-kimi-k2/src/weights/package.rs diff --git a/pegainfer-kimi-k2/src/weights/tests.rs b/openinfer-kimi-k2/src/weights/tests.rs similarity index 100% rename from pegainfer-kimi-k2/src/weights/tests.rs rename to openinfer-kimi-k2/src/weights/tests.rs diff --git a/pegainfer-kimi-k2/tests/vllm_golden_gate.rs b/openinfer-kimi-k2/tests/vllm_golden_gate.rs similarity index 96% rename from pegainfer-kimi-k2/tests/vllm_golden_gate.rs rename to openinfer-kimi-k2/tests/vllm_golden_gate.rs index 3510acae..ec187cd4 100644 --- a/pegainfer-kimi-k2/tests/vllm_golden_gate.rs +++ b/openinfer-kimi-k2/tests/vllm_golden_gate.rs @@ -13,13 +13,13 @@ //! //! The TP1/DP8 path emits exact per-token logprobs (host log-softmax of the //! full-vocab logits row, computed only when requested), so on top of token -//! comparison the gate bounds a two-sided |Δlogprob| distribution: pegainfer's +//! comparison the gate bounds a two-sided |Δlogprob| distribution: openinfer's //! own logprob of its pick against vLLM's logprob of the same token. Two //! passes through the *real serving path* //! (EngineHandle → DP coordinator → DeepEP → MLA kernels, TP1/DP8/EP8): //! //! * teacher-forced argmax sweep — for every position i, prefill -//! `prompt + tail[..i]` with max_tokens=1. pegainfer's pick must satisfy +//! `prompt + tail[..i]` with max_tokens=1. openinfer's pick must satisfy //! the flatness-scaled regret rule (see `REGRET_BASE`): how far it may //! sit below vLLM's own argmax *in vLLM's logprobs* grows with vLLM's //! 
own uncertainty at that position — near-exact agreement where vLLM is @@ -44,7 +44,7 @@ //! #293). Repeated prompts across passes also hit the prefix cache, so //! every bound in this gate exercises the cached-prefill path too. //! -//! Requires 8 GPUs and Kimi-K2.6 weights. `PEGAINFER_TEST_MODEL_PATH` must +//! Requires 8 GPUs and Kimi-K2.6 weights. `OPENINFER_TEST_MODEL_PATH` must //! point at the weights and the fixture must exist — both fail loudly when //! missing. No silent skip: a gate that can quietly report "ok 0.00s" guards //! nothing (the qwen35 gate's env-gated skip taught us that). Building the @@ -54,11 +54,11 @@ use std::path::Path; use std::time::{Duration, Instant}; -use pegainfer_core::engine::{ +use openinfer_core::engine::{ EngineHandle, EngineLoadOptions, EpBackend, TokenEvent, TokenLogprob, }; -use pegainfer_core::parallel::ParallelConfig; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::parallel::ParallelConfig; +use openinfer_core::sampler::SamplingParams; use safetensors::{Dtype, SafeTensors}; const FIXTURE: &str = concat!( @@ -66,7 +66,7 @@ const FIXTURE: &str = concat!( "/../test_data/kimi-k2.6-vllm-golden.safetensors" ); -/// Per-position regret rule: pegainfer's pick must satisfy +/// Per-position regret rule: openinfer's pick must satisfy /// /// regret ≤ REGRET_BASE + REGRET_FLATNESS_SLOPE × (−vllm_top1_lp) /// @@ -76,7 +76,7 @@ const FIXTURE: &str = concat!( /// agreement — while at a flat, multi-modal position (top-1 ≈ 11%) it /// reaches ≈ 1.07, because there is no single correct token for /// cross-engine noise to deviate from. The bound depends only on the -/// committed vLLM fixture, so pegainfer cannot influence its own +/// committed vLLM fixture, so openinfer cannot influence its own /// tolerance. 
/// /// Calibration (three 8×H200 runs, 2026-06-05/06): cross-engine INT4 @@ -92,7 +92,7 @@ const FIXTURE: &str = concat!( const REGRET_BASE: f32 = 0.30; const REGRET_FLATNESS_SLOPE: f32 = 0.35; -/// Aggregate guard: per pass, the fraction of positions where pegainfer's +/// Aggregate guard: per pass, the fraction of positions where openinfer's /// pick equals vLLM's argmax exactly. A systematic numerical bug shows up /// as *many* small in-bound flips long before any single pick violates the /// per-position rule — this floor catches that. Measured 97.7–98.4% across @@ -110,7 +110,7 @@ const EXACT_FLOOR: f64 = 0.95; const COVERAGE_FLOOR: f64 = 0.60; /// Two-sided |Δlogprob| bounds over *exact-match* positions: where both -/// engines pick the same token, Δ between pegainfer's exact host log-softmax +/// engines pick the same token, Δ between openinfer's exact host log-softmax /// and vLLM's stored logprob is pure numerical drift — the same shape as the /// Qwen golden gates, measured through a different engine. Flip positions are /// deliberately excluded: their Δ is structurally larger (the engines @@ -279,9 +279,9 @@ fn load_fixture() -> Fixture { } fn model_path() -> String { - let path = std::env::var("PEGAINFER_TEST_MODEL_PATH").unwrap_or_else(|_| { + let path = std::env::var("OPENINFER_TEST_MODEL_PATH").unwrap_or_else(|_| { panic!( - "kimi vllm_golden_gate: PEGAINFER_TEST_MODEL_PATH is not set. \ + "kimi vllm_golden_gate: OPENINFER_TEST_MODEL_PATH is not set. \ This gate needs 8 GPUs and Kimi-K2.6 weights; it fails rather \ than silently skipping." 
) @@ -294,7 +294,7 @@ fn model_path() -> String { } fn start_engine(path: &str) -> EngineHandle { - pegainfer_kimi_k2::start_engine( + openinfer_kimi_k2::start_engine( Path::new(path), EngineLoadOptions { enable_cuda_graph: false, @@ -322,7 +322,7 @@ fn submit( ) -> PendingRequest { let (tx, rx) = tokio::sync::mpsc::unbounded_channel(); engine - .submit(pegainfer_core::engine::GenerateRequest { + .submit(openinfer_core::engine::GenerateRequest { request_id: Some(label.clone()), queued_at_unix_s: None, prompt_tokens: prompt.to_vec(), @@ -399,7 +399,7 @@ struct RegretStats { violations: Vec, } -/// Fold pegainfer's pick at one position into the stats, applying the +/// Fold openinfer's pick at one position into the stats, applying the /// flatness-scaled regret rule and accumulating the two-sided |Δlogprob|. fn check_pick(stats: &mut RegretStats, seq: &Seq, pos: usize, pick: u32, pick_lp: &TokenLogprob) { stats.positions += 1; @@ -413,7 +413,7 @@ fn check_pick(stats: &mut RegretStats, seq: &Seq, pos: usize, pick: u32, pick_lp .map_or(f32::NEG_INFINITY, |&(_, lp)| lp); if pick_lp.logprob < own_top - 1e-5 { stats.violations.push(format!( - "{} pos {pos}: pick {pick} logprob {:.4} sits below pegainfer's \ + "{} pos {pos}: pick {pick} logprob {:.4} sits below openinfer's \ own top-1 {:.4} — argmax and log-softmax disagree on the same logits", seq.name, pick_lp.logprob, own_top, )); @@ -433,7 +433,7 @@ fn check_pick(stats: &mut RegretStats, seq: &Seq, pos: usize, pick: u32, pick_lp } match seq.vllm_logprob(pos, pick) { None => stats.violations.push(format!( - "{} pos {pos}: pegainfer picked {pick}, absent from vLLM's top-{} \ + "{} pos {pos}: openinfer picked {pick}, absent from vLLM's top-{} \ (vLLM argmax {vllm_top1}) — confidently wrong on a token vLLM does not rank", seq.name, seq.topk_ids[pos].len(), @@ -445,7 +445,7 @@ fn check_pick(stats: &mut RegretStats, seq: &Seq, pos: usize, pick: u32, pick_lp stats.flips.push(regret); } else { stats.violations.push(format!( - 
"{} pos {pos}: pegainfer picked {pick}, which vLLM scores \ + "{} pos {pos}: openinfer picked {pick}, which vLLM scores \ {regret:.4} nat below its argmax {vllm_top1} (top-1 lp \ {vllm_top1_lp:.2}, bound {bound:.4})", seq.name, @@ -615,7 +615,7 @@ fn greedy_parity( // as benign tie-flip or violation. eprintln!( "vllm_golden_gate: greedy:{} diverged at pos {pos}/{} \ - (pegainfer {tok}, vLLM {})", + (openinfer {tok}, vLLM {})", seq.name, fixture.meta.decode_tokens, seq.tail_token_ids[pos], ); break; diff --git a/pegainfer-kv-cache/Cargo.toml b/openinfer-kv-cache/Cargo.toml similarity index 78% rename from pegainfer-kv-cache/Cargo.toml rename to openinfer-kv-cache/Cargo.toml index d19217cd..3c4ef7c2 100644 --- a/pegainfer-kv-cache/Cargo.toml +++ b/openinfer-kv-cache/Cargo.toml @@ -1,10 +1,10 @@ [package] -name = "pegainfer-kv-cache" +name = "openinfer-kv-cache" version = "0.1.0" edition = "2024" [dependencies] -pegainfer-kernels = { workspace = true } +openinfer-kernels = { workspace = true } kvbm-logical = { workspace = true } dynamo-kv-hashing = { workspace = true } anyhow = { workspace = true } diff --git a/pegainfer-kv-cache/src/buffer.rs b/openinfer-kv-cache/src/buffer.rs similarity index 100% rename from pegainfer-kv-cache/src/buffer.rs rename to openinfer-kv-cache/src/buffer.rs diff --git a/pegainfer-kv-cache/src/layout.rs b/openinfer-kv-cache/src/layout.rs similarity index 92% rename from pegainfer-kv-cache/src/layout.rs rename to openinfer-kv-cache/src/layout.rs index fad2ea54..ec64e50b 100644 --- a/pegainfer-kv-cache/src/layout.rs +++ b/openinfer-kv-cache/src/layout.rs @@ -31,8 +31,8 @@ impl KvLayout { } } - pub fn kernel_layout(&self) -> pegainfer_kernels::paged_kv::PagedKvLayout { - pegainfer_kernels::paged_kv::PagedKvLayout { + pub fn kernel_layout(&self) -> openinfer_kernels::paged_kv::PagedKvLayout { + openinfer_kernels::paged_kv::PagedKvLayout { page_size: self.page_size, num_layers: self.num_layers, num_kv_heads: self.num_kv_heads, diff --git 
a/pegainfer-kv-cache/src/lib.rs b/openinfer-kv-cache/src/lib.rs similarity index 100% rename from pegainfer-kv-cache/src/lib.rs rename to openinfer-kv-cache/src/lib.rs diff --git a/pegainfer-kv-cache/src/manager.rs b/openinfer-kv-cache/src/manager.rs similarity index 100% rename from pegainfer-kv-cache/src/manager.rs rename to openinfer-kv-cache/src/manager.rs diff --git a/pegainfer-kv-cache/src/pool.rs b/openinfer-kv-cache/src/pool.rs similarity index 100% rename from pegainfer-kv-cache/src/pool.rs rename to openinfer-kv-cache/src/pool.rs diff --git a/pegainfer-kv-cache/src/view.rs b/openinfer-kv-cache/src/view.rs similarity index 100% rename from pegainfer-kv-cache/src/view.rs rename to openinfer-kv-cache/src/view.rs diff --git a/pegainfer-kv-cache/tests/lifecycle.rs b/openinfer-kv-cache/tests/lifecycle.rs similarity index 99% rename from pegainfer-kv-cache/tests/lifecycle.rs rename to openinfer-kv-cache/tests/lifecycle.rs index 8c10d19b..feddc643 100644 --- a/pegainfer-kv-cache/tests/lifecycle.rs +++ b/openinfer-kv-cache/tests/lifecycle.rs @@ -1,4 +1,4 @@ -use pegainfer_kv_cache::KvCacheManager; +use openinfer_kv_cache::KvCacheManager; fn make_manager(num_blocks: usize) -> KvCacheManager { let ctx = cudarc::driver::CudaContext::new(0).expect("CUDA context"); diff --git a/pegainfer-kv-offload/Cargo.toml b/openinfer-kv-offload/Cargo.toml similarity index 92% rename from pegainfer-kv-offload/Cargo.toml rename to openinfer-kv-offload/Cargo.toml index 2d8b9e3a..0398cd7c 100644 --- a/pegainfer-kv-offload/Cargo.toml +++ b/openinfer-kv-offload/Cargo.toml @@ -1,11 +1,11 @@ [package] -name = "pegainfer-kv-offload" +name = "openinfer-kv-offload" version = "0.1.0" edition = "2024" [dependencies] # Embedded in-process: pegaflow-core is the host/SSD/RDMA KV offload tier. 
-# Pinned to the upstream master commit that landed the two changes pegainfer +# Pinned to the upstream master commit that landed the two changes openinfer # needs: block_stride_bytes (#331) and the in-process load API (#333). Bump the # rev when upstreaming further pegaflow changes; co-develop via a temporary path # dep + PR, then re-pin here. @@ -14,7 +14,7 @@ edition = "2024" # dense GPU<->CPU path). The workspace cudarc already provides cuda-12090 + # nvrtc, which is all pegaflow-core's code needs — it has no cfg(cuda-12) gates. pegaflow-core = { git = "https://github.com/novitalabs/pegaflow.git", rev = "07cac7e50e8ae7be15ad1b9311401039c9ee439b", default-features = false } -pegainfer-kv-cache = { workspace = true } +openinfer-kv-cache = { workspace = true } cudarc = { workspace = true } anyhow = { workspace = true } half = { workspace = true } diff --git a/pegainfer-kv-offload/src/engine.rs b/openinfer-kv-offload/src/engine.rs similarity index 97% rename from pegainfer-kv-offload/src/engine.rs rename to openinfer-kv-offload/src/engine.rs index 432b859f..1a288fb2 100644 --- a/pegainfer-kv-offload/src/engine.rs +++ b/openinfer-kv-offload/src/engine.rs @@ -1,8 +1,8 @@ //! [`OffloadEngine`]: the in-process connector that moves KV blocks between -//! pegainfer's GPU paged cache and pegaflow's host/SSD tiers. +//! openinfer's GPU paged cache and pegaflow's host/SSD tiers. //! //! It owns a [`PegaEngine`] plus a small tokio runtime to drive pegaflow's -//! async save/query, and translates pegainfer's page-first [`KvLayout`] into +//! async save/query, and translates openinfer's page-first [`KvLayout`] into //! pegaflow's per-layer strided registration. Block content hashes are opaque //! `Vec` here — the caller (scheduler) derives them from kvbm sequence //! hashes, so this layer never depends on the logical-cache hashing scheme. 
@@ -10,17 +10,17 @@ use std::sync::{Arc, Mutex}; use cudarc::driver::CudaStream; +use openinfer_kv_cache::KvBuffer; use pegaflow_core::{ EngineError, LayerSave, PegaEngine, PrefetchStatus, QueryLeaseId, StorageConfig, }; -use pegainfer_kv_cache::KvBuffer; use tokio::runtime::Runtime; use tokio::sync::oneshot; use tokio::task::JoinHandle; /// Single-GPU, single-rank topology. The dense Qwen3-4B path runs one offload /// engine per executor rank, each owning one GPU's KV buffer. -const NAMESPACE: &str = "pegainfer"; +const NAMESPACE: &str = "openinfer"; const TP_RANK: usize = 0; const PP_RANK: usize = 0; const TP_SIZE: usize = 1; @@ -164,7 +164,7 @@ impl Registration { } } -/// In-process bridge from pegainfer's GPU KV cache to pegaflow's offload tiers. +/// In-process bridge from openinfer's GPU KV cache to pegaflow's offload tiers. /// /// Dropping the engine drops its [`Runtime`], which abandons any in-flight /// fire-and-forget [`Self::save`] tasks. That is acceptable: the host tier is a @@ -190,7 +190,7 @@ impl OffloadEngine { /// /// `stream` must be the stream that owns `buffer` (used only to read its /// base device address). pegaflow attaches the device's primary CUDA - /// context for its own worker transfers — the same context pegainfer runs + /// context for its own worker transfers — the same context openinfer runs /// on — so the registered pointers are valid across both. pub fn new( config: OffloadConfig, @@ -261,7 +261,7 @@ impl OffloadEngine { /// the same (block_id, hash) pairing — only the device data differs. /// /// ORDERING CONTRACT: pegaflow's D2H runs on *its own* stream, with no - /// dependency on pegainfer's compute stream. The caller must therefore only + /// dependency on openinfer's compute stream. The caller must therefore only /// save blocks whose KV writes are already complete — i.e. call this after /// the producing forward step has synchronized (block-seal time, which is /// post-step-sync in the executor). 
Saving a block whose attention write is @@ -315,7 +315,7 @@ impl OffloadEngine { /// GPU block can be reused the moment this returns. Errors surface, unlike /// the fire-and-forget [`Self::save`]. The same compute-stream ORDERING /// CONTRACT as [`Self::save`] applies: blocking waits on pegaflow's D2H, not - /// on pegainfer's compute stream, so the writes must already be complete. + /// on openinfer's compute stream, so the writes must already be complete. pub fn save_blocking( &self, block_ids: &[i32], diff --git a/pegainfer-kv-offload/src/lib.rs b/openinfer-kv-offload/src/lib.rs similarity index 88% rename from pegainfer-kv-offload/src/lib.rs rename to openinfer-kv-offload/src/lib.rs index 3cf3a961..e2eb638c 100644 --- a/pegainfer-kv-offload/src/lib.rs +++ b/openinfer-kv-offload/src/lib.rs @@ -1,6 +1,6 @@ -//! In-process KV cache offload bridge between pegainfer and pegaflow. +//! In-process KV cache offload bridge between openinfer and pegaflow. //! -//! pegainfer owns the GPU paged-KV (`pegainfer-kv-cache::KvBuffer`, page-first +//! openinfer owns the GPU paged-KV (`openinfer-kv-cache::KvBuffer`, page-first //! layout) and the logical prefix cache (kvbm `BlockPool`). pegaflow owns the //! deeper tiers (host pinned memory, SSD, RDMA). [`OffloadEngine`] is the //! connector "brain" that moves blocks between them and decides when. 
diff --git a/pegainfer-kv-offload/tests/cpu_roundtrip.rs b/openinfer-kv-offload/tests/cpu_roundtrip.rs similarity index 98% rename from pegainfer-kv-offload/tests/cpu_roundtrip.rs rename to openinfer-kv-offload/tests/cpu_roundtrip.rs index f02f0c67..97eea105 100644 --- a/pegainfer-kv-offload/tests/cpu_roundtrip.rs +++ b/openinfer-kv-offload/tests/cpu_roundtrip.rs @@ -13,8 +13,8 @@ use cudarc::driver::{CudaContext, result}; use half::bf16; -use pegainfer_kv_cache::KvBuffer; -use pegainfer_kv_offload::{OffloadConfig, OffloadEngine}; +use openinfer_kv_cache::KvBuffer; +use openinfer_kv_offload::{OffloadConfig, OffloadEngine}; const NUM_LAYERS: usize = 4; const NUM_KV_HEADS: usize = 2; diff --git a/pegainfer-qwen3-4b/Cargo.toml b/openinfer-qwen3-4b/Cargo.toml similarity index 71% rename from pegainfer-qwen3-4b/Cargo.toml rename to openinfer-qwen3-4b/Cargo.toml index 5a017c4c..3bd03e70 100644 --- a/pegainfer-qwen3-4b/Cargo.toml +++ b/openinfer-qwen3-4b/Cargo.toml @@ -1,21 +1,21 @@ [package] -name = "pegainfer-qwen3-4b" +name = "openinfer-qwen3-4b" version = "0.1.0" edition = "2024" autobenches = false [dependencies] -pegainfer-bench = { workspace = true, optional = true } -pegainfer-core = { workspace = true } -pegainfer-cupti = { workspace = true, optional = true } -pegainfer-kernels = { workspace = true } +openinfer-bench = { workspace = true, optional = true } +openinfer-core = { workspace = true } +openinfer-cupti = { workspace = true, optional = true } +openinfer-kernels = { workspace = true } anyhow = { workspace = true } clap = { workspace = true, optional = true } comfy-table = { workspace = true, optional = true } crossbeam-channel = { workspace = true } cudarc = { workspace = true } -pegainfer-kv-cache = { workspace = true } -pegainfer-kv-offload = { workspace = true } +openinfer-kv-cache = { workspace = true } +openinfer-kv-offload = { workspace = true } fastrace = { workspace = true } half = { workspace = true } hex = { workspace = true, optional = true } @@ 
-29,20 +29,20 @@ tokio = { workspace = true, features = ["sync"] } toml = { workspace = true, optional = true } [features] -kernel-call-trace = ["pegainfer-core/kernel-call-trace"] +kernel-call-trace = ["openinfer-core/kernel-call-trace"] kernel-report = [ "dep:clap", "dep:comfy-table", "dep:hex", - "dep:pegainfer-bench", - "dep:pegainfer-cupti", + "dep:openinfer-bench", + "dep:openinfer-cupti", "dep:sha2", "dep:toml", "kernel-call-trace", ] [dev-dependencies] -pegainfer-vllm-support = { workspace = true } +openinfer-vllm-support = { workspace = true } tokio = { workspace = true, features = ["macros", "rt"] } vllm-text = { workspace = true } diff --git a/pegainfer-qwen3-4b/kernel_manifests/qwen3-4b.toml b/openinfer-qwen3-4b/kernel_manifests/qwen3-4b.toml similarity index 100% rename from pegainfer-qwen3-4b/kernel_manifests/qwen3-4b.toml rename to openinfer-qwen3-4b/kernel_manifests/qwen3-4b.toml diff --git a/pegainfer-qwen3-4b/src/batch_decode.rs b/openinfer-qwen3-4b/src/batch_decode.rs similarity index 95% rename from pegainfer-qwen3-4b/src/batch_decode.rs rename to openinfer-qwen3-4b/src/batch_decode.rs index 37b038bb..b504a11e 100644 --- a/pegainfer-qwen3-4b/src/batch_decode.rs +++ b/openinfer-qwen3-4b/src/batch_decode.rs @@ -11,10 +11,10 @@ use super::batch_decode_buffers::{ use super::batch_decode_dag::BatchDecodeDag; use super::weights::{PackedLoraProjection, Qwen3Model, TransformerBlock}; use crate::lora::LoraProjectionKind; -use pegainfer_core::kv_pool::KvLayout; -use pegainfer_core::ops; -use pegainfer_kernels::tensor::{KvDim, QDim}; -use pegainfer_kv_cache::KvView; +use openinfer_core::kv_pool::KvLayout; +use openinfer_core::ops; +use openinfer_kernels::tensor::{KvDim, QDim}; +use openinfer_kv_cache::KvView; #[cfg(feature = "kernel-call-trace")] macro_rules! 
dag_label { @@ -345,9 +345,9 @@ impl Qwen3Model { kind0: LoraProjectionKind, kind1: LoraProjectionKind, use_lora: bool, - input: &pegainfer_core::tensor::HiddenStates, - out0: &mut pegainfer_core::tensor::HiddenStates, - out1: &mut pegainfer_core::tensor::HiddenStates, + input: &openinfer_core::tensor::HiddenStates, + out0: &mut openinfer_core::tensor::HiddenStates, + out1: &mut openinfer_core::tensor::HiddenStates, token_slots: &CudaSlice, ) -> Result<()> { if !use_lora { @@ -367,10 +367,10 @@ impl Qwen3Model { kind1: LoraProjectionKind, kind2: LoraProjectionKind, use_lora: bool, - input: &pegainfer_core::tensor::HiddenStates, - out0: &mut pegainfer_core::tensor::HiddenStates, - out1: &mut pegainfer_core::tensor::HiddenStates, - out2: &mut pegainfer_core::tensor::HiddenStates, + input: &openinfer_core::tensor::HiddenStates, + out0: &mut openinfer_core::tensor::HiddenStates, + out1: &mut openinfer_core::tensor::HiddenStates, + out2: &mut openinfer_core::tensor::HiddenStates, token_slots: &CudaSlice, ) -> Result<()> { if !use_lora { @@ -387,7 +387,7 @@ impl Qwen3Model { &'a self, layer_idx: usize, kind: LoraProjectionKind, - out: &'a mut pegainfer_core::tensor::HiddenStates, + out: &'a mut openinfer_core::tensor::HiddenStates, ) -> Option> { let packed = self.packed_lora_projection(layer_idx, kind)?; Some(grouped_projection_from_packed(packed, out)) @@ -399,8 +399,8 @@ impl Qwen3Model { layer_idx: usize, kind: LoraProjectionKind, use_lora: bool, - input: &pegainfer_core::tensor::HiddenStates, - out: &mut pegainfer_core::tensor::HiddenStates, + input: &openinfer_core::tensor::HiddenStates, + out: &mut openinfer_core::tensor::HiddenStates, row_offset: usize, token_slots: &CudaSlice, ) -> Result<()> { @@ -430,7 +430,7 @@ impl Qwen3Model { fn grouped_projection_from_packed<'a>( packed: &'a PackedLoraProjection, - out: &'a mut pegainfer_core::tensor::HiddenStates, + out: &'a mut openinfer_core::tensor::HiddenStates, ) -> ops::LoraDecodeGroupedProjection<'a> { 
ops::LoraDecodeGroupedProjection { a_packed: &packed.a, @@ -448,10 +448,10 @@ mod tests { use super::*; use crate::batch_decode_buffers::BatchDecodeBuffers; use crate::weights::ModelRuntimeConfig; - use pegainfer_core::ops; - use pegainfer_core::sampler::SamplingParams; - use pegainfer_core::tensor::DeviceVec; - use pegainfer_kv_cache::{KvCacheManager, RequestKv}; + use openinfer_core::ops; + use openinfer_core::sampler::SamplingParams; + use openinfer_core::tensor::DeviceVec; + use openinfer_kv_cache::{KvCacheManager, RequestKv}; use rand::SeedableRng; use rand::rngs::StdRng; use std::path::Path; @@ -459,14 +459,14 @@ mod tests { const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); fn get_model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping Qwen3 batch decode model test because {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping Qwen3 batch decode model test because {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -506,7 +506,7 @@ mod tests { let mut scratch_row_states = model .ctx .stream - .alloc_zeros(pegainfer_core::ops::flashinfer_topk_row_states_bytes()) + .alloc_zeros(openinfer_core::ops::flashinfer_topk_row_states_bytes()) .unwrap(); let mut scratch_valid = model.ctx.stream.alloc_zeros(1).unwrap(); let mut scratch_out = model.ctx.stream.alloc_zeros(1).unwrap(); @@ -546,7 +546,7 @@ mod tests { let mut row_states: cudarc::driver::CudaSlice = model .ctx .stream - .alloc_zeros(pegainfer_core::ops::flashinfer_topk_row_states_bytes()) + .alloc_zeros(openinfer_core::ops::flashinfer_topk_row_states_bytes()) .unwrap(); let mut valid: cudarc::driver::CudaSlice = model.ctx.stream.alloc_zeros(1).unwrap(); let mut out: 
cudarc::driver::CudaSlice = model.ctx.stream.alloc_zeros(1).unwrap(); diff --git a/pegainfer-qwen3-4b/src/batch_decode_buffers.rs b/openinfer-qwen3-4b/src/batch_decode_buffers.rs similarity index 98% rename from pegainfer-qwen3-4b/src/batch_decode_buffers.rs rename to openinfer-qwen3-4b/src/batch_decode_buffers.rs index 89451f1c..a38e1864 100644 --- a/pegainfer-qwen3-4b/src/batch_decode_buffers.rs +++ b/openinfer-qwen3-4b/src/batch_decode_buffers.rs @@ -4,9 +4,9 @@ use anyhow::Result; use cudarc::driver::CudaSlice; -use pegainfer_core::cuda_graph::CudaGraphState; -use pegainfer_core::tensor::{DeviceContext, HiddenStates}; -use pegainfer_kv_cache::KvView; +use openinfer_core::cuda_graph::CudaGraphState; +use openinfer_core::tensor::{DeviceContext, HiddenStates}; +use openinfer_kv_cache::KvView; /// Bucket sizes for CUDA Graph capture. Actual batch is padded to the nearest bucket. pub(crate) const BATCH_BUCKETS: &[usize] = &[1, 2, 4, 8, 16, 32, 64]; diff --git a/pegainfer-qwen3-4b/src/batch_decode_dag.rs b/openinfer-qwen3-4b/src/batch_decode_dag.rs similarity index 92% rename from pegainfer-qwen3-4b/src/batch_decode_dag.rs rename to openinfer-qwen3-4b/src/batch_decode_dag.rs index 573c14d5..a374f238 100644 --- a/pegainfer-qwen3-4b/src/batch_decode_dag.rs +++ b/openinfer-qwen3-4b/src/batch_decode_dag.rs @@ -7,16 +7,16 @@ use anyhow::Result; use cudarc::driver::CudaSlice; use half::bf16; -use pegainfer_core::kv_pool::KvLayout; +use openinfer_core::kv_pool::KvLayout; #[cfg(feature = "kernel-call-trace")] -use pegainfer_core::ops::call_spec::{ +use openinfer_core::ops::call_spec::{ self, PagedDecodeCallSpec, embedding_batch_call, fused_add_rms_norm_batch_call, gemm_call, gemm_rows_call, qk_norm_rope_batch_decode_call, rms_norm_batch_call, silu_mul_fused_batch_call, }; #[cfg(feature = "kernel-call-trace")] -use pegainfer_core::ops::call_trace; -use pegainfer_core::tensor::{DeviceMatrix, DeviceVec, HiddenStates}; -use pegainfer_kernels::tensor::{AxisTag, Hidden, InDim, 
Intermediate, QDim, Vocab}; +use openinfer_core::ops::call_trace; +use openinfer_core::tensor::{DeviceMatrix, DeviceVec, HiddenStates}; +use openinfer_kernels::tensor::{AxisTag, Hidden, InDim, Intermediate, QDim, Vocab}; use crate::batch_decode_buffers::{BatchDecodeBuffers, DecodeAttentionPath}; use crate::weights::Qwen3Model; @@ -65,7 +65,7 @@ impl<'a> BatchDecodeDag<'a> { self.model.embed_tokens.cols, out.seq_len, )); - pegainfer_kernels::ops::embedding_batch( + openinfer_kernels::ops::embedding_batch( &self.model.ctx, &self.model.embed_tokens, token_ids, @@ -87,7 +87,7 @@ impl<'a> BatchDecodeDag<'a> { x.seq_len, self.model.config.rms_norm_eps, )); - pegainfer_kernels::ops::rms_norm_batch_into( + openinfer_kernels::ops::rms_norm_batch_into( &self.model.ctx, x, weight, @@ -111,7 +111,7 @@ impl<'a> BatchDecodeDag<'a> { hidden.seq_len, self.model.config.rms_norm_eps, )); - pegainfer_kernels::ops::fused_add_rms_norm_round_batch_into( + openinfer_kernels::ops::fused_add_rms_norm_round_batch_into( &self.model.ctx, hidden, residual, @@ -139,7 +139,7 @@ impl<'a> BatchDecodeDag<'a> { row_offset, x.seq_len, )); - pegainfer_kernels::ops::gemm_rows_into(&self.model.ctx, weight, row_offset, rows, x, out); + openinfer_kernels::ops::gemm_rows_into(&self.model.ctx, weight, row_offset, rows, x, out); } pub(crate) fn gemm( @@ -156,7 +156,7 @@ impl<'a> BatchDecodeDag<'a> { weight.cols, x.seq_len, )); - pegainfer_kernels::ops::gemm_into(&self.model.ctx, weight, x, out); + openinfer_kernels::ops::gemm_into(&self.model.ctx, weight, x, out); } pub(crate) fn qk_norm_rope( @@ -180,7 +180,7 @@ impl<'a> BatchDecodeDag<'a> { self.model.config.head_dim, self.model.config.rms_norm_eps, )); - pegainfer_kernels::ops::qk_norm_rope_batch_decode_into( + openinfer_kernels::ops::qk_norm_rope_batch_decode_into( &self.model.ctx, q, k, @@ -225,7 +225,7 @@ impl<'a> BatchDecodeDag<'a> { match self.attention_path { DecodeAttentionPath::NonPartition => { - 
pegainfer_kernels::ops::paged_attention_batch_decode_into( + openinfer_kernels::ops::paged_attention_batch_decode_into( &self.model.ctx, &bufs.q, &bufs.k, @@ -246,7 +246,7 @@ impl<'a> BatchDecodeDag<'a> { ) } DecodeAttentionPath::SplitKv => { - pegainfer_kernels::ops::paged_attention_batch_decode_split_kv_into( + openinfer_kernels::ops::paged_attention_batch_decode_split_kv_into( &self.model.ctx, &bufs.q, &bufs.k, @@ -332,7 +332,7 @@ impl<'a> BatchDecodeDag<'a> { gate.hidden_dim, gate.seq_len, )); - pegainfer_kernels::ops::silu_mul_batch_into(&self.model.ctx, gate, up, out) + openinfer_kernels::ops::silu_mul_batch_into(&self.model.ctx, gate, up, out) } pub(crate) fn down_proj( @@ -356,7 +356,7 @@ impl<'a> BatchDecodeDag<'a> { } #[cfg(feature = "kernel-call-trace")] - fn record(call: pegainfer_kernels::tensor::KernelCall) { + fn record(call: openinfer_kernels::tensor::KernelCall) { if call_trace::is_enabled() { call_trace::record_call(call); } diff --git a/pegainfer-qwen3-4b/src/batch_decode_trace.rs b/openinfer-qwen3-4b/src/batch_decode_trace.rs similarity index 94% rename from pegainfer-qwen3-4b/src/batch_decode_trace.rs rename to openinfer-qwen3-4b/src/batch_decode_trace.rs index 411c312e..607ab4df 100644 --- a/pegainfer-qwen3-4b/src/batch_decode_trace.rs +++ b/openinfer-qwen3-4b/src/batch_decode_trace.rs @@ -1,9 +1,9 @@ #[cfg(feature = "kernel-call-trace")] use anyhow::Result; #[cfg(feature = "kernel-call-trace")] -use pegainfer_core::ops::call_trace; +use openinfer_core::ops::call_trace; #[cfg(feature = "kernel-call-trace")] -use pegainfer_kernels::tensor::KernelCall; +use openinfer_kernels::tensor::KernelCall; #[cfg(feature = "kernel-call-trace")] use crate::batch_decode_buffers::BatchDecodeBuffers; @@ -40,7 +40,7 @@ pub fn trace_decode_kernel_calls( }, )?; let budget = model.kv_budget(); - let kv_mgr = pegainfer_kv_cache::KvCacheManager::new( + let kv_mgr = openinfer_kv_cache::KvCacheManager::new( &model.device_ctx().stream, budget.num_layers, 
budget.num_kv_heads, @@ -48,7 +48,7 @@ pub fn trace_decode_kernel_calls( budget.block_size, budget.num_blocks, )?; - let layout = pegainfer_core::kv_pool::KvLayout::new( + let layout = openinfer_core::kv_pool::KvLayout::new( budget.num_layers, budget.num_kv_heads, budget.head_dim, diff --git a/pegainfer-qwen3-4b/src/bin/qwen3_decode_context.rs b/openinfer-qwen3-4b/src/bin/qwen3_decode_context.rs similarity index 98% rename from pegainfer-qwen3-4b/src/bin/qwen3_decode_context.rs rename to openinfer-qwen3-4b/src/bin/qwen3_decode_context.rs index 95dc80a7..ef998974 100644 --- a/pegainfer-qwen3-4b/src/bin/qwen3_decode_context.rs +++ b/openinfer-qwen3-4b/src/bin/qwen3_decode_context.rs @@ -2,8 +2,8 @@ use std::hint::black_box; use std::time::{Duration, Instant}; use anyhow::{Context, Result, anyhow, bail}; -use pegainfer_core::sampler::SamplingParams; -use pegainfer_qwen3_4b::runtime::{ +use openinfer_core::sampler::SamplingParams; +use openinfer_qwen3_4b::runtime::{ DecodePlan, DecodeStepItem, PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId, }; use rand::rngs::StdRng; @@ -77,7 +77,7 @@ fn parse_usize(name: &str, raw: &str) -> Result { fn parse_args() -> Result { let mut mode = Mode::Measure; let mut model_path = - std::env::var("PEGAINFER_TEST_MODEL_PATH").unwrap_or_else(|_| MODEL_PATH.to_string()); + std::env::var("OPENINFER_TEST_MODEL_PATH").unwrap_or_else(|_| MODEL_PATH.to_string()); let mut contexts = DEFAULT_CONTEXTS.to_vec(); let mut iters = DEFAULT_MEASURE_ITERS; let mut profile_steps = DEFAULT_PROFILE_STEPS; diff --git a/pegainfer-qwen3-4b/src/bin/qwen3_kernel_report.rs b/openinfer-qwen3-4b/src/bin/qwen3_kernel_report.rs similarity index 99% rename from pegainfer-qwen3-4b/src/bin/qwen3_kernel_report.rs rename to openinfer-qwen3-4b/src/bin/qwen3_kernel_report.rs index a7986615..ebee5a0a 100644 --- a/pegainfer-qwen3-4b/src/bin/qwen3_kernel_report.rs +++ b/openinfer-qwen3-4b/src/bin/qwen3_kernel_report.rs @@ -8,9 +8,9 @@ use std::time::{SystemTime, 
UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; use clap::{Args, Parser, Subcommand}; use cudarc::driver::sys; -use pegainfer_cupti::profile_range_with_prepare; -use pegainfer_kernels::tensor::DeviceContext; -use pegainfer_qwen3_4b::kernel_bench::{ +use openinfer_cupti::profile_range_with_prepare; +use openinfer_kernels::tensor::DeviceContext; +use openinfer_qwen3_4b::kernel_bench::{ AttentionDecodeCase, AttentionKernelShape, AttentionKernelSpec, AttentionKernelVariant, AttentionPrefillCase, DecodePath, DevicePeakBandwidth, HEAD_DIM, L2CacheClear, NUM_KV_HEADS, NUM_QO_HEADS, PAGE_SIZE, PrefillAttentionShape, PrefillAttentionSpec, PrefillAttentionVariant, @@ -1120,7 +1120,7 @@ fn query_external_provenance() -> ExternalProvenance { dirty: None, }, build: BuildProvenance { - target_sm_env: std::env::var("PEGAINFER_CUDA_SM").ok(), + target_sm_env: std::env::var("OPENINFER_CUDA_SM").ok(), flashinfer_commit: None, kernel_archive: None, kernel_archive_fnv1a64: None, diff --git a/pegainfer-qwen3-4b/src/bin/qwen3_model_report.rs b/openinfer-qwen3-4b/src/bin/qwen3_model_report.rs similarity index 98% rename from pegainfer-qwen3-4b/src/bin/qwen3_model_report.rs rename to openinfer-qwen3-4b/src/bin/qwen3_model_report.rs index 602196e8..25629914 100644 --- a/pegainfer-qwen3-4b/src/bin/qwen3_model_report.rs +++ b/openinfer-qwen3-4b/src/bin/qwen3_model_report.rs @@ -8,18 +8,18 @@ use clap::Parser; use comfy_table::{Cell, Color, ContentArrangement, Table, presets::UTF8_FULL}; use cudarc::driver::{CudaSlice, sys}; use half::bf16; -use pegainfer_bench::{ +use openinfer_bench::{ Accum, CallSiteRow, LatencyStats, RollupRow, accumulate, attr_usize, axis, call_site_row, input, output, rollup_row, zero_matrix, }; -use pegainfer_core::kv_pool::KvLayout; -use pegainfer_core::ops; -use pegainfer_kernels::tensor::{DeviceContext, DeviceVec, HiddenStates, KernelCall, TensorSpec}; -use pegainfer_qwen3_4b::batch_decode_trace::{ +use openinfer_core::kv_pool::KvLayout; +use 
openinfer_core::ops; +use openinfer_kernels::tensor::{DeviceContext, DeviceVec, HiddenStates, KernelCall, TensorSpec}; +use openinfer_qwen3_4b::batch_decode_trace::{ HEAD_DIM_VALUE, KV_DIM_VALUE, MODEL, NUM_KV_HEADS, NUM_LAYERS, NUM_Q_HEADS, PHASE_DECODE, RMS_NORM_EPS, normalize_call_site, trace_decode_kernel_calls, }; -use pegainfer_qwen3_4b::kernel_bench::{L2CacheClear, SplitKvConfig}; +use openinfer_qwen3_4b::kernel_bench::{L2CacheClear, SplitKvConfig}; use serde::Serialize; const DEFAULT_ITERS: u64 = 32; @@ -676,8 +676,8 @@ fn bench_key(call: &KernelCall) -> Result { #[derive(Serialize)] struct Key<'a> { op: &'a str, - inputs: &'a [pegainfer_kernels::tensor::TensorArg], - outputs: &'a [pegainfer_kernels::tensor::TensorArg], + inputs: &'a [openinfer_kernels::tensor::TensorArg], + outputs: &'a [openinfer_kernels::tensor::TensorArg], attrs: BTreeMap<&'a str, &'a str>, } @@ -807,7 +807,7 @@ fn print_text_report(report: &ModelReport, out: &Path, dot_out: &Path) { let first_input = call .inputs .first() - .map(pegainfer_core::tensor::TensorArg::compact) + .map(openinfer_core::tensor::TensorArg::compact) .map_or_else(|| "-".to_string(), |input| truncate(&input, 86)); preview.add_row(vec![ Cell::new(&call.label), diff --git a/pegainfer-qwen3-4b/src/config.rs b/openinfer-qwen3-4b/src/config.rs similarity index 100% rename from pegainfer-qwen3-4b/src/config.rs rename to openinfer-qwen3-4b/src/config.rs diff --git a/pegainfer-qwen3-4b/src/executor.rs b/openinfer-qwen3-4b/src/executor.rs similarity index 98% rename from pegainfer-qwen3-4b/src/executor.rs rename to openinfer-qwen3-4b/src/executor.rs index c31dfa62..0d2572e6 100644 --- a/pegainfer-qwen3-4b/src/executor.rs +++ b/openinfer-qwen3-4b/src/executor.rs @@ -8,15 +8,15 @@ use crate::batch_decode_buffers::{BATCH_BUCKETS, BatchDecodeBuffers}; use crate::config::{Config, TensorParallelConfig}; use crate::weights::{ModelRuntimeConfig, Qwen3Model}; use crate::{Qwen3LoraOptions, Qwen3OffloadOptions}; -use 
pegainfer_core::engine::{LoadLoraAdapterRequest, TokenLogprob, UnloadLoraAdapterRequest}; -use pegainfer_core::kv_pool::KvLayout; -use pegainfer_core::ops; -use pegainfer_core::sampler::SamplingParams; -use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; -use pegainfer_kv_cache::{ +use openinfer_core::engine::{LoadLoraAdapterRequest, TokenLogprob, UnloadLoraAdapterRequest}; +use openinfer_core::kv_pool::KvLayout; +use openinfer_core::ops; +use openinfer_core::sampler::SamplingParams; +use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_kv_cache::{ KvBlockGuard, KvBuffer, KvCacheManager, KvView, LoadReservation, PrefixProbe, }; -use pegainfer_kv_offload::{LoadHandle, OffloadConfig, OffloadEngine}; +use openinfer_kv_offload::{LoadHandle, OffloadConfig, OffloadEngine}; #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, Ord, PartialOrd)] pub struct RequestId(pub(crate) u64); @@ -192,7 +192,7 @@ fn build_batch_decode_request_results( ) -> Result> { let params: Vec<&SamplingParams> = requests.iter().map(|req| &req.params).collect(); let random_vals: Vec = requests.iter().map(|req| req.random_val).collect(); - let tokens = pegainfer_core::ops::select_batch_tokens_into( + let tokens = openinfer_core::ops::select_batch_tokens_into( lane.model.device_ctx(), &lane.bufs.logits, ¶ms, @@ -324,7 +324,7 @@ struct CublasThreadGuard; impl Drop for CublasThreadGuard { fn drop(&mut self) { unsafe { - pegainfer_core::ffi::cublas_destroy(); + openinfer_core::ffi::cublas_destroy(); } } } @@ -346,7 +346,7 @@ impl SamplingScratch { top1_values: ctx.stream.alloc_zeros(max_batch_bucket)?, row_states: ctx .stream - .alloc_zeros(pegainfer_core::ops::flashinfer_topk_row_states_bytes())?, + .alloc_zeros(openinfer_core::ops::flashinfer_topk_row_states_bytes())?, valid: ctx.stream.alloc_zeros(1)?, out: ctx.stream.alloc_zeros(max_batch_bucket)?, }) @@ -393,7 +393,7 @@ fn compute_logprobs_from_cpu( fn bind_model_thread(model: &Qwen3Model) -> 
Result<()> { unsafe { - let err = pegainfer_core::ffi::cuda_set_device(model.device_ctx().device_ordinal as i32); + let err = openinfer_core::ffi::cuda_set_device(model.device_ctx().device_ordinal as i32); if err != 0 { return Err(anyhow::anyhow!( "Failed to set CUDA device {} on worker thread: cudaError={}", @@ -408,7 +408,7 @@ fn bind_model_thread(model: &Qwen3Model) -> Result<()> { .bind_to_thread() .map_err(|e| anyhow::anyhow!("Failed to bind CUDA context to thread: {e}"))?; unsafe { - pegainfer_core::ffi::cublas_init(); + openinfer_core::ffi::cublas_init(); } Ok(()) } @@ -534,7 +534,7 @@ struct Qwen3ExecutorMetadata { pub struct Qwen3Executor { metadata: Qwen3ExecutorMetadata, kv_mgr: KvCacheManager, - request_kvs: HashMap, + request_kvs: HashMap, primary: RankWorker, workers: Vec, loaded_lora_adapters: HashSet, @@ -918,7 +918,7 @@ impl Qwen3Executor { fn settle_prefetch( &mut self, id: RequestId, - result: Result<(), pegainfer_kv_offload::EngineError>, + result: Result<(), openinfer_kv_offload::EngineError>, ) { if let Some(st) = self.prefetch.get_mut(&id) { st.handle = None; @@ -1683,7 +1683,7 @@ impl LocalQwen3Lane { params: &SamplingParams, random_val: f32, ) -> Result { - pegainfer_core::ops::gpu_sample_into( + openinfer_core::ops::gpu_sample_into( self.model.device_ctx(), logits, &mut self.sample_scratch.probs, @@ -1714,7 +1714,7 @@ impl LocalQwen3Lane { target_token: u32, top_k: usize, ) -> Option { - pegainfer_core::ops::extract_vec(self.model.device_ctx(), all_logits, prev_pos) + openinfer_core::ops::extract_vec(self.model.device_ctx(), all_logits, prev_pos) .ok() .and_then(|logits_vec| { let logits_f32 = logits_vec.to_host(self.model.device_ctx()).ok()?; diff --git a/pegainfer-qwen3-4b/src/kernel_bench.rs b/openinfer-qwen3-4b/src/kernel_bench.rs similarity index 99% rename from pegainfer-qwen3-4b/src/kernel_bench.rs rename to openinfer-qwen3-4b/src/kernel_bench.rs index 5205cd0c..77966073 100644 --- a/pegainfer-qwen3-4b/src/kernel_bench.rs +++ 
b/openinfer-qwen3-4b/src/kernel_bench.rs @@ -5,10 +5,10 @@ use std::time::Duration; use anyhow::{Result, anyhow, bail}; use cudarc::driver::{CudaEvent, CudaSlice, DevicePtr, DevicePtrMut, sys}; use half::bf16; -use pegainfer_kernels::ffi; -use pegainfer_kernels::ops::{PrefillPagedPlan, prefill_attention_paged_into}; -use pegainfer_kernels::paged_kv::PagedKvLayout; -use pegainfer_kernels::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_kernels::ffi; +use openinfer_kernels::ops::{PrefillPagedPlan, prefill_attention_paged_into}; +use openinfer_kernels::paged_kv::PagedKvLayout; +use openinfer_kernels::tensor::{DeviceContext, DeviceVec, HiddenStates}; use serde::{Deserialize, Serialize}; pub const NUM_LAYERS: usize = 1; diff --git a/pegainfer-qwen3-4b/src/kernel_plan.rs b/openinfer-qwen3-4b/src/kernel_plan.rs similarity index 100% rename from pegainfer-qwen3-4b/src/kernel_plan.rs rename to openinfer-qwen3-4b/src/kernel_plan.rs diff --git a/pegainfer-qwen3-4b/src/lib.rs b/openinfer-qwen3-4b/src/lib.rs similarity index 98% rename from pegainfer-qwen3-4b/src/lib.rs rename to openinfer-qwen3-4b/src/lib.rs index 83106f68..23c8a43c 100644 --- a/pegainfer-qwen3-4b/src/lib.rs +++ b/openinfer-qwen3-4b/src/lib.rs @@ -16,7 +16,7 @@ mod weights; use std::path::Path; use anyhow::Result; -use pegainfer_core::engine::{EngineHandle, EngineLoadOptions, ModelInfo}; +use openinfer_core::engine::{EngineHandle, EngineLoadOptions, ModelInfo}; pub use kernel_plan::kernel_plan; diff --git a/pegainfer-qwen3-4b/src/lora.rs b/openinfer-qwen3-4b/src/lora.rs similarity index 99% rename from pegainfer-qwen3-4b/src/lora.rs rename to openinfer-qwen3-4b/src/lora.rs index 2b834d7f..2ce66958 100644 --- a/pegainfer-qwen3-4b/src/lora.rs +++ b/openinfer-qwen3-4b/src/lora.rs @@ -5,8 +5,8 @@ use std::path::{Path, PathBuf}; use anyhow::{Context, Result, bail, ensure}; use cudarc::driver::CudaSlice; use half::{bf16, f16}; -use pegainfer_core::ops; -use pegainfer_core::tensor::{DeviceContext, 
DeviceMatrix, HiddenStates}; +use openinfer_core::ops; +use openinfer_core::tensor::{DeviceContext, DeviceMatrix, HiddenStates}; use safetensors::tensor::TensorView; use safetensors::{Dtype, SafeTensors}; use serde::Deserialize; @@ -788,7 +788,7 @@ mod tests { fn temp_adapter_dir(test_name: &str) -> PathBuf { let id = NEXT_TEST_DIR.fetch_add(1, Ordering::Relaxed); let path = std::env::temp_dir().join(format!( - "pegainfer-qwen3-lora-{test_name}-{}-{id}", + "openinfer-qwen3-lora-{test_name}-{}-{id}", std::process::id() )); let _ = fs::remove_dir_all(&path); diff --git a/pegainfer-qwen3-4b/src/prefill.rs b/openinfer-qwen3-4b/src/prefill.rs similarity index 97% rename from pegainfer-qwen3-4b/src/prefill.rs rename to openinfer-qwen3-4b/src/prefill.rs index 49813803..9ba97520 100644 --- a/pegainfer-qwen3-4b/src/prefill.rs +++ b/openinfer-qwen3-4b/src/prefill.rs @@ -5,11 +5,11 @@ use half::bf16; use super::config::PREFILL_ATTENTION_CTA_TILE_Q; use super::weights::{Qwen3Model, TransformerBlock}; use crate::lora::{DeviceLoraTokenGroup, build_lora_token_ranges, prepare_lora_token_groups}; -use pegainfer_core::kv_pool::KvLayout; -use pegainfer_core::ops; -use pegainfer_core::ops::PrefillPagedPlan; -use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; -use pegainfer_kv_cache::KvView; +use openinfer_core::kv_pool::KvLayout; +use openinfer_core::ops; +use openinfer_core::ops::PrefillPagedPlan; +use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_kv_cache::KvView; /// Pre-allocated scratch buffers for one prefill forward pass. 
/// Created once per prefill pass, eliminating @@ -82,7 +82,7 @@ impl Qwen3Model { layer: &TransformerBlock, hidden: &mut HiddenStates, kv_buffer: &cudarc::driver::CudaSlice, - layout: &pegainfer_core::kv_pool::KvLayout, + layout: &openinfer_core::kv_pool::KvLayout, plan: &PrefillPagedPlan, lora_groups: &[DeviceLoraTokenGroup<'_>], bufs: &mut PrefillBuffers, @@ -191,7 +191,7 @@ impl Qwen3Model { self.all_reduce_hidden(&mut bufs.o_buf)?; // 5+6. Residual add + MLP RMSNorm (fused): hidden += o_buf; normed = rms_norm(hidden) - pegainfer_kernels::ops::fused_add_rms_norm_round_batch_into( + openinfer_kernels::ops::fused_add_rms_norm_round_batch_into( &self.ctx, hidden, &bufs.o_buf, diff --git a/pegainfer-qwen3-4b/src/scheduler.rs b/openinfer-qwen3-4b/src/scheduler.rs similarity index 99% rename from pegainfer-qwen3-4b/src/scheduler.rs rename to openinfer-qwen3-4b/src/scheduler.rs index ca117abc..711fc6e3 100644 --- a/pegainfer-qwen3-4b/src/scheduler.rs +++ b/openinfer-qwen3-4b/src/scheduler.rs @@ -20,10 +20,10 @@ use tokio::sync::mpsc; use crate::executor::{ModelExecutor, Qwen3Executor, RequestId}; use crate::{Qwen3LoraOptions, Qwen3OffloadOptions}; -use pegainfer_core::engine::{ +use openinfer_core::engine::{ EngineCommand, EngineControlRequest, EngineHandle, GenerateRequest, TokenEvent, }; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::sampler::SamplingParams; use self::effects::apply_effects; use self::plan::{build_next_plan, execute_plan}; @@ -808,7 +808,7 @@ mod tests { use std::time::{Duration, Instant}; use anyhow::Result; - use pegainfer_core::engine::{ + use openinfer_core::engine::{ EngineControlError, LoadLoraAdapterRequest, UnloadLoraAdapterRequest, }; @@ -1715,21 +1715,21 @@ mod tests { request_id: RequestId(1), token: 201, logprob: None, - finish_reason: pegainfer_core::engine::FinishReason::Length, + finish_reason: openinfer_core::engine::FinishReason::Length, completion_tokens: 2, }, effects::DecodeEffect::EmitAndFinish { request_id: 
RequestId(10), token: 210, logprob: None, - finish_reason: pegainfer_core::engine::FinishReason::Length, + finish_reason: openinfer_core::engine::FinishReason::Length, completion_tokens: 2, }, effects::DecodeEffect::EmitAndFinish { request_id: RequestId(7), token: 207, logprob: None, - finish_reason: pegainfer_core::engine::FinishReason::Length, + finish_reason: openinfer_core::engine::FinishReason::Length, completion_tokens: 2, }, ], diff --git a/pegainfer-qwen3-4b/src/scheduler/effects.rs b/openinfer-qwen3-4b/src/scheduler/effects.rs similarity index 99% rename from pegainfer-qwen3-4b/src/scheduler/effects.rs rename to openinfer-qwen3-4b/src/scheduler/effects.rs index 456dc87e..9c1848e5 100644 --- a/pegainfer-qwen3-4b/src/scheduler/effects.rs +++ b/openinfer-qwen3-4b/src/scheduler/effects.rs @@ -1,7 +1,7 @@ use tokio::sync::mpsc; use crate::executor::RequestId; -use pegainfer_core::engine::{FinishReason, TokenLogprob}; +use openinfer_core::engine::{FinishReason, TokenLogprob}; use super::{ActiveRequestState, TokenEvent}; diff --git a/pegainfer-qwen3-4b/src/scheduler/plan.rs b/openinfer-qwen3-4b/src/scheduler/plan.rs similarity index 99% rename from pegainfer-qwen3-4b/src/scheduler/plan.rs rename to openinfer-qwen3-4b/src/scheduler/plan.rs index 78c380cb..2b4f12d1 100644 --- a/pegainfer-qwen3-4b/src/scheduler/plan.rs +++ b/openinfer-qwen3-4b/src/scheduler/plan.rs @@ -143,7 +143,7 @@ fn sort_decode_results(results: &mut [crate::executor::DecodeRequestResult]) { mod tests { use super::*; use crate::executor::RequestId; - use pegainfer_core::sampler::SamplingParams; + use openinfer_core::sampler::SamplingParams; fn pending() -> PendingRequest { let (token_tx, _rx) = tokio::sync::mpsc::unbounded_channel(); diff --git a/pegainfer-qwen3-4b/src/scheduler/resolve.rs b/openinfer-qwen3-4b/src/scheduler/resolve.rs similarity index 99% rename from pegainfer-qwen3-4b/src/scheduler/resolve.rs rename to openinfer-qwen3-4b/src/scheduler/resolve.rs index acfec650..9e9cf2b8 100644 
--- a/pegainfer-qwen3-4b/src/scheduler/resolve.rs +++ b/openinfer-qwen3-4b/src/scheduler/resolve.rs @@ -1,5 +1,5 @@ use crate::executor::{DecodeRequestResult, ModelExecutor, PrefillRequestResult}; -use pegainfer_core::engine::FinishReason; +use openinfer_core::engine::FinishReason; use super::effects::{DecodeEffect, PendingEffect, PromptEchoEffect, StepEffects}; use super::plan::ExecutionArtifacts; diff --git a/pegainfer-qwen3-4b/src/unified_forward.rs b/openinfer-qwen3-4b/src/unified_forward.rs similarity index 98% rename from pegainfer-qwen3-4b/src/unified_forward.rs rename to openinfer-qwen3-4b/src/unified_forward.rs index 818a60ae..2fe4761b 100644 --- a/pegainfer-qwen3-4b/src/unified_forward.rs +++ b/openinfer-qwen3-4b/src/unified_forward.rs @@ -12,12 +12,12 @@ use super::config::PREFILL_ATTENTION_CTA_TILE_Q; use super::prefill::PrefillBuffers; use super::weights::{Qwen3Model, TransformerBlock}; use crate::lora::{DeviceLoraTokenGroup, build_lora_token_ranges, prepare_lora_token_groups}; -use pegainfer_core::ffi; -use pegainfer_core::kv_pool::KvLayout; -use pegainfer_core::ops; -use pegainfer_core::ops::PrefillPagedPlan; -use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; -use pegainfer_kv_cache::KvView; +use openinfer_core::ffi; +use openinfer_core::kv_pool::KvLayout; +use openinfer_core::ops; +use openinfer_core::ops::PrefillPagedPlan; +use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_kv_cache::KvView; /// Decode attention metadata (allocated per unified step, not CUDA-graph safe). #[allow(clippy::struct_field_names)] @@ -591,7 +591,7 @@ impl Qwen3Model { self.all_reduce_hidden(&mut bufs.o_buf)?; // ── 7+8. 
Residual add + MLP RMSNorm (fused) ───────────────── - pegainfer_kernels::ops::fused_add_rms_norm_round_batch_into( + openinfer_kernels::ops::fused_add_rms_norm_round_batch_into( &self.ctx, hidden, &bufs.o_buf, diff --git a/pegainfer-qwen3-4b/src/weights.rs b/openinfer-qwen3-4b/src/weights.rs similarity index 97% rename from pegainfer-qwen3-4b/src/weights.rs rename to openinfer-qwen3-4b/src/weights.rs index c4679f4a..c9818f67 100644 --- a/pegainfer-qwen3-4b/src/weights.rs +++ b/openinfer-qwen3-4b/src/weights.rs @@ -13,8 +13,8 @@ use crate::lora::{ LoraProjectionKind, apply_lora_projection_delta_indexed, apply_lora_projection_delta_range, }; use half::bf16; -use pegainfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; -use pegainfer_core::weight_loader::{ +use openinfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; +use openinfer_core::weight_loader::{ deserialize_shards, load_shard_info, load_tensor_1d, load_tensor_2d, load_tensor_2d_col_shard, load_tensor_2d_row_shard, mmap_shards, precompute_rope, }; @@ -152,7 +152,7 @@ impl PackedLoraProjection { .map_err(|e| anyhow::anyhow!("packed LoRA A copy failed: {e}"))?; let b_offset = slot * self.out_dim * self.max_rank; - pegainfer_core::ops::pack_lora_b_rows_into( + openinfer_core::ops::pack_lora_b_rows_into( ctx, &projection.b.data, &mut self.b, @@ -570,7 +570,7 @@ impl Qwen3Model { &self.config } - pub(crate) fn device_ctx(&self) -> &pegainfer_core::tensor::DeviceContext { + pub(crate) fn device_ctx(&self) -> &openinfer_core::tensor::DeviceContext { &self.ctx } @@ -802,13 +802,13 @@ impl Qwen3Model { pub(crate) fn all_reduce_hidden( &self, - hidden: &mut pegainfer_core::tensor::HiddenStates, + hidden: &mut openinfer_core::tensor::HiddenStates, ) -> Result<()> { #[cfg(feature = "kernel-call-trace")] - if pegainfer_core::ops::call_trace::is_enabled() { - let label = pegainfer_core::ops::call_trace::current_label("all_reduce_hidden"); - 
pegainfer_core::ops::call_trace::record_call( - pegainfer_core::ops::call_spec::all_reduce_hidden_call( + if openinfer_core::ops::call_trace::is_enabled() { + let label = openinfer_core::ops::call_trace::current_label("all_reduce_hidden"); + openinfer_core::ops::call_trace::record_call( + openinfer_core::ops::call_spec::all_reduce_hidden_call( label, hidden.hidden_dim, hidden.seq_len, @@ -820,7 +820,7 @@ impl Qwen3Model { pub(crate) fn all_reduce_hidden_untraced( &self, - hidden: &mut pegainfer_core::tensor::HiddenStates, + hidden: &mut openinfer_core::tensor::HiddenStates, ) -> Result<()> { if let Some(comm) = &self.tp_comm { comm.all_reduce_in_place(&mut hidden.data, &ReduceOp::Sum) @@ -833,7 +833,7 @@ impl Qwen3Model { pub(crate) fn kv_budget(&self) -> KvBudget { let page_size = 16; let num_kv_heads = self.local_num_key_value_heads(); - let layout = pegainfer_kv_cache::KvLayout::new( + let layout = openinfer_kv_cache::KvLayout::new( self.config.num_hidden_layers, num_kv_heads, self.config.head_dim, @@ -882,7 +882,7 @@ mod tests { fn temp_path(name: &str) -> std::path::PathBuf { std::env::temp_dir().join(format!( - "pegainfer-qwen3-lora-{name}-{}", + "openinfer-qwen3-lora-{name}-{}", std::process::id() )) } diff --git a/pegainfer-server/tests/common/mod.rs b/openinfer-qwen3-4b/tests/common/mod.rs similarity index 78% rename from pegainfer-server/tests/common/mod.rs rename to openinfer-qwen3-4b/tests/common/mod.rs index 01eab133..6614d672 100644 --- a/pegainfer-server/tests/common/mod.rs +++ b/openinfer-qwen3-4b/tests/common/mod.rs @@ -1,6 +1,6 @@ use vllm_text::tokenizer::DynTokenizer; pub(crate) fn load_tokenizer(model_path: &str) -> DynTokenizer { - pegainfer_vllm_support::load_tokenizer(model_path) + openinfer_vllm_support::load_tokenizer(model_path) .unwrap_or_else(|err| panic!("Failed to load tokenizer for {model_path}: {err}")) } diff --git a/pegainfer-qwen3-4b/tests/context_window.rs b/openinfer-qwen3-4b/tests/context_window.rs similarity index 93% rename 
from pegainfer-qwen3-4b/tests/context_window.rs rename to openinfer-qwen3-4b/tests/context_window.rs index a5e28a71..bc59f36a 100644 --- a/pegainfer-qwen3-4b/tests/context_window.rs +++ b/openinfer-qwen3-4b/tests/context_window.rs @@ -13,12 +13,12 @@ //! file keeps them serialized. //! //! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when the model is -//! absent (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). +//! absent (point `OPENINFER_TEST_MODEL_PATH` at the weights to run it). use std::path::Path; -use pegainfer_core::engine::{EngineHandle, EngineLoadOptions, GenerateRequest, TokenEvent}; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::engine::{EngineHandle, EngineLoadOptions, GenerateRequest, TokenEvent}; +use openinfer_core::sampler::SamplingParams; use tokio::sync::mpsc; use vllm_text::tokenizer::DynTokenizer; @@ -27,14 +27,14 @@ mod common; const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping qwen3 context_window: {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping qwen3 context_window: {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -84,7 +84,7 @@ fn oversized_prompt_is_rejected_with_context_length_error() { return; }; - let handle = pegainfer_qwen3_4b::start_engine( + let handle = openinfer_qwen3_4b::start_engine( Path::new(&model_path), EngineLoadOptions { enable_cuda_graph: true, diff --git a/pegainfer-qwen3-4b/tests/context_window_in_window.rs b/openinfer-qwen3-4b/tests/context_window_in_window.rs similarity index 91% rename from pegainfer-qwen3-4b/tests/context_window_in_window.rs rename to 
openinfer-qwen3-4b/tests/context_window_in_window.rs index 6b8a5eb4..86d56d21 100644 --- a/pegainfer-qwen3-4b/tests/context_window_in_window.rs +++ b/openinfer-qwen3-4b/tests/context_window_in_window.rs @@ -18,25 +18,25 @@ //! binary, so two engines on one GPU would contend. One engine per file. //! //! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when the model is -//! absent (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). +//! absent (point `OPENINFER_TEST_MODEL_PATH` at the weights to run it). use std::path::Path; -use pegainfer_core::engine::{EngineLoadOptions, GenerateRequest, TokenEvent}; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::engine::{EngineLoadOptions, GenerateRequest, TokenEvent}; +use openinfer_core::sampler::SamplingParams; use tokio::sync::mpsc; const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping qwen3 context_window_in_window: {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping qwen3 context_window_in_window: {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -49,7 +49,7 @@ fn in_window_prompt_past_old_rope_table_is_served() { return; }; - let handle = pegainfer_qwen3_4b::start_engine( + let handle = openinfer_qwen3_4b::start_engine( Path::new(&model_path), EngineLoadOptions { enable_cuda_graph: true, diff --git a/pegainfer-qwen3-4b/tests/hf_golden_gate.rs b/openinfer-qwen3-4b/tests/hf_golden_gate.rs similarity index 95% rename from pegainfer-qwen3-4b/tests/hf_golden_gate.rs rename to openinfer-qwen3-4b/tests/hf_golden_gate.rs index 28fe87d5..7c674836 100644 --- 
a/pegainfer-qwen3-4b/tests/hf_golden_gate.rs +++ b/openinfer-qwen3-4b/tests/hf_golden_gate.rs @@ -4,14 +4,14 @@ //! HuggingFace is the numerical golden truth. A bit-wise check (exact text or a //! logprob hash) is fragile: bf16 GEMM kernels differ per GPU, so the low bits — //! and any frozen snapshot — drift across hardware and false-positive on a -//! different card. Instead this gate asserts pegainfer lands within the *bf16 +//! different card. Instead this gate asserts openinfer lands within the *bf16 //! noise floor* of a stored HF reference, which every numerically-correct GPU //! satisfies; only a real regression escapes the tolerance. //! //! The reference (`test_data/qwen3-4b-hf-golden.safetensors`, produced once by //! `tools/accuracy/dump_qwen3_4b_hf_golden.py`) pins a set of fixed token //! sequences and HF's top-K next-token logprobs at each position. We replay the -//! *same fixed sequences* through pegainfer by teacher-forcing — prefill the +//! *same fixed sequences* through openinfer by teacher-forcing — prefill the //! prompt, then decode feeding the reference's own tail tokens — so every //! position is compared against the identical-input HF distribution. //! Teacher-forcing (vs free greedy) is what makes this stable: one argmax flip @@ -19,9 +19,9 @@ //! //! Assertions: //! * argmax — wherever HF has a clear winner (top-1 over top-2 margin exceeds a -//! few bf16 ULP), pegainfer must pick the same token. Below that margin it is +//! few bf16 ULP), openinfer must pick the same token. Below that margin it is //! a genuine tie with no correct answer, so it is not enforced. -//! * logprobs — on the head tokens, |pegainfer − HF| is bounded in the mean +//! * logprobs — on the head tokens, |openinfer − HF| is bounded in the mean //! (catches uniform drift) and the p99 (catches a noisier subset). Both are //! coverage-stable; the single worst delta is reported but not asserted, //! 
because it grows with sample count (irreducible bf16 tail) while mean and @@ -44,14 +44,14 @@ //! padding-slot count. //! //! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when the model is -//! absent (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). +//! absent (point `OPENINFER_TEST_MODEL_PATH` at the weights to run it). use std::collections::HashMap; use std::path::Path; -use pegainfer_core::engine::TokenLogprob; -use pegainfer_core::sampler::SamplingParams; -use pegainfer_qwen3_4b::runtime::{ +use openinfer_core::engine::TokenLogprob; +use openinfer_core::sampler::SamplingParams; +use openinfer_qwen3_4b::runtime::{ DecodePlan, DecodeStepItem, PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId, }; use safetensors::{Dtype, SafeTensors}; @@ -70,7 +70,7 @@ const LOGPROBS: usize = 64; const MAX_OUTPUT_TOKENS: usize = 64; /// Max acceptable *regret*: how far below HF's own argmax (in HF's logprobs) -/// pegainfer's chosen token may sit. ~3 bf16 ULP at typical logit magnitudes — +/// openinfer's chosen token may sit. ~3 bf16 ULP at typical logit magnitudes — /// genuine ties fall well under it; where HF has a clear winner the only token /// within this regret is HF's argmax itself, so this still enforces exact /// agreement there. 
@@ -106,14 +106,14 @@ const HEAD_K: usize = 8; const BUCKET_STRADDLES: [usize; 2] = [9, 5]; fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping qwen3 hf_golden_gate: {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping qwen3 hf_golden_gate: {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -176,19 +176,19 @@ fn check_position( let hf_map: HashMap = hf.iter().copied().collect(); // pega's chosen token must be one HF also ranks near its own best. "Regret" - // is how far below HF's argmax (in HF's own logprobs) pegainfer's pick sits. + // is how far below HF's argmax (in HF's own logprobs) openinfer's pick sits. // A genuine bf16 tie differs by a ULP or two — small regret, fine. But a pick - // HF scores clearly worse, or one absent from HF's top-K entirely (pegainfer + // HF scores clearly worse, or one absent from HF's top-K entirely (openinfer // confidently wrong on a token HF does not even rank), is a real wrong-token // bug. This one rule subsumes "match HF where it is sure" *and* closes the // tie-band hole where a garbage argmax would otherwise escape every check. 
match hf_map.get(&pega_argmax) { None => stats.argmax_violations.push(format!( - "seq {seq} pos {pos}: pegainfer's argmax {pega_argmax} is absent from HF's top-{} — confidently wrong on a token HF does not rank", + "seq {seq} pos {pos}: openinfer's argmax {pega_argmax} is absent from HF's top-{} — confidently wrong on a token HF does not rank", hf.len() )), Some(&hlp) if hf_top - hlp > MARGIN_TOL => stats.argmax_violations.push(format!( - "seq {seq} pos {pos}: pegainfer chose {pega_argmax}, which HF scores {:.4} nat below its own argmax (> {MARGIN_TOL} tie tolerance)", + "seq {seq} pos {pos}: openinfer chose {pega_argmax}, which HF scores {:.4} nat below its own argmax (> {MARGIN_TOL} tie tolerance)", hf_top - hlp )), Some(_) => {} @@ -292,7 +292,7 @@ fn decode_item(id: RequestId, fed: u32) -> DecodeStepItem { /// the tightest comparison); `batched = true` advances them all as one batch. /// Restricting `seqs` lets a caller hit a specific CUDA-graph bucket (e.g. 5 /// seqs → bucket 8) so more than one real/pad ratio is exercised. The returned -/// vector is pegainfer's own top-1 logprob at each evaluated position — a +/// vector is openinfer's own top-1 logprob at each evaluated position — a /// fingerprint two identical runs must reproduce bit-for-bit (determinism). 
fn run(g: &Golden, ex: &mut Qwen3Executor, seqs: &[usize], batched: bool) -> (Stats, Vec) { let mut stats = Stats::default(); @@ -422,7 +422,7 @@ fn report_and_assert(label: &str, stats: &Stats) { assert!( stats.argmax_violations.is_empty(), - "[{label}] pegainfer picked a token HF does not rank near its best:\n {}", + "[{label}] openinfer picked a token HF does not rank near its best:\n {}", stats.argmax_violations.join("\n ") ); assert!( diff --git a/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs b/openinfer-qwen3-4b/tests/kv_offload_cpu_hit.rs similarity index 95% rename from pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs rename to openinfer-qwen3-4b/tests/kv_offload_cpu_hit.rs index c2a47e22..3ee40a8e 100644 --- a/pegainfer-qwen3-4b/tests/kv_offload_cpu_hit.rs +++ b/openinfer-qwen3-4b/tests/kv_offload_cpu_hit.rs @@ -10,19 +10,19 @@ //! //! This is the one test that exercises save → host-tier persistence → query → //! async load → register → prefill-rematch through the executor, not a unit -//! harness. `tests/cpu_roundtrip.rs` (in `pegainfer-kv-offload`) covers the raw +//! harness. `tests/cpu_roundtrip.rs` (in `openinfer-kv-offload`) covers the raw //! byte path; this covers the live executor wiring. If the load landed in the //! wrong layer/segment/block the warm logits would be whole nats off. //! //! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when absent -//! (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). +//! (point `OPENINFER_TEST_MODEL_PATH` at the weights to run it). 
use std::collections::HashMap; use std::path::Path; -use pegainfer_core::sampler::SamplingParams; -use pegainfer_qwen3_4b::runtime::{PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId}; -use pegainfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions}; +use openinfer_core::sampler::SamplingParams; +use openinfer_qwen3_4b::runtime::{PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId}; +use openinfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions}; const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); const BLOCK: usize = 16; @@ -41,7 +41,7 @@ const REGRET_TOL: f32 = 0.20; const MEAN_TOL: f32 = 0.06; fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) @@ -49,7 +49,7 @@ fn model_path_or_skip() -> Option { Err(_) => { eprintln!( "skipping qwen3 kv_offload_cpu_hit: {MODEL_PATH}/config.json is missing; \ - set PEGAINFER_TEST_MODEL_PATH to run it" + set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -75,7 +75,7 @@ fn prefill_item(id: u64, prompt: &[u32]) -> PrefillStepItem { ) } -fn first_token_top(pr: &pegainfer_qwen3_4b::runtime::PrefillResult) -> Vec<(u32, f32)> { +fn first_token_top(pr: &openinfer_qwen3_4b::runtime::PrefillResult) -> Vec<(u32, f32)> { pr.requests[0] .first_token_logprob .as_ref() diff --git a/pegainfer-qwen3-4b/tests/lora_smoke.rs b/openinfer-qwen3-4b/tests/lora_smoke.rs similarity index 94% rename from pegainfer-qwen3-4b/tests/lora_smoke.rs rename to openinfer-qwen3-4b/tests/lora_smoke.rs index 3dce67db..46b4f2f6 100644 --- a/pegainfer-qwen3-4b/tests/lora_smoke.rs +++ b/openinfer-qwen3-4b/tests/lora_smoke.rs @@ -5,11 +5,11 @@ use std::path::{Path, PathBuf}; use std::time::{SystemTime, UNIX_EPOCH}; use half::bf16; -use pegainfer_core::engine::{ +use openinfer_core::engine::{ EngineHandle, 
EngineLoadOptions, FinishReason, GenerateRequest, LoadLoraAdapterRequest, TokenEvent, }; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::sampler::SamplingParams; use safetensors::Dtype; use safetensors::tensor::View; use serde::Deserialize; @@ -56,11 +56,11 @@ impl View for TestTensor { } fn get_model_path() -> String { - std::env::var("PEGAINFER_TEST_MODEL_PATH").unwrap_or_else(|_| MODEL_PATH.to_string()) + std::env::var("OPENINFER_TEST_MODEL_PATH").unwrap_or_else(|_| MODEL_PATH.to_string()) } fn get_device_ordinal() -> usize { - std::env::var("PEGAINFER_TEST_DEVICE_ORDINAL") + std::env::var("OPENINFER_TEST_DEVICE_ORDINAL") .ok() .and_then(|value| value.parse().ok()) .unwrap_or(0) @@ -80,7 +80,7 @@ fn temp_adapter_dir() -> PathBuf { .expect("system time before unix epoch") .as_nanos(); let path = std::env::temp_dir().join(format!( - "pegainfer-qwen3-lora-smoke-{}-{now}", + "openinfer-qwen3-lora-smoke-{}-{now}", std::process::id() )); let _ = fs::remove_dir_all(&path); @@ -218,7 +218,7 @@ fn qwen3_lora_loads_rank_and_generates(rank: usize, adapter_name: &str) { let adapter_path = temp_adapter_dir(); write_zero_lora_adapter(&adapter_path, &config, rank); - let handle = pegainfer_qwen3_4b::start_engine_with_lora_control( + let handle = openinfer_qwen3_4b::start_engine_with_lora_control( Path::new(&model_path), EngineLoadOptions { enable_cuda_graph: false, @@ -227,8 +227,8 @@ fn qwen3_lora_loads_rank_and_generates(rank: usize, adapter_name: &str) { seed: 42, ..EngineLoadOptions::default() }, - pegainfer_qwen3_4b::Qwen3LoraOptions::default(), - pegainfer_qwen3_4b::Qwen3OffloadOptions::disabled(), + openinfer_qwen3_4b::Qwen3LoraOptions::default(), + openinfer_qwen3_4b::Qwen3OffloadOptions::disabled(), false, ) .expect("start LoRA-capable Qwen3 engine"); diff --git a/pegainfer-qwen3-4b/tests/prefix_cache.rs b/openinfer-qwen3-4b/tests/prefix_cache.rs similarity index 97% rename from pegainfer-qwen3-4b/tests/prefix_cache.rs rename to 
openinfer-qwen3-4b/tests/prefix_cache.rs index 74eb1b3e..93b2bf5a 100644 --- a/pegainfer-qwen3-4b/tests/prefix_cache.rs +++ b/openinfer-qwen3-4b/tests/prefix_cache.rs @@ -20,14 +20,14 @@ //! and cannot hide inside these tolerances. //! //! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when absent -//! (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). +//! (point `OPENINFER_TEST_MODEL_PATH` at the weights to run it). use std::collections::HashMap; use std::path::Path; -use pegainfer_core::engine::TokenLogprob; -use pegainfer_core::sampler::SamplingParams; -use pegainfer_qwen3_4b::runtime::{ +use openinfer_core::engine::TokenLogprob; +use openinfer_core::sampler::SamplingParams; +use openinfer_qwen3_4b::runtime::{ DecodePlan, DecodeStepItem, PrefillPlan, PrefillStepItem, Qwen3Executor, RequestId, UnifiedPlan, }; @@ -50,14 +50,14 @@ const REGRET_TOL: f32 = 0.20; const MEAN_TOL: f32 = 0.06; fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping qwen3 prefix_cache: {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping qwen3 prefix_cache: {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } diff --git a/pegainfer-qwen3-4b/tests/scheduler_robustness.rs b/openinfer-qwen3-4b/tests/scheduler_robustness.rs similarity index 92% rename from pegainfer-qwen3-4b/tests/scheduler_robustness.rs rename to openinfer-qwen3-4b/tests/scheduler_robustness.rs index 51285160..b8e9be6c 100644 --- a/pegainfer-qwen3-4b/tests/scheduler_robustness.rs +++ b/openinfer-qwen3-4b/tests/scheduler_robustness.rs @@ -9,13 +9,13 @@ //! send-failure retirement path. //! //! Requires a CUDA GPU and Qwen3-4B weights; skips cleanly when the model is -//! 
absent (point `PEGAINFER_TEST_MODEL_PATH` at the weights to run it). +//! absent (point `OPENINFER_TEST_MODEL_PATH` at the weights to run it). use std::path::Path; use std::time::Duration; -use pegainfer_core::engine::{EngineHandle, EngineLoadOptions, GenerateRequest, TokenEvent}; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::engine::{EngineHandle, EngineLoadOptions, GenerateRequest, TokenEvent}; +use openinfer_core::sampler::SamplingParams; use tokio::sync::mpsc; use vllm_text::tokenizer::DynTokenizer; @@ -24,14 +24,14 @@ mod common; const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping qwen3 scheduler_robustness: {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping qwen3 scheduler_robustness: {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -84,7 +84,7 @@ fn scheduler_survives_consumer_drop() { return; }; - let handle = pegainfer_qwen3_4b::start_engine( + let handle = openinfer_qwen3_4b::start_engine( Path::new(&model_path), EngineLoadOptions { enable_cuda_graph: true, diff --git a/pegainfer-qwen35-4b/Cargo.toml b/openinfer-qwen35-4b/Cargo.toml similarity index 79% rename from pegainfer-qwen35-4b/Cargo.toml rename to openinfer-qwen35-4b/Cargo.toml index 17006ff8..a0e07741 100644 --- a/pegainfer-qwen35-4b/Cargo.toml +++ b/openinfer-qwen35-4b/Cargo.toml @@ -1,12 +1,12 @@ [package] -name = "pegainfer-qwen35-4b" +name = "openinfer-qwen35-4b" version = "0.1.0" edition = "2024" autobenches = false [dependencies] -pegainfer-core = { workspace = true } -pegainfer-kernels = { workspace = true } +openinfer-core = { workspace = true } +openinfer-kernels 
= { workspace = true } anyhow = { workspace = true } cudarc = { workspace = true } fastrace = { workspace = true } @@ -20,7 +20,7 @@ tokio = { workspace = true, features = ["sync"] } [dev-dependencies] criterion = { workspace = true } -pegainfer-vllm-support = { workspace = true } +openinfer-vllm-support = { workspace = true } sha2 = { workspace = true } vllm-text = { workspace = true } diff --git a/pegainfer-qwen35-4b/benches/ops/common/mod.rs b/openinfer-qwen35-4b/benches/ops/common/mod.rs similarity index 99% rename from pegainfer-qwen35-4b/benches/ops/common/mod.rs rename to openinfer-qwen35-4b/benches/ops/common/mod.rs index 19f3acd8..792ba974 100644 --- a/pegainfer-qwen35-4b/benches/ops/common/mod.rs +++ b/openinfer-qwen35-4b/benches/ops/common/mod.rs @@ -7,7 +7,7 @@ use anyhow::{Result, anyhow}; use criterion::{Bencher, BenchmarkGroup, measurement::WallTime}; use cudarc::driver::CudaSlice; use half::bf16; -use pegainfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; +use openinfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; pub(crate) const VECTOR_DIM: usize = 1024; pub(crate) const OUT_DIM: usize = 1024; diff --git a/pegainfer-qwen35-4b/benches/ops/mod.rs b/openinfer-qwen35-4b/benches/ops/mod.rs similarity index 100% rename from pegainfer-qwen35-4b/benches/ops/mod.rs rename to openinfer-qwen35-4b/benches/ops/mod.rs diff --git a/pegainfer-qwen35-4b/benches/ops/qwen35_norm_bench.rs b/openinfer-qwen35-4b/benches/ops/qwen35_norm_bench.rs similarity index 95% rename from pegainfer-qwen35-4b/benches/ops/qwen35_norm_bench.rs rename to openinfer-qwen35-4b/benches/ops/qwen35_norm_bench.rs index ded36a0a..ea277c73 100644 --- a/pegainfer-qwen35-4b/benches/ops/qwen35_norm_bench.rs +++ b/openinfer-qwen35-4b/benches/ops/qwen35_norm_bench.rs @@ -1,6 +1,6 @@ use criterion::{BenchmarkId, Criterion, Throughput}; -use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; -use pegainfer_qwen35_4b::runtime_ops as ops; 
+use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_qwen35_4b::runtime_ops as ops; use super::common::{ EPS, QWEN35_4B_HIDDEN, configure_group, device_vec, hidden_states, iter_sync, diff --git a/pegainfer-qwen35-4b/benches/ops/qwen35_state_bench.rs b/openinfer-qwen35-4b/benches/ops/qwen35_state_bench.rs similarity index 93% rename from pegainfer-qwen35-4b/benches/ops/qwen35_state_bench.rs rename to openinfer-qwen35-4b/benches/ops/qwen35_state_bench.rs index f7156273..cd3cecd0 100644 --- a/pegainfer-qwen35-4b/benches/ops/qwen35_state_bench.rs +++ b/openinfer-qwen35-4b/benches/ops/qwen35_state_bench.rs @@ -1,7 +1,7 @@ use criterion::{BenchmarkId, Criterion, Throughput}; -use pegainfer_core::tensor::DeviceContext; -use pegainfer_qwen35_4b::prefill_buffers::GdrChunkwiseScratch35; -use pegainfer_qwen35_4b::runtime_ops as ops; +use openinfer_core::tensor::DeviceContext; +use openinfer_qwen35_4b::prefill_buffers::GdrChunkwiseScratch35; +use openinfer_qwen35_4b::runtime_ops as ops; use super::common::{ QWEN35_4B_LINEAR_K_DIM, QWEN35_4B_LINEAR_K_HEADS, QWEN35_4B_LINEAR_V_DIM, @@ -40,7 +40,7 @@ pub(crate) fn bench_qwen35_state_ops(c: &mut Criterion) { QWEN35_4B_LINEAR_V_HEADS * QWEN35_4B_LINEAR_K_DIM * QWEN35_4B_LINEAR_V_DIM, ) .expect("failed to allocate recurrent state"); - let mut recurrent_out = pegainfer_core::tensor::HiddenStates::zeros( + let mut recurrent_out = openinfer_core::tensor::HiddenStates::zeros( &ctx, QWEN35_4B_LINEAR_V_HEADS * QWEN35_4B_LINEAR_V_DIM, seq_len, diff --git a/pegainfer-qwen35-4b/benches/qwen35_ops.rs b/openinfer-qwen35-4b/benches/qwen35_ops.rs similarity index 100% rename from pegainfer-qwen35-4b/benches/qwen35_ops.rs rename to openinfer-qwen35-4b/benches/qwen35_ops.rs diff --git a/pegainfer-qwen35-4b/src/batch_decode.rs b/openinfer-qwen35-4b/src/batch_decode.rs similarity index 99% rename from pegainfer-qwen35-4b/src/batch_decode.rs rename to openinfer-qwen35-4b/src/batch_decode.rs index 689362d9..31cddaea 
100644 --- a/pegainfer-qwen35-4b/src/batch_decode.rs +++ b/openinfer-qwen35-4b/src/batch_decode.rs @@ -9,13 +9,13 @@ use super::decode_buffers::BatchDecodeBuffers35; use super::recurrent_state::RecurrentState; use super::weights::{FullAttentionLayer, LayerKind, LinearAttentionLayer, Qwen35Model}; use crate::ops; -use pegainfer_core::kv_pool::{KvLayout, KvState}; +use openinfer_core::kv_pool::{KvLayout, KvState}; impl Qwen35Model { pub(crate) fn select_tokens_batch_varied( &self, bufs: &mut BatchDecodeBuffers35, - params: &[&pegainfer_core::sampler::SamplingParams], + params: &[&openinfer_core::sampler::SamplingParams], rng: &mut rand::rngs::StdRng, ) -> Result> { let random_vals: Vec = params.iter().map(|_| rand::RngExt::random(rng)).collect(); diff --git a/pegainfer-qwen35-4b/src/batch_decode_graph.rs b/openinfer-qwen35-4b/src/batch_decode_graph.rs similarity index 97% rename from pegainfer-qwen35-4b/src/batch_decode_graph.rs rename to openinfer-qwen35-4b/src/batch_decode_graph.rs index 4ac9969b..31e4712c 100644 --- a/pegainfer-qwen35-4b/src/batch_decode_graph.rs +++ b/openinfer-qwen35-4b/src/batch_decode_graph.rs @@ -6,9 +6,9 @@ use anyhow::Result; -use pegainfer_core::cuda_graph::CudaGraphState; -use pegainfer_core::kv_pool::KvPool; -use pegainfer_core::tensor::DeviceContext; +use openinfer_core::cuda_graph::CudaGraphState; +use openinfer_core::kv_pool::KvPool; +use openinfer_core::tensor::DeviceContext; use super::config::Config35; use super::decode_buffers::BatchDecodeBuffers35; diff --git a/pegainfer-qwen35-4b/src/config.rs b/openinfer-qwen35-4b/src/config.rs similarity index 100% rename from pegainfer-qwen35-4b/src/config.rs rename to openinfer-qwen35-4b/src/config.rs diff --git a/pegainfer-qwen35-4b/src/decode_buffers.rs b/openinfer-qwen35-4b/src/decode_buffers.rs similarity index 98% rename from pegainfer-qwen35-4b/src/decode_buffers.rs rename to openinfer-qwen35-4b/src/decode_buffers.rs index 2aa6fd2c..b88338b7 100644 --- 
a/pegainfer-qwen35-4b/src/decode_buffers.rs +++ b/openinfer-qwen35-4b/src/decode_buffers.rs @@ -5,8 +5,8 @@ use anyhow::Result; use cudarc::driver::CudaSlice; use super::config::Config35; -use pegainfer_core::kv_pool::KvState; -use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_core::kv_pool::KvState; +use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; /// Pre-allocated GPU buffers for Qwen3.5 batch decode (N requests, 1 token each). pub(crate) struct BatchDecodeBuffers35 { diff --git a/pegainfer-qwen35-4b/src/executor.rs b/openinfer-qwen35-4b/src/executor.rs similarity index 98% rename from pegainfer-qwen35-4b/src/executor.rs rename to openinfer-qwen35-4b/src/executor.rs index 62a8f588..7b0cc3ae 100644 --- a/pegainfer-qwen35-4b/src/executor.rs +++ b/openinfer-qwen35-4b/src/executor.rs @@ -7,9 +7,9 @@ use std::collections::HashSet; use anyhow::Result; -use pegainfer_core::engine::TokenLogprob; -use pegainfer_core::kv_pool::KvState; -use pegainfer_core::tensor::DeviceVec; +use openinfer_core::engine::TokenLogprob; +use openinfer_core::kv_pool::KvState; +use openinfer_core::tensor::DeviceVec; use crate::batch_decode_graph::{BatchDecodeGraphState, MAX_BATCH}; use crate::recurrent_state::RecurrentState; diff --git a/openinfer-qwen35-4b/src/ffi.rs b/openinfer-qwen35-4b/src/ffi.rs new file mode 100644 index 00000000..a004025c --- /dev/null +++ b/openinfer-qwen35-4b/src/ffi.rs @@ -0,0 +1 @@ +pub(crate) use openinfer_core::ffi::*; diff --git a/pegainfer-qwen35-4b/src/kernel_plan.rs b/openinfer-qwen35-4b/src/kernel_plan.rs similarity index 99% rename from pegainfer-qwen35-4b/src/kernel_plan.rs rename to openinfer-qwen35-4b/src/kernel_plan.rs index b0818a97..4311dd89 100644 --- a/pegainfer-qwen35-4b/src/kernel_plan.rs +++ b/openinfer-qwen35-4b/src/kernel_plan.rs @@ -18,7 +18,7 @@ //! Use it like: //! //! ```ignore -//! use pegainfer_qwen35_4b::kernel_plan; +//! use openinfer_qwen35_4b::kernel_plan; //! 
for phase in kernel_plan().phases { //! for op in phase.ops { //! println!("[{}] {} -> {}", phase.name, op.id, op.backend); diff --git a/pegainfer-qwen35-4b/src/lib.rs b/openinfer-qwen35-4b/src/lib.rs similarity index 97% rename from pegainfer-qwen35-4b/src/lib.rs rename to openinfer-qwen35-4b/src/lib.rs index b919e82d..dc7a8e47 100644 --- a/pegainfer-qwen35-4b/src/lib.rs +++ b/openinfer-qwen35-4b/src/lib.rs @@ -18,7 +18,7 @@ mod weights; use std::path::Path; use anyhow::{Result, anyhow}; -use pegainfer_core::engine::{EngineHandle, EngineLoadOptions}; +use openinfer_core::engine::{EngineHandle, EngineLoadOptions}; pub use kernel_plan::kernel_plan; diff --git a/pegainfer-qwen35-4b/src/ops.rs b/openinfer-qwen35-4b/src/ops.rs similarity index 79% rename from pegainfer-qwen35-4b/src/ops.rs rename to openinfer-qwen35-4b/src/ops.rs index dcc74c3c..96b54152 100644 --- a/pegainfer-qwen35-4b/src/ops.rs +++ b/openinfer-qwen35-4b/src/ops.rs @@ -1,14 +1,14 @@ //! Qwen3.5 GPU operation wrappers. -pub(crate) use pegainfer_core::ops::PrefillPagedPlan; -pub(crate) use pegainfer_core::ops::{ +pub(crate) use openinfer_core::ops::PrefillPagedPlan; +pub(crate) use openinfer_core::ops::{ add_batch, add_batch_into, embedding_batch, extract_vec, extract_vec_into, flashinfer_topk_row_states_bytes, gemm, gemm_into, gpu_sample_into, linear, paged_attention_batch_decode_hd256_into, qk_norm_partial_rope_batched_decode_hd256_into, rms_norm_gated_batch_into, select_batch_tokens_into, silu_mul_batch, silu_mul_batch_into, write_vec_into, }; -pub use pegainfer_core::ops::{rms_norm_batch_offset_into, rms_norm_offset_into}; +pub use openinfer_core::ops::{rms_norm_batch_offset_into, rms_norm_offset_into}; pub use recurrent::gated_delta_rule_prefill_chunkwise_into; pub(crate) use recurrent::{ conv1d_decode_into, conv1d_prefill_batch_into, gated_delta_rule_decode_vec_into, diff --git a/pegainfer-qwen35-4b/src/prefill.rs b/openinfer-qwen35-4b/src/prefill.rs similarity index 99% rename from 
pegainfer-qwen35-4b/src/prefill.rs rename to openinfer-qwen35-4b/src/prefill.rs index d25e97b9..ee49058e 100644 --- a/pegainfer-qwen35-4b/src/prefill.rs +++ b/openinfer-qwen35-4b/src/prefill.rs @@ -16,8 +16,8 @@ use super::weights::{ use crate::ffi; use crate::ops; use crate::ops::PrefillPagedPlan; -use pegainfer_core::kv_pool::KvState; -use pegainfer_core::tensor::{DeviceVec, HiddenStates}; +use openinfer_core::kv_pool::KvState; +use openinfer_core::tensor::{DeviceVec, HiddenStates}; fn checked_prefill_end_pos( base_pos: usize, diff --git a/pegainfer-qwen35-4b/src/prefill_buffers.rs b/openinfer-qwen35-4b/src/prefill_buffers.rs similarity index 99% rename from pegainfer-qwen35-4b/src/prefill_buffers.rs rename to openinfer-qwen35-4b/src/prefill_buffers.rs index a57b7f4c..434dbbd5 100644 --- a/pegainfer-qwen35-4b/src/prefill_buffers.rs +++ b/openinfer-qwen35-4b/src/prefill_buffers.rs @@ -5,7 +5,7 @@ use cudarc::driver::CudaSlice; use half::bf16; use super::config::Config35; -use pegainfer_core::tensor::{DeviceContext, HiddenStates}; +use openinfer_core::tensor::{DeviceContext, HiddenStates}; /// Scratch buffers for a single Qwen3.5 linear-attention chunk-wise GDR prefill call. 
/// diff --git a/pegainfer-qwen35-4b/src/recurrent.rs b/openinfer-qwen35-4b/src/recurrent.rs similarity index 99% rename from pegainfer-qwen35-4b/src/recurrent.rs rename to openinfer-qwen35-4b/src/recurrent.rs index 08bb01c1..9bc207fa 100644 --- a/pegainfer-qwen35-4b/src/recurrent.rs +++ b/openinfer-qwen35-4b/src/recurrent.rs @@ -3,7 +3,7 @@ use cudarc::driver::{CudaSlice, DevicePtr, DevicePtrMut}; use crate::ffi; use crate::prefill_buffers::GdrChunkwiseScratch35; -use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; #[allow(clippy::too_many_arguments)] pub(crate) fn gated_delta_rule_decode_vec_into( @@ -481,7 +481,7 @@ mod tests { use half::bf16; use super::conv1d_prefill_batch_into; - use pegainfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; + use openinfer_core::tensor::{DeviceContext, DeviceVec, HiddenStates}; fn bf16_vec(data: &[f32]) -> Vec { data.iter().map(|&x| bf16::from_f32(x)).collect() diff --git a/pegainfer-qwen35-4b/src/recurrent_state.rs b/openinfer-qwen35-4b/src/recurrent_state.rs similarity index 97% rename from pegainfer-qwen35-4b/src/recurrent_state.rs rename to openinfer-qwen35-4b/src/recurrent_state.rs index 18a6bcc7..df32960f 100644 --- a/pegainfer-qwen35-4b/src/recurrent_state.rs +++ b/openinfer-qwen35-4b/src/recurrent_state.rs @@ -8,7 +8,7 @@ use anyhow::Result; use cudarc::driver::CudaSlice; use super::config::Config35; -use pegainfer_core::tensor::{DeviceContext, DeviceVec}; +use openinfer_core::tensor::{DeviceContext, DeviceVec}; /// Per-layer recurrent state for a single linear attention layer. 
pub(crate) struct LayerRecurrentState { diff --git a/pegainfer-qwen35-4b/src/scheduler.rs b/openinfer-qwen35-4b/src/scheduler.rs similarity index 99% rename from pegainfer-qwen35-4b/src/scheduler.rs rename to openinfer-qwen35-4b/src/scheduler.rs index 5d6efe52..ba2a38c1 100644 --- a/pegainfer-qwen35-4b/src/scheduler.rs +++ b/openinfer-qwen35-4b/src/scheduler.rs @@ -21,13 +21,13 @@ use tokio::sync::mpsc; use crate::batch_decode_graph::BatchDecodeGraphState; use crate::recurrent_state::RecurrentState; use crate::weights::Qwen35Model; -use pegainfer_core::engine::{ +use openinfer_core::engine::{ EngineHandle as SchedulerHandle, FinishReason, GenerateRequest as SchedulerRequest, TokenEvent, TokenLogprob, }; -use pegainfer_core::kv_pool::KvState; -use pegainfer_core::sampler::SamplingParams; -use pegainfer_core::tensor::DeviceVec; +use openinfer_core::kv_pool::KvState; +use openinfer_core::sampler::SamplingParams; +use openinfer_core::tensor::DeviceVec; use self::plan::{ ActiveKvBudget, ExecutionPlan, admit_pending_requests, compaction_after_retire, max_kv_tokens, diff --git a/pegainfer-qwen35-4b/src/scheduler/plan.rs b/openinfer-qwen35-4b/src/scheduler/plan.rs similarity index 100% rename from pegainfer-qwen35-4b/src/scheduler/plan.rs rename to openinfer-qwen35-4b/src/scheduler/plan.rs diff --git a/pegainfer-qwen35-4b/src/scheduler/tests.rs b/openinfer-qwen35-4b/src/scheduler/tests.rs similarity index 100% rename from pegainfer-qwen35-4b/src/scheduler/tests.rs rename to openinfer-qwen35-4b/src/scheduler/tests.rs diff --git a/pegainfer-qwen35-4b/src/unified_forward.rs b/openinfer-qwen35-4b/src/unified_forward.rs similarity index 97% rename from pegainfer-qwen35-4b/src/unified_forward.rs rename to openinfer-qwen35-4b/src/unified_forward.rs index 3bfa4715..31054511 100644 --- a/pegainfer-qwen35-4b/src/unified_forward.rs +++ b/openinfer-qwen35-4b/src/unified_forward.rs @@ -14,8 +14,8 @@ use super::batch_decode_graph::BatchDecodeGraphState; use 
super::recurrent_state::RecurrentState; use super::weights::Qwen35Model; use crate::ops; -use pegainfer_core::kv_pool::KvState; -use pegainfer_core::tensor::DeviceVec; +use openinfer_core::kv_pool::KvState; +use openinfer_core::tensor::DeviceVec; impl Qwen35Model { /// Prefill `n` prompts sequentially, updating each request's KV and recurrent state. @@ -102,19 +102,19 @@ mod tests { use std::path::Path; use super::*; - use pegainfer_core::kv_pool::KvState; + use openinfer_core::kv_pool::KvState; const MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3.5-4B"); fn get_model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping Qwen3.5 unified forward model test because {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping Qwen3.5 unified forward model test because {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ -123,7 +123,7 @@ mod tests { /// Sample a token from a DeviceVec logits using greedy (argmax). 
fn greedy_sample(model: &Qwen35Model, logits: &DeviceVec, _rng: &mut StdRng) -> u32 { - let params = pegainfer_core::sampler::SamplingParams::default(); + let params = openinfer_core::sampler::SamplingParams::default(); let mut probs: cudarc::driver::CudaSlice = model .ctx .stream diff --git a/pegainfer-qwen35-4b/src/weights.rs b/openinfer-qwen35-4b/src/weights.rs similarity index 97% rename from pegainfer-qwen35-4b/src/weights.rs rename to openinfer-qwen35-4b/src/weights.rs index 7301c433..2dda9785 100644 --- a/pegainfer-qwen35-4b/src/weights.rs +++ b/openinfer-qwen35-4b/src/weights.rs @@ -4,8 +4,8 @@ use log::{debug, info}; use std::time::Instant; use super::config::{Config35, LayerType}; -use pegainfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec}; -use pegainfer_core::weight_loader::{ +use openinfer_core::tensor::{DeviceContext, DeviceMatrix, DeviceVec}; +use openinfer_core::weight_loader::{ deserialize_shards, load_shard_info_fixed, load_tensor_1d, load_tensor_1d_f32, load_tensor_2d, mmap_shards, precompute_rope, }; @@ -80,7 +80,7 @@ pub struct Qwen35Model { pub(super) cos_cache: DeviceVec, pub(super) sin_cache: DeviceVec, /// Shared paged KV pool for full-attention layers. - pub(super) kv_pool: pegainfer_core::kv_pool::KvPool, + pub(super) kv_pool: openinfer_core::kv_pool::KvPool, } impl Qwen35Model { @@ -315,7 +315,7 @@ impl Qwen35Model { // Paged KV pool for the 8 full-attention layers. 
let page_size = 16usize; let num_full_layers = config.num_full_attention_layers(); - let layout = pegainfer_core::kv_pool::KvLayout::new( + let layout = openinfer_core::kv_pool::KvLayout::new( num_full_layers, config.num_key_value_heads, config.head_dim, @@ -339,7 +339,7 @@ impl Qwen35Model { kv_budget as f64 / free_bytes as f64 * 100.0, free_bytes as f64 / 1024.0 / 1024.0 ); - let kv_pool = pegainfer_core::kv_pool::KvPool::new( + let kv_pool = openinfer_core::kv_pool::KvPool::new( &ctx, num_full_layers, config.num_key_value_heads, @@ -378,11 +378,11 @@ impl Qwen35Model { &self.ctx } - pub(crate) fn alloc_kv(&self) -> pegainfer_core::kv_pool::KvState { + pub(crate) fn alloc_kv(&self) -> openinfer_core::kv_pool::KvState { self.kv_pool.alloc() } - pub(crate) fn kv_pool(&self) -> &pegainfer_core::kv_pool::KvPool { + pub(crate) fn kv_pool(&self) -> &openinfer_core::kv_pool::KvPool { &self.kv_pool } /// Create a CUDA Graph batch decode state with a custom slot capacity. diff --git a/pegainfer-qwen3-4b/tests/common/mod.rs b/openinfer-qwen35-4b/tests/common/mod.rs similarity index 78% rename from pegainfer-qwen3-4b/tests/common/mod.rs rename to openinfer-qwen35-4b/tests/common/mod.rs index 01eab133..6614d672 100644 --- a/pegainfer-qwen3-4b/tests/common/mod.rs +++ b/openinfer-qwen35-4b/tests/common/mod.rs @@ -1,6 +1,6 @@ use vllm_text::tokenizer::DynTokenizer; pub(crate) fn load_tokenizer(model_path: &str) -> DynTokenizer { - pegainfer_vllm_support::load_tokenizer(model_path) + openinfer_vllm_support::load_tokenizer(model_path) .unwrap_or_else(|err| panic!("Failed to load tokenizer for {model_path}: {err}")) } diff --git a/pegainfer-qwen35-4b/tests/e2e_scheduler.rs b/openinfer-qwen35-4b/tests/e2e_scheduler.rs similarity index 96% rename from pegainfer-qwen35-4b/tests/e2e_scheduler.rs rename to openinfer-qwen35-4b/tests/e2e_scheduler.rs index 29ffe9d5..a963f104 100644 --- a/pegainfer-qwen35-4b/tests/e2e_scheduler.rs +++ b/openinfer-qwen35-4b/tests/e2e_scheduler.rs @@ -7,9 
+7,9 @@ use std::time::Instant; use log::info; use tokio::sync::mpsc; -use pegainfer_core::engine::FinishReason; -use pegainfer_core::engine::{EngineHandle, GenerateRequest, TokenEvent}; -use pegainfer_core::sampler::SamplingParams; +use openinfer_core::engine::FinishReason; +use openinfer_core::engine::{EngineHandle, GenerateRequest, TokenEvent}; +use openinfer_core::sampler::SamplingParams; use vllm_text::tokenizer::DynTokenizer; mod common; @@ -70,7 +70,7 @@ const CASES: &[TestCase] = &[ ]; fn get_model_path() -> String { - std::env::var("PEGAINFER_TEST_MODEL_PATH").unwrap_or_else(|_| DEFAULT_MODEL_PATH.to_string()) + std::env::var("OPENINFER_TEST_MODEL_PATH").unwrap_or_else(|_| DEFAULT_MODEL_PATH.to_string()) } struct TestCase { @@ -127,11 +127,11 @@ fn test_e2e_qwen35_scheduler() { info!("Loading Qwen3.5 model for scheduler test..."); let start = Instant::now(); let model = - pegainfer_qwen35_4b::runtime::Qwen35Model::from_safetensors_with_options(&model_path, true) + openinfer_qwen35_4b::runtime::Qwen35Model::from_safetensors_with_options(&model_path, true) .expect("Failed to load model"); let tokenizer = common::load_tokenizer(&model_path); // Use reduced batch capacity (8) to fit on 16GB GPUs alongside the model. - let handle = pegainfer_qwen35_4b::runtime::start_with_capacity(model, 42, 8) + let handle = openinfer_qwen35_4b::runtime::start_with_capacity(model, 42, 8) .expect("Failed to start Qwen3.5 scheduler"); info!("scheduler loaded in {:.2?}", start.elapsed()); diff --git a/pegainfer-qwen35-4b/tests/hf_golden_gate.rs b/openinfer-qwen35-4b/tests/hf_golden_gate.rs similarity index 96% rename from pegainfer-qwen35-4b/tests/hf_golden_gate.rs rename to openinfer-qwen35-4b/tests/hf_golden_gate.rs index b794364b..0dea0836 100644 --- a/pegainfer-qwen35-4b/tests/hf_golden_gate.rs +++ b/openinfer-qwen35-4b/tests/hf_golden_gate.rs @@ -3,9 +3,9 @@ //! This is the Qwen3.5 instance of the reusable logits-golden method in //! 
`docs/subsystems/correctness/logits-golden-gate.md`: store HF bf16 top-K //! logprobs for fixed teacher-forced token sequences, replay those sequences -//! through pegainfer, and compare bounded logprob drift instead of exact text. +//! through openinfer, and compare bounded logprob drift instead of exact text. //! The Qwen3.5 fixture is produced through HF's incremental `past_key_values` -//! path so the oracle matches pegainfer's prefill + decode shape. +//! path so the oracle matches openinfer's prefill + decode shape. //! //! Qwen3.5 currently has no eager batched decode path; decode goes through the //! CUDA-graph bucketed path. This gate therefore covers sequential bs=1, @@ -15,8 +15,8 @@ use std::collections::HashMap; use std::path::{Path, PathBuf}; -use pegainfer_core::engine::TokenLogprob; -use pegainfer_qwen35_4b::runtime::{ +use openinfer_core::engine::TokenLogprob; +use openinfer_qwen35_4b::runtime::{ DecodePlan, DecodeStepItem, PrefillPlan, PrefillStepItem, Qwen35Executor, RequestId, }; use safetensors::{Dtype, SafeTensors}; @@ -31,7 +31,7 @@ const LONG_GOLDEN: &str = concat!( env!("CARGO_MANIFEST_DIR"), "/../test_data/qwen35-4b-hf-long-golden.safetensors" ); -const GOLDEN_ENV: &str = "PEGAINFER_QWEN35_HF_GOLDEN"; +const GOLDEN_ENV: &str = "OPENINFER_QWEN35_HF_GOLDEN"; const LOGPROBS: usize = 64; const MAX_EXECUTOR_BATCH: usize = 8; @@ -49,14 +49,14 @@ const SLOT_COMPACTION_BATCH: usize = 5; const SLOT_COMPACTION_DROP_INDEX: usize = 1; fn model_path_or_skip() -> Option { - match std::env::var("PEGAINFER_TEST_MODEL_PATH") { + match std::env::var("OPENINFER_TEST_MODEL_PATH") { Ok(path) => Some(path), Err(_) if Path::new(MODEL_PATH).join("config.json").exists() => { Some(MODEL_PATH.to_string()) } Err(_) => { eprintln!( - "skipping qwen35 hf_golden_gate: {MODEL_PATH}/config.json is missing; set PEGAINFER_TEST_MODEL_PATH to run it" + "skipping qwen35 hf_golden_gate: {MODEL_PATH}/config.json is missing; set OPENINFER_TEST_MODEL_PATH to run it" ); None } @@ 
-99,7 +99,7 @@ fn safetensors_metadata(bytes: &[u8]) -> HashMap { } fn model_revision(model_path: &str) -> Option { - if let Ok(value) = std::env::var("PEGAINFER_TEST_MODEL_REVISION") { + if let Ok(value) = std::env::var("OPENINFER_TEST_MODEL_REVISION") { return Some(value); } let path = Path::new(model_path); @@ -195,7 +195,7 @@ fn check_fixture_metadata(model_path: &str, metadata: &HashMap) }; assert_eq!( actual_revision, expected_revision, - "qwen35 hf_golden_gate model revision mismatch; set PEGAINFER_TEST_MODEL_REVISION or use the fixture's model snapshot" + "qwen35 hf_golden_gate model revision mismatch; set OPENINFER_TEST_MODEL_REVISION or use the fixture's model snapshot" ); if let Some(expected_tokenizer_revision) = metadata.get("tokenizer_revision") { @@ -259,11 +259,11 @@ fn check_position( match hf_map.get(&pega_argmax) { None => stats.argmax_violations.push(format!( - "seq {seq} pos {pos}: pegainfer argmax {pega_argmax} absent from HF top-{}", + "seq {seq} pos {pos}: openinfer argmax {pega_argmax} absent from HF top-{}", hf.len() )), Some(&hlp) if hf_top - hlp > MARGIN_TOL => stats.argmax_violations.push(format!( - "seq {seq} pos {pos}: pegainfer chose {pega_argmax}, HF scores it {:.4} nat below its argmax", + "seq {seq} pos {pos}: openinfer chose {pega_argmax}, HF scores it {:.4} nat below its argmax", hf_top - hlp )), Some(_) => {} @@ -580,7 +580,7 @@ fn report_and_assert(label: &str, stats: &Stats) { assert!( stats.argmax_violations.is_empty(), - "[{label}] pegainfer picked a token HF does not rank near its best:\n {}", + "[{label}] openinfer picked a token HF does not rank near its best:\n {}", stats.argmax_violations.join("\n ") ); assert!( diff --git a/pegainfer-server/Cargo.toml b/openinfer-server/Cargo.toml similarity index 56% rename from pegainfer-server/Cargo.toml rename to openinfer-server/Cargo.toml index 15f16f2a..5d95008f 100644 --- a/pegainfer-server/Cargo.toml +++ b/openinfer-server/Cargo.toml @@ -1,16 +1,16 @@ [package] autobenches = 
false -name = "pegainfer-server" -default-run = "pegainfer" +name = "openinfer-server" +default-run = "openinfer" version = "0.1.0" edition = "2024" [lib] -name = "pegainfer" +name = "openinfer" path = "src/lib.rs" [[bin]] -name = "pegainfer" +name = "openinfer" path = "src/main.rs" [[bin]] @@ -18,14 +18,14 @@ name = "bench_serving" path = "src/bin/bench_serving.rs" [dependencies] -pegainfer-core = { workspace = true } -pegainfer-deepseek-v2-lite = { workspace = true, optional = true } -pegainfer-deepseek-v4 = { workspace = true, optional = true } -pegainfer-kimi-k2 = { workspace = true, optional = true } -pegainfer-qwen3-4b = { workspace = true } -pegainfer-qwen35-4b = { workspace = true } -pegainfer-vllm-frontend = { workspace = true } -pegainfer-vllm-support = { workspace = true } +openinfer-core = { workspace = true } +openinfer-deepseek-v2-lite = { workspace = true, optional = true } +openinfer-deepseek-v4 = { workspace = true, optional = true } +openinfer-kimi-k2 = { workspace = true, optional = true } +openinfer-qwen3-4b = { workspace = true } +openinfer-qwen35-4b = { workspace = true } +openinfer-vllm-frontend = { workspace = true } +openinfer-vllm-support = { workspace = true } anyhow = { workspace = true } clap = { workspace = true } colored = { workspace = true } @@ -46,13 +46,13 @@ tikv-jemallocator = "0.6" [features] default = [] -deepseek-v4 = ["dep:pegainfer-deepseek-v4", "pegainfer-deepseek-v4/deepseek-v4"] -deepseek-v2-lite = ["dep:pegainfer-deepseek-v2-lite", "pegainfer-deepseek-v2-lite/deepseek-v2-lite"] -kimi-k2 = ["dep:pegainfer-kimi-k2", "pegainfer-kimi-k2/kimi-k2"] -pplx-ep = ["deepseek-v4", "pegainfer-deepseek-v4/pplx-ep"] +deepseek-v4 = ["dep:openinfer-deepseek-v4", "openinfer-deepseek-v4/deepseek-v4"] +deepseek-v2-lite = ["dep:openinfer-deepseek-v2-lite", "openinfer-deepseek-v2-lite/deepseek-v2-lite"] +kimi-k2 = ["dep:openinfer-kimi-k2", "openinfer-kimi-k2/kimi-k2"] +pplx-ep = ["deepseek-v4", "openinfer-deepseek-v4/pplx-ep"] 
deepseek-v4-cutedsl-diagnostic = [ "deepseek-v4", - "pegainfer-deepseek-v4/deepseek-v4-cutedsl-diagnostic", + "openinfer-deepseek-v4/deepseek-v4-cutedsl-diagnostic", ] [dev-dependencies] diff --git a/pegainfer-server/benches/nccl_bench.rs b/openinfer-server/benches/nccl_bench.rs similarity index 97% rename from pegainfer-server/benches/nccl_bench.rs rename to openinfer-server/benches/nccl_bench.rs index 214626ef..c450211f 100644 --- a/pegainfer-server/benches/nccl_bench.rs +++ b/openinfer-server/benches/nccl_bench.rs @@ -1,4 +1,4 @@ -//! NCCL all-reduce microbenchmark for the pegainfer TP=2 bring-up path. +//! NCCL all-reduce microbenchmark for the openinfer TP=2 bring-up path. //! //! Reference data below was measured on a dual-GPU PCIe 4.0 setup //! (`4090-3`, 2x RTX 4090, NCCL 2.28.3, BF16, in-place all-reduce): @@ -24,7 +24,7 @@ use cudarc::nccl::{ safe::{Comm, group_end, group_start}, }; use half::bf16; -use pegainfer::tensor::DeviceContext; +use openinfer::tensor::DeviceContext; const PAYLOAD_BYTES: &[usize] = &[ 4 * 1024, diff --git a/pegainfer-server/benches/ops/common/mod.rs b/openinfer-server/benches/ops/common/mod.rs similarity index 98% rename from pegainfer-server/benches/ops/common/mod.rs rename to openinfer-server/benches/ops/common/mod.rs index 9d77eb19..84cb5b25 100644 --- a/pegainfer-server/benches/ops/common/mod.rs +++ b/openinfer-server/benches/ops/common/mod.rs @@ -7,7 +7,7 @@ use anyhow::{Result, anyhow}; use criterion::{Bencher, BenchmarkGroup, measurement::WallTime}; use cudarc::driver::CudaSlice; use half::bf16; -use pegainfer::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; +use openinfer::tensor::{DeviceContext, DeviceMatrix, DeviceVec, HiddenStates}; pub(crate) const VECTOR_DIM: usize = 1024; pub(crate) const OUT_DIM: usize = 1024; diff --git a/pegainfer-server/benches/ops/mod.rs b/openinfer-server/benches/ops/mod.rs similarity index 100% rename from pegainfer-server/benches/ops/mod.rs rename to 
openinfer-server/benches/ops/mod.rs diff --git a/pegainfer-server/benches/ops/ops_batched_bench.rs b/openinfer-server/benches/ops/ops_batched_bench.rs similarity index 97% rename from pegainfer-server/benches/ops/ops_batched_bench.rs rename to openinfer-server/benches/ops/ops_batched_bench.rs index 59cfe7fd..f4b9aafb 100644 --- a/pegainfer-server/benches/ops/ops_batched_bench.rs +++ b/openinfer-server/benches/ops/ops_batched_bench.rs @@ -1,8 +1,8 @@ use std::hint::black_box; use criterion::{BenchmarkId, Criterion, Throughput}; -use pegainfer::ops; -use pegainfer::tensor::DeviceContext; +use openinfer::ops; +use openinfer::tensor::DeviceContext; use super::common::{ BATCH_SEQ_LEN, OUT_DIM, VECTOR_DIM, configure_group, device_matrix, hidden_states, iter_sync, diff --git a/pegainfer-server/benches/ops/ops_elementwise_bench.rs b/openinfer-server/benches/ops/ops_elementwise_bench.rs similarity index 97% rename from pegainfer-server/benches/ops/ops_elementwise_bench.rs rename to openinfer-server/benches/ops/ops_elementwise_bench.rs index aad5ae9e..2cbf483f 100644 --- a/pegainfer-server/benches/ops/ops_elementwise_bench.rs +++ b/openinfer-server/benches/ops/ops_elementwise_bench.rs @@ -1,6 +1,6 @@ use criterion::{BenchmarkId, Criterion, Throughput}; -use pegainfer::ops; -use pegainfer::tensor::{DeviceContext, DeviceVec}; +use openinfer::ops; +use openinfer::tensor::{DeviceContext, DeviceVec}; use super::common::{ EPS, OUT_DIM, VECTOR_DIM, configure_group, device_matrix, device_vec, iter_sync, diff --git a/pegainfer-server/benches/ops/ops_embedding_sampling_bench.rs b/openinfer-server/benches/ops/ops_embedding_sampling_bench.rs similarity index 97% rename from pegainfer-server/benches/ops/ops_embedding_sampling_bench.rs rename to openinfer-server/benches/ops/ops_embedding_sampling_bench.rs index abb2fce0..7fefafb0 100644 --- a/pegainfer-server/benches/ops/ops_embedding_sampling_bench.rs +++ b/openinfer-server/benches/ops/ops_embedding_sampling_bench.rs @@ -3,9 +3,9 @@ use 
std::hint::black_box; use cudarc::driver::CudaSlice; use half::bf16; -use pegainfer::ops; -use pegainfer::sampler::SamplingParams; -use pegainfer::tensor::{DeviceContext, DeviceVec, HiddenStates}; +use openinfer::ops; +use openinfer::sampler::SamplingParams; +use openinfer::tensor::{DeviceContext, DeviceVec, HiddenStates}; use super::common::{ BATCH_SEQ_LEN, VECTOR_DIM, VOCAB_SIZE, configure_group, decode_token_id, embedding_matrix, @@ -112,7 +112,7 @@ pub(crate) fn bench_embedding_sampling_ops(c: &mut Criterion) { .expect("failed to allocate top1 value scratch"); let mut row_states: CudaSlice = ctx .stream - .alloc_zeros(pegainfer::ops::flashinfer_topk_row_states_bytes()) + .alloc_zeros(openinfer::ops::flashinfer_topk_row_states_bytes()) .expect("failed to allocate row state scratch"); let mut valid: CudaSlice = ctx .stream @@ -182,7 +182,7 @@ pub(crate) fn bench_embedding_sampling_ops(c: &mut Criterion) { .expect("failed to allocate top1 value scratch"); let mut row_states: CudaSlice = ctx .stream - .alloc_zeros(pegainfer::ops::flashinfer_topk_row_states_bytes()) + .alloc_zeros(openinfer::ops::flashinfer_topk_row_states_bytes()) .expect("failed to allocate row state scratch"); let mut valid: CudaSlice = ctx .stream diff --git a/pegainfer-server/benches/ops_bench.rs b/openinfer-server/benches/ops_bench.rs similarity index 100% rename from pegainfer-server/benches/ops_bench.rs rename to openinfer-server/benches/ops_bench.rs diff --git a/pegainfer-server/src/bin/bench_serving.rs b/openinfer-server/src/bin/bench_serving.rs similarity index 98% rename from pegainfer-server/src/bin/bench_serving.rs rename to openinfer-server/src/bin/bench_serving.rs index f924c273..96aa1c65 100644 --- a/pegainfer-server/src/bin/bench_serving.rs +++ b/openinfer-server/src/bin/bench_serving.rs @@ -24,15 +24,15 @@ use comfy_table::{Cell, CellAlignment, Table}; use cudarc::driver::Profiler; use cudarc::runtime::result::device as cuda_device; use log::{debug, info}; -use pegainfer::logging; 
-use pegainfer::sampler::SamplingParams; -use pegainfer::scheduler::{SchedulerHandle, SchedulerRequest, TokenEvent}; -use pegainfer::server_engine::{ModelType, detect_model_type}; -use pegainfer_core::{ +use openinfer::logging; +use openinfer::sampler::SamplingParams; +use openinfer::scheduler::{SchedulerHandle, SchedulerRequest, TokenEvent}; +use openinfer::server_engine::{ModelType, detect_model_type}; +use openinfer_core::{ engine::{EngineLoadOptions, EpBackend}, parallel::ParallelConfig, }; -use pegainfer_vllm_support::load_tokenizer as load_vllm_tokenizer; +use openinfer_vllm_support::load_tokenizer as load_vllm_tokenizer; use rand::RngExt; use rand::SeedableRng; use rand::rngs::StdRng; @@ -141,7 +141,7 @@ enum Command { #[derive(Parser, Debug)] #[command( name = "bench_serving", - about = "pegainfer in-process inference benchmark", + about = "openinfer in-process inference benchmark", after_help = TOP_LEVEL_EXAMPLES )] struct Cli { @@ -1125,7 +1125,7 @@ impl BenchModel for SchedulerBenchModel { #[cfg(feature = "deepseek-v2-lite")] struct DeepSeekV2LiteBenchModel { - generator: pegainfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator, + generator: openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator, } #[cfg(feature = "deepseek-v2-lite")] @@ -1261,7 +1261,7 @@ fn timings_from_dsv2_lite_attribution( #[cfg(feature = "deepseek-v2-lite")] fn timings_from_dsv2_lite_batched_generation( - result: pegainfer_deepseek_v2_lite::BatchedGenerationResult, + result: openinfer_deepseek_v2_lite::BatchedGenerationResult, expected_generated_tokens: usize, ) -> Vec { let batch_size = result.tokens.len(); @@ -2167,7 +2167,7 @@ fn main() -> Result<()> { match model_type { #[cfg(feature = "deepseek-v2-lite")] ModelType::DeepSeekV2Lite => { - let generator = pegainfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator::load( + let generator = openinfer_deepseek_v2_lite::DeepSeekV2LiteEp2Generator::load( Path::new(&cli.model_path), EngineLoadOptions { enable_cuda_graph: false, @@ -2185,7 
+2185,7 @@ fn main() -> Result<()> { } #[cfg(feature = "deepseek-v4")] ModelType::DeepSeekV4 => { - let handle = pegainfer_deepseek_v4::start_engine( + let handle = openinfer_deepseek_v4::start_engine( Path::new(&cli.model_path), EngineLoadOptions { enable_cuda_graph: false, @@ -2204,7 +2204,7 @@ fn main() -> Result<()> { #[cfg(feature = "kimi-k2")] ModelType::KimiK2 => { let parallel = kimi_parallel_config(cli.tp_size, cli.dp_size)?; - let handle = pegainfer_kimi_k2::start_engine( + let handle = openinfer_kimi_k2::start_engine( Path::new(&cli.model_path), EngineLoadOptions { enable_cuda_graph: cli.cuda_graph, @@ -2228,7 +2228,7 @@ fn main() -> Result<()> { ) } ModelType::Qwen3 => { - let handle = pegainfer_qwen3_4b::start_engine( + let handle = openinfer_qwen3_4b::start_engine( Path::new(&cli.model_path), EngineLoadOptions { enable_cuda_graph: cli.cuda_graph, @@ -2252,7 +2252,7 @@ fn main() -> Result<()> { ) } ModelType::Qwen35 => { - let handle = pegainfer_qwen35_4b::start_engine_with_capacity( + let handle = openinfer_qwen35_4b::start_engine_with_capacity( Path::new(&cli.model_path), EngineLoadOptions { enable_cuda_graph: cli.cuda_graph, @@ -2342,12 +2342,12 @@ mod tests { #[test] fn dsv2_lite_batched_timings_use_shared_decode_time_for_rate() { let timings = timings_from_dsv2_lite_batched_generation( - pegainfer_deepseek_v2_lite::BatchedGenerationResult { + openinfer_deepseek_v2_lite::BatchedGenerationResult { tokens: vec![vec![11, 304, 608], vec![11, 304, 608]], prefill_next_token_us: vec![20_000, 21_000], per_token_decode_us: vec![19_000, 18_000], total_generation_us: 80_000, - stats: pegainfer_deepseek_v2_lite::GenerationStats::default(), + stats: openinfer_deepseek_v2_lite::GenerationStats::default(), }, 3, ); diff --git a/pegainfer-server/src/lib.rs b/openinfer-server/src/lib.rs similarity index 80% rename from pegainfer-server/src/lib.rs rename to openinfer-server/src/lib.rs index add4cbc3..5f773c69 100644 --- a/pegainfer-server/src/lib.rs +++ 
b/openinfer-server/src/lib.rs @@ -1,4 +1,4 @@ -pub use pegainfer_core::logging; +pub use openinfer_core::logging; pub mod ops; pub mod sampler; pub mod scheduler; diff --git a/pegainfer-server/src/main.rs b/openinfer-server/src/main.rs similarity index 93% rename from pegainfer-server/src/main.rs rename to openinfer-server/src/main.rs index bb017acb..25ecdfb3 100644 --- a/pegainfer-server/src/main.rs +++ b/openinfer-server/src/main.rs @@ -4,13 +4,13 @@ use std::time::Instant; use anyhow::{Context, bail}; use clap::{Parser, ValueEnum}; use log::info; -use pegainfer::logging; -use pegainfer::server_engine::{ModelType, detect_model_type}; -use pegainfer::vllm_frontend::LoraModule; -use pegainfer_core::engine::{EngineLoadOptions, EpBackend}; +use openinfer::logging; +use openinfer::server_engine::{ModelType, detect_model_type}; +use openinfer::vllm_frontend::LoraModule; +use openinfer_core::engine::{EngineLoadOptions, EpBackend}; #[cfg(feature = "kimi-k2")] -use pegainfer_core::parallel::ParallelConfig; -use pegainfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions}; +use openinfer_core::parallel::ParallelConfig; +use openinfer_qwen3_4b::{Qwen3LoraOptions, Qwen3OffloadOptions}; #[cfg(not(target_env = "msvc"))] #[global_allocator] @@ -19,7 +19,7 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; const DEFAULT_MODEL_PATH: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/../models/Qwen3-4B"); #[derive(Parser)] -#[command(name = "pegainfer", about = "Qwen3/3.5 GPU inference server")] +#[command(name = "openinfer", about = "Qwen3/3.5 GPU inference server")] struct Args { /// Model directory containing config, tokenizer, and safetensor shards #[arg(long, default_value = DEFAULT_MODEL_PATH)] @@ -140,7 +140,7 @@ async fn main() -> anyhow::Result<()> { ModelType::Qwen3 | ModelType::Qwen35 => args.cuda_graph, }; - info!("=== pegainfer - {} (GPU) ===", model_type); + info!("=== openinfer - {} (GPU) ===", model_type); info!("Loading engine..."); let start = 
Instant::now(); info!( @@ -158,7 +158,7 @@ async fn main() -> anyhow::Result<()> { let handle = match model_type { #[cfg(feature = "deepseek-v4")] ModelType::DeepSeekV4 => { - let handle = pegainfer_deepseek_v4::start_engine( + let handle = openinfer_deepseek_v4::start_engine( &args.model_path, EngineLoadOptions { enable_cuda_graph: false, @@ -175,7 +175,7 @@ async fn main() -> anyhow::Result<()> { } #[cfg(feature = "deepseek-v2-lite")] ModelType::DeepSeekV2Lite => { - let handle = pegainfer_deepseek_v2_lite::start_engine( + let handle = openinfer_deepseek_v2_lite::start_engine( &args.model_path, EngineLoadOptions { enable_cuda_graph: false, @@ -196,7 +196,7 @@ async fn main() -> anyhow::Result<()> { "EP options: dp_size={}, ep_backend={:?}", args.dp_size, args.ep_backend ); - let handle = pegainfer_kimi_k2::start_engine( + let handle = openinfer_kimi_k2::start_engine( &args.model_path, EngineLoadOptions { enable_cuda_graph: args.cuda_graph, @@ -244,7 +244,7 @@ async fn main() -> anyhow::Result<()> { "Starting Qwen3 engine with LoRA control; CUDA Graph is disabled; max_loras={}, max_lora_rank={}", lora_options.max_loras, lora_options.max_lora_rank ); - pegainfer_qwen3_4b::start_engine_with_lora_control( + openinfer_qwen3_4b::start_engine_with_lora_control( &args.model_path, options, lora_options, @@ -252,7 +252,7 @@ async fn main() -> anyhow::Result<()> { args.no_prefix_cache, ) } else { - pegainfer_qwen3_4b::start_engine_with_offload( + openinfer_qwen3_4b::start_engine_with_offload( &args.model_path, options, offload, @@ -264,7 +264,7 @@ async fn main() -> anyhow::Result<()> { handle } ModelType::Qwen35 => { - let handle = pegainfer_qwen35_4b::start_engine( + let handle = openinfer_qwen35_4b::start_engine( &args.model_path, EngineLoadOptions { enable_cuda_graph: args.cuda_graph, @@ -285,24 +285,24 @@ async fn main() -> anyhow::Result<()> { if args.enable_lora { let max_model_len = - pegainfer::vllm_frontend::load_max_model_len(&args.model_path).unwrap_or(4096); - 
pegainfer::vllm_frontend::serve_model_with_lora_routes( + openinfer::vllm_frontend::load_max_model_len(&args.model_path).unwrap_or(4096); + openinfer::vllm_frontend::serve_model_with_lora_routes( handle, args.model_path.to_string_lossy().into_owned(), args.served_model_name.into_iter().collect(), args.lora_modules, args.port, max_model_len, - pegainfer::vllm_frontend::shutdown_token_from_ctrl_c(), + openinfer::vllm_frontend::shutdown_token_from_ctrl_c(), ) .await } else { - pegainfer::vllm_frontend::serve( + openinfer::vllm_frontend::serve( handle, &args.model_path, args.served_model_name.as_deref(), args.port, - pegainfer::vllm_frontend::shutdown_token_from_ctrl_c(), + openinfer::vllm_frontend::shutdown_token_from_ctrl_c(), ) .await } diff --git a/pegainfer-server/src/ops.rs b/openinfer-server/src/ops.rs similarity index 95% rename from pegainfer-server/src/ops.rs rename to openinfer-server/src/ops.rs index 102fb5c8..6d7ffe92 100644 --- a/pegainfer-server/src/ops.rs +++ b/openinfer-server/src/ops.rs @@ -3,7 +3,7 @@ #[cfg(test)] mod tests; -pub use pegainfer_core::ops::{ +pub use openinfer_core::ops::{ add_batch, add_batch_into, argmax, embedding_batch, embedding_decode_into, extract_vec, extract_vec_into, flashinfer_topk_row_states_bytes, fused_add_rms_norm_batch_into, fused_add_rms_norm_into, gemm, gemm_into, gemm_rows_into, gemv, gpu_sample, gpu_sample_into, diff --git a/pegainfer-server/src/ops/tests.rs b/openinfer-server/src/ops/tests.rs similarity index 100% rename from pegainfer-server/src/ops/tests.rs rename to openinfer-server/src/ops/tests.rs diff --git a/openinfer-server/src/sampler.rs b/openinfer-server/src/sampler.rs new file mode 100644 index 00000000..f5c7ec2d --- /dev/null +++ b/openinfer-server/src/sampler.rs @@ -0,0 +1 @@ +pub use openinfer_core::sampler::*; diff --git a/pegainfer-server/src/scheduler.rs b/openinfer-server/src/scheduler.rs similarity index 72% rename from pegainfer-server/src/scheduler.rs rename to 
openinfer-server/src/scheduler.rs index 7ce6c4f6..22700dae 100644 --- a/pegainfer-server/src/scheduler.rs +++ b/openinfer-server/src/scheduler.rs @@ -1,3 +1,3 @@ -pub use pegainfer_core::engine::{ +pub use openinfer_core::engine::{ EngineHandle as SchedulerHandle, GenerateRequest as SchedulerRequest, TokenEvent, }; diff --git a/pegainfer-server/src/server_engine.rs b/openinfer-server/src/server_engine.rs similarity index 89% rename from pegainfer-server/src/server_engine.rs rename to openinfer-server/src/server_engine.rs index d36e3fb2..fcddc08e 100644 --- a/pegainfer-server/src/server_engine.rs +++ b/openinfer-server/src/server_engine.rs @@ -2,7 +2,7 @@ use std::{fmt, path::Path}; use anyhow::{Context, Result}; -pub use pegainfer_core::engine::{FinishReason, TokenLogprob}; +pub use openinfer_core::engine::{FinishReason, TokenLogprob}; // ── Model type detection ──────────────────────────────────────────────── @@ -48,13 +48,13 @@ pub fn detect_model_type(model_path: impl AsRef) -> Result { { #[cfg(feature = "deepseek-v2-lite")] { - pegainfer_deepseek_v2_lite::probe_config_json(&json)?; + openinfer_deepseek_v2_lite::probe_config_json(&json)?; return Ok(ModelType::DeepSeekV2Lite); } #[cfg(not(feature = "deepseek-v2-lite"))] { anyhow::bail!( - "DeepSeek-V2-Lite support is feature-gated; rebuild pegainfer-server with --features deepseek-v2-lite" + "DeepSeek-V2-Lite support is feature-gated; rebuild openinfer-server with --features deepseek-v2-lite" ); } } @@ -68,7 +68,7 @@ pub fn detect_model_type(model_path: impl AsRef) -> Result { return Ok(ModelType::DeepSeekV4); #[cfg(not(feature = "deepseek-v4"))] anyhow::bail!( - "DeepSeek V4 support is feature-gated; rebuild pegainfer-server with --features deepseek-v4" + "DeepSeek V4 support is feature-gated; rebuild openinfer-server with --features deepseek-v4" ); } @@ -84,12 +84,12 @@ pub fn detect_model_type(model_path: impl AsRef) -> Result { { #[cfg(feature = "kimi-k2")] { - pegainfer_kimi_k2::probe_config_json(&json)?; + 
openinfer_kimi_k2::probe_config_json(&json)?; return Ok(ModelType::KimiK2); } #[cfg(not(feature = "kimi-k2"))] anyhow::bail!( - "Kimi-K2 support is feature-gated; rebuild pegainfer-server with --features kimi-k2" + "Kimi-K2 support is feature-gated; rebuild openinfer-server with --features kimi-k2" ); } diff --git a/openinfer-server/src/tensor.rs b/openinfer-server/src/tensor.rs new file mode 100644 index 00000000..b1eb1edd --- /dev/null +++ b/openinfer-server/src/tensor.rs @@ -0,0 +1 @@ +pub use openinfer_core::tensor::*; diff --git a/pegainfer-server/src/trace_reporter.rs b/openinfer-server/src/trace_reporter.rs similarity index 100% rename from pegainfer-server/src/trace_reporter.rs rename to openinfer-server/src/trace_reporter.rs diff --git a/openinfer-server/src/vllm_frontend.rs b/openinfer-server/src/vllm_frontend.rs new file mode 100644 index 00000000..4ae4a42e --- /dev/null +++ b/openinfer-server/src/vllm_frontend.rs @@ -0,0 +1 @@ +pub use openinfer_vllm_frontend::*; diff --git a/openinfer-server/src/weight_loader.rs b/openinfer-server/src/weight_loader.rs new file mode 100644 index 00000000..fb410bff --- /dev/null +++ b/openinfer-server/src/weight_loader.rs @@ -0,0 +1 @@ +pub use openinfer_core::weight_loader::*; diff --git a/pegainfer-qwen35-4b/tests/common/mod.rs b/openinfer-server/tests/common/mod.rs similarity index 78% rename from pegainfer-qwen35-4b/tests/common/mod.rs rename to openinfer-server/tests/common/mod.rs index 01eab133..6614d672 100644 --- a/pegainfer-qwen35-4b/tests/common/mod.rs +++ b/openinfer-server/tests/common/mod.rs @@ -1,6 +1,6 @@ use vllm_text::tokenizer::DynTokenizer; pub(crate) fn load_tokenizer(model_path: &str) -> DynTokenizer { - pegainfer_vllm_support::load_tokenizer(model_path) + openinfer_vllm_support::load_tokenizer(model_path) .unwrap_or_else(|err| panic!("Failed to load tokenizer for {model_path}: {err}")) } diff --git a/pegainfer-sim/Cargo.toml b/openinfer-sim/Cargo.toml similarity index 73% rename from 
pegainfer-sim/Cargo.toml rename to openinfer-sim/Cargo.toml index f0e95f9d..c93746ba 100644 --- a/pegainfer-sim/Cargo.toml +++ b/openinfer-sim/Cargo.toml @@ -1,17 +1,17 @@ [package] -name = "pegainfer-sim" +name = "openinfer-sim" version = "0.1.0" edition = "2024" [[bin]] -name = "pegainfer-sim" +name = "openinfer-sim" path = "src/main.rs" [dependencies] anyhow = { workspace = true } clap = { workspace = true } -pegainfer-engine = { workspace = true } -pegainfer-vllm-frontend = { workspace = true } +openinfer-engine = { workspace = true } +openinfer-vllm-frontend = { workspace = true } tokio = { workspace = true, features = ["full"] } [dev-dependencies] diff --git a/pegainfer-sim/src/lib.rs b/openinfer-sim/src/lib.rs similarity index 98% rename from pegainfer-sim/src/lib.rs rename to openinfer-sim/src/lib.rs index f947089d..a7787b40 100644 --- a/pegainfer-sim/src/lib.rs +++ b/openinfer-sim/src/lib.rs @@ -1,7 +1,7 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use anyhow::{Result, ensure}; -use pegainfer_engine::engine::{ +use openinfer_engine::engine::{ EngineHandle, FinishReason, GenerateRequest, TokenEvent, TokenLogprob, }; use tokio::sync::mpsc; @@ -154,7 +154,7 @@ fn now_secs_f64() -> f64 { #[cfg(test)] mod tests { - use pegainfer_engine::sampler::SamplingParams; + use openinfer_engine::sampler::SamplingParams; use super::*; diff --git a/pegainfer-sim/src/main.rs b/openinfer-sim/src/main.rs similarity index 88% rename from pegainfer-sim/src/main.rs rename to openinfer-sim/src/main.rs index 694586c9..1a351050 100644 --- a/pegainfer-sim/src/main.rs +++ b/openinfer-sim/src/main.rs @@ -1,12 +1,12 @@ use anyhow::Result; use clap::Parser; -use pegainfer_sim::{SimulatedEngineConfig, start_engine}; +use openinfer_sim::{SimulatedEngineConfig, start_engine}; const DEFAULT_MODEL_ID: &str = "Qwen/Qwen3-0.6B"; #[derive(Parser, Debug)] #[command( - name = "pegainfer-sim", + name = "openinfer-sim", about = "CPU-only simulated inference server for OpenAI/vLLM serving 
benchmarks" )] struct Args { @@ -50,13 +50,13 @@ async fn main() -> Result<()> { )?; let handle = start_engine(config); - pegainfer_vllm_frontend::serve_model( + openinfer_vllm_frontend::serve_model( handle, args.model_id, Vec::new(), args.port, args.max_model_len, - pegainfer_vllm_frontend::shutdown_token_from_ctrl_c(), + openinfer_vllm_frontend::shutdown_token_from_ctrl_c(), ) .await } diff --git a/pegainfer-sim/tests/frontend_e2e.rs b/openinfer-sim/tests/frontend_e2e.rs similarity index 97% rename from pegainfer-sim/tests/frontend_e2e.rs rename to openinfer-sim/tests/frontend_e2e.rs index 4bb99a45..6d2b5a1c 100644 --- a/pegainfer-sim/tests/frontend_e2e.rs +++ b/openinfer-sim/tests/frontend_e2e.rs @@ -5,14 +5,14 @@ use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use anyhow::{Context, Result, anyhow, bail}; -use pegainfer_sim::{SimulatedEngineConfig, start_engine}; +use openinfer_sim::{SimulatedEngineConfig, start_engine}; use reqwest::Client; use serde_json::{Value, json}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; const HTTP_TIMEOUT: Duration = Duration::from_secs(10); -const MODEL_NAME: &str = "pegainfer-sim-e2e"; +const MODEL_NAME: &str = "openinfer-sim-e2e"; const SERVER_START_ATTEMPTS: usize = 5; static TEMP_MODEL_DIR_COUNTER: AtomicU64 = AtomicU64::new(0); @@ -79,7 +79,7 @@ impl SimServer { let model_path = model_dir.path.to_string_lossy().into_owned(); let mut task = tokio::spawn(async move { if enable_lora_routes { - pegainfer_vllm_frontend::serve_model_with_lora_routes( + openinfer_vllm_frontend::serve_model_with_lora_routes( engine, model_path, vec![MODEL_NAME.to_string()], @@ -90,7 +90,7 @@ impl SimServer { ) .await } else { - pegainfer_vllm_frontend::serve_model( + openinfer_vllm_frontend::serve_model( engine, model_path, vec![MODEL_NAME.to_string()], @@ -154,7 +154,7 @@ impl TempModelDir { .as_nanos(); let sequence = TEMP_MODEL_DIR_COUNTER.fetch_add(1, Ordering::Relaxed); let 
path = std::env::temp_dir().join(format!( - "pegainfer-sim-e2e-{}-{now}-{sequence}", + "openinfer-sim-e2e-{}-{now}-{sequence}", std::process::id() )); fs::create_dir(&path) @@ -425,6 +425,6 @@ const TINY_TOKENIZER_CONFIG_JSON: &str = r#"{ }"#; const TINY_CONFIG_JSON: &str = r#"{ - "model_type": "pegainfer_sim", + "model_type": "openinfer_sim", "max_position_embeddings": 128 }"#; diff --git a/pegainfer-vllm-frontend/Cargo.toml b/openinfer-vllm-frontend/Cargo.toml similarity index 88% rename from pegainfer-vllm-frontend/Cargo.toml rename to openinfer-vllm-frontend/Cargo.toml index 2718b0dd..a056b74a 100644 --- a/pegainfer-vllm-frontend/Cargo.toml +++ b/openinfer-vllm-frontend/Cargo.toml @@ -1,10 +1,10 @@ [package] -name = "pegainfer-vllm-frontend" +name = "openinfer-vllm-frontend" version = "0.1.0" edition = "2024" [dependencies] -pegainfer-engine = { workspace = true } +openinfer-engine = { workspace = true } anyhow = { workspace = true } axum = { workspace = true } log = { workspace = true } diff --git a/pegainfer-vllm-frontend/src/lib.rs b/openinfer-vllm-frontend/src/lib.rs similarity index 99% rename from pegainfer-vllm-frontend/src/lib.rs rename to openinfer-vllm-frontend/src/lib.rs index 92253da9..307a0581 100644 --- a/pegainfer-vllm-frontend/src/lib.rs +++ b/openinfer-vllm-frontend/src/lib.rs @@ -40,16 +40,16 @@ use zeromq::prelude::{Socket, SocketRecv, SocketSend}; use zeromq::util::PeerIdentity; use zeromq::{DealerSocket, PushSocket, SocketOptions, ZmqMessage}; -use pegainfer_engine::engine::{ +use openinfer_engine::engine::{ EngineControlError, EngineHandle, FinishReason, GenerateRequest, LoadLoraAdapterRequest, TokenEvent, TokenLogprob, UnloadLoraAdapterRequest, }; -use pegainfer_engine::sampler::SamplingParams; +use openinfer_engine::sampler::SamplingParams; const ENGINE_INDEX: u32 = 0; const LORA_ROUTE_BODY_LIMIT: usize = 128 * 1024 * 1024; const COMPLETION_ROUTE_BODY_LIMIT: usize = 2 * 1024 * 1024; -const LORA_ADAPTER_XARG: &str = 
"pegainfer_lora_adapter"; +const LORA_ADAPTER_XARG: &str = "openinfer_lora_adapter"; #[derive(Clone)] struct LoraRouteState { @@ -312,7 +312,7 @@ impl LocalEngineBridge { num_gpu_blocks: 0, dp_stats_address: None, dtype: ModelDtype::BFloat16, - vllm_version: "pegainfer-local-bridge".to_string(), + vllm_version: "openinfer-local-bridge".to_string(), }; input .send(ZmqMessage::from(encode_msgpack(&ready)?)) @@ -1074,7 +1074,7 @@ fn to_wire_position_logprobs( ) -> Option { let lp = logprob?; let mut entries = Vec::with_capacity(1 + lp.top_logprobs.len()); - // pegainfer-core does not currently expose the sampled token's vocab rank. + // openinfer-core does not currently expose the sampled token's vocab rank. // rank: 1 is correct for greedy sampling, where the sampled token is top-1, // and is a lossy placeholder for non-greedy sampling. // See discussion on PR #96. @@ -1163,7 +1163,7 @@ fn now_secs_f64() -> f64 { fn local_ipc_namespace() -> Result { let base_dir = - std::env::var_os("PEGAINFER_IPC_DIR").map_or_else(|| PathBuf::from("/tmp"), PathBuf::from); + std::env::var_os("OPENINFER_IPC_DIR").map_or_else(|| PathBuf::from("/tmp"), PathBuf::from); let uuid = uuid::Uuid::new_v4().to_string(); let path = base_dir.join(format!("pgi-{}-{}", std::process::id(), &uuid[..8])); std::fs::create_dir_all(&path) diff --git a/pegainfer-vllm-support/Cargo.toml b/openinfer-vllm-support/Cargo.toml similarity index 88% rename from pegainfer-vllm-support/Cargo.toml rename to openinfer-vllm-support/Cargo.toml index fedfc840..8d45d5b8 100644 --- a/pegainfer-vllm-support/Cargo.toml +++ b/openinfer-vllm-support/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "pegainfer-vllm-support" +name = "openinfer-vllm-support" version = "0.1.0" edition = "2024" diff --git a/pegainfer-vllm-support/src/lib.rs b/openinfer-vllm-support/src/lib.rs similarity index 97% rename from pegainfer-vllm-support/src/lib.rs rename to openinfer-vllm-support/src/lib.rs index 50b3e3ef..7ec4b3ac 100644 --- 
a/pegainfer-vllm-support/src/lib.rs +++ b/openinfer-vllm-support/src/lib.rs @@ -11,7 +11,7 @@ static TOKENIZER_RESOLVER_RUNTIME: OnceCell> = OnceCell::new(); pub fn load_tokenizer(model_id: &str) -> Result { if tokio::runtime::Handle::try_current().is_ok() { return Err(Error::Tokenizer( - "pegainfer_vllm_support::load_tokenizer is synchronous and cannot be called from \ + "openinfer_vllm_support::load_tokenizer is synchronous and cannot be called from \ inside an active Tokio runtime; use load_tokenizer_async instead" .to_string(), )); diff --git a/pegainfer-comm/crates/pegainfer-comm-cudart-sys/Cargo.toml b/pegainfer-comm/crates/pegainfer-comm-cudart-sys/Cargo.toml deleted file mode 100644 index 0582410d..00000000 --- a/pegainfer-comm/crates/pegainfer-comm-cudart-sys/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -edition = "2024" -links = "cudart" -name = "pegainfer-comm-cudart-sys" -publish = false - -[features] -# Internal sys-crate feature. See `pegainfer-comm-cuda-sys/Cargo.toml` for the rationale. -default = [] -system-bindings = [] - -[build-dependencies] -build-utils = { path = "../pegainfer-comm-build-utils", package = "pegainfer-comm-build-utils" } - -bindgen = { workspace = true } diff --git a/pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/Cargo.toml b/pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/Cargo.toml deleted file mode 100644 index cae1ec03..00000000 --- a/pegainfer-comm/crates/pegainfer-comm-gdrapi-sys/Cargo.toml +++ /dev/null @@ -1,15 +0,0 @@ -[package] -edition = "2024" -links = "gdrapi" -name = "pegainfer-comm-gdrapi-sys" -publish = false - -[features] -# Internal sys-crate feature. See `pegainfer-comm-cuda-sys/Cargo.toml` for the rationale. 
-default = [] -system-bindings = [] - -[build-dependencies] -build-utils = { path = "../pegainfer-comm-build-utils", package = "pegainfer-comm-build-utils" } - -bindgen = { workspace = true } diff --git a/pegainfer-core/src/engine.rs b/pegainfer-core/src/engine.rs deleted file mode 100644 index 90696ddd..00000000 --- a/pegainfer-core/src/engine.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_engine::engine::*; diff --git a/pegainfer-core/src/ffi.rs b/pegainfer-core/src/ffi.rs deleted file mode 100644 index f27b8f87..00000000 --- a/pegainfer-core/src/ffi.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_kernels::ffi::*; diff --git a/pegainfer-core/src/parallel.rs b/pegainfer-core/src/parallel.rs deleted file mode 100644 index 48f5b5a5..00000000 --- a/pegainfer-core/src/parallel.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_engine::parallel::*; diff --git a/pegainfer-core/src/sampler.rs b/pegainfer-core/src/sampler.rs deleted file mode 100644 index ba135aa7..00000000 --- a/pegainfer-core/src/sampler.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_engine::sampler::*; diff --git a/pegainfer-core/src/tensor.rs b/pegainfer-core/src/tensor.rs deleted file mode 100644 index 005d5f4d..00000000 --- a/pegainfer-core/src/tensor.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_kernels::tensor::*; diff --git a/pegainfer-qwen35-4b/src/ffi.rs b/pegainfer-qwen35-4b/src/ffi.rs deleted file mode 100644 index a455b65b..00000000 --- a/pegainfer-qwen35-4b/src/ffi.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) use pegainfer_core::ffi::*; diff --git a/pegainfer-server/src/sampler.rs b/pegainfer-server/src/sampler.rs deleted file mode 100644 index 40258a5a..00000000 --- a/pegainfer-server/src/sampler.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_core::sampler::*; diff --git a/pegainfer-server/src/tensor.rs b/pegainfer-server/src/tensor.rs deleted file mode 100644 index fdeb269f..00000000 --- a/pegainfer-server/src/tensor.rs +++ /dev/null @@ -1 +0,0 @@ -pub use 
pegainfer_core::tensor::*; diff --git a/pegainfer-server/src/vllm_frontend.rs b/pegainfer-server/src/vllm_frontend.rs deleted file mode 100644 index be7607f0..00000000 --- a/pegainfer-server/src/vllm_frontend.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_vllm_frontend::*; diff --git a/pegainfer-server/src/weight_loader.rs b/pegainfer-server/src/weight_loader.rs deleted file mode 100644 index 172dc061..00000000 --- a/pegainfer-server/src/weight_loader.rs +++ /dev/null @@ -1 +0,0 @@ -pub use pegainfer_core::weight_loader::*; diff --git a/scripts/bench_http_serving.py b/scripts/bench_http_serving.py index d46b3e04..1b3b27d0 100755 --- a/scripts/bench_http_serving.py +++ b/scripts/bench_http_serving.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""OpenAI-compatible HTTP serving benchmark for pegainfer. +"""OpenAI-compatible HTTP serving benchmark for openinfer. The harness intentionally talks to /v1/completions over HTTP instead of using the in-process bench_serving binary. It records streaming TTFT/ITL/TPOT, @@ -113,7 +113,7 @@ def summarize_trace_ms(measured: list[RequestResult]) -> dict[str, Any]: if isinstance(result.server_trace.get("decode_batch_size_max"), int) ] return { - "source": "server log lines matching `pegainfer_http_trace`; frontend_to_queue includes HTTP ingress, tokenization, and vLLM submit before engine queue", + "source": "server log lines matching `openinfer_http_trace`; frontend_to_queue includes HTTP ingress, tokenization, and vLLM submit before engine queue", "traced_requests": len(traced), "missing_traces": [result.request_id for result in measured if result.server_trace is None], "phases_ms": phase_summary, @@ -364,7 +364,7 @@ def failed_result( ) -TRACE_RE = re.compile(r"pegainfer_http_trace\s+(\{.*\})") +TRACE_RE = re.compile(r"openinfer_http_trace\s+(\{.*\})") STREAM_ERROR_RE = re.compile(r'request failed .*self\.request_id="([^"]+)"') @@ -475,7 +475,7 @@ def run_batch(args: argparse.Namespace, measured: bool) -> tuple[list[RequestRes 
pool.submit( request_once, idx, - f"pegainfer-bench-{label}-{offset + idx}", + f"openinfer-bench-{label}-{offset + idx}", url, args.model, prompt_words, @@ -596,7 +596,7 @@ def main() -> None: parser.add_argument( "--server-log", type=Path, - help="Optional pegainfer server log containing pegainfer_http_trace lines for TTFT phase attribution.", + help="Optional openinfer server log containing openinfer_http_trace lines for TTFT phase attribution.", ) parser.add_argument("--out", type=Path) args = parser.parse_args() diff --git a/scripts/e2e_eos_stop.py b/scripts/e2e_eos_stop.py index c6cfa313..08b27b33 100644 --- a/scripts/e2e_eos_stop.py +++ b/scripts/e2e_eos_stop.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""E2E check that a serving pegainfer engine stops at EOS (issue #238). +"""E2E check that a serving openinfer engine stops at EOS (issue #238). Sends the same prompt twice through `/v1/completions` on an already-running server: diff --git a/scripts/eval_gsm8k_thinking.py b/scripts/eval_gsm8k_thinking.py index b5872206..03a0c698 100644 --- a/scripts/eval_gsm8k_thinking.py +++ b/scripts/eval_gsm8k_thinking.py @@ -1,5 +1,5 @@ """ -GSM8K evaluation for thinking models (Qwen3.5) via pegainfer /v1/completions API. +GSM8K evaluation for thinking models (Qwen3.5) via openinfer /v1/completions API. lm-eval's local-completions backend applies stop sequences during generation, which causes "Question:" inside blocks to prematurely truncate output. diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py index 4158cc18..968d078b 100644 --- a/scripts/generate_test_data.py +++ b/scripts/generate_test_data.py @@ -3,7 +3,7 @@ This script loads a model via HF Transformers and generates greedy (do_sample=False) outputs for a set of test prompts. The results are saved as a single JSON file -that drives pegainfer's e2e tests. +that drives openinfer's e2e tests. 
Usage: python scripts/generate_test_data.py --model models/Qwen3-4B --name Qwen3-4B diff --git a/scripts/run_snapshot_benchmark.sh b/scripts/run_snapshot_benchmark.sh index 07b4994a..384087de 100755 --- a/scripts/run_snapshot_benchmark.sh +++ b/scripts/run_snapshot_benchmark.sh @@ -61,7 +61,7 @@ if ! command -v uv >/dev/null 2>&1; then fi export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH" -if [[ -f .gitmodules ]] && [[ ! -f pegainfer-kernels/third_party/flashinfer/include/flashinfer/norm.cuh ]]; then +if [[ -f .gitmodules ]] && [[ ! -f openinfer-kernels/third_party/flashinfer/include/flashinfer/norm.cuh ]]; then git submodule update --init --recursive fi @@ -80,10 +80,10 @@ fi export CUDA_HOME export LD_LIBRARY_PATH="$CUDA_HOME/lib64:${LD_LIBRARY_PATH:-}" -export PEGAINFER_TRITON_PYTHON="$PWD/.venv/bin/python" -if [[ -z "${PEGAINFER_CUDA_SM:-}" ]]; then - PEGAINFER_CUDA_SM="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1 | tr -d '.')" - export PEGAINFER_CUDA_SM +export OPENINFER_TRITON_PYTHON="$PWD/.venv/bin/python" +if [[ -z "${OPENINFER_CUDA_SM:-}" ]]; then + OPENINFER_CUDA_SM="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n1 | tr -d '.')" + export OPENINFER_CUDA_SM fi cargo build --release --bin bench_serving diff --git a/tests/test_bench_http_serving.py b/tests/test_bench_http_serving.py index c8ef46be..2cd5747f 100644 --- a/tests/test_bench_http_serving.py +++ b/tests/test_bench_http_serving.py @@ -140,7 +140,7 @@ def test_server_trace_log_is_attached_by_vllm_completion_id_prefix(self) -> None text_prefix="text", ) line = ( - 'INFO pegainfer_http_trace {"request_id":"cmpl-bench-1-generated",' + 'INFO openinfer_http_trace {"request_id":"cmpl-bench-1-generated",' '"queued_at_unix_s":100.01,"scheduled_at_unix_s":100.03,' '"first_token_emit_unix_s":100.20,"prefill_ms":170.0,' '"first_decode_ms":28.0}\\n' @@ -222,7 +222,7 @@ def test_server_trace_zero_completion_tokens_marks_request_failed(self) -> None: 
text_prefix="text", ) line = ( - 'INFO pegainfer_http_trace {"request_id":"cmpl-bench-0-generated",' + 'INFO openinfer_http_trace {"request_id":"cmpl-bench-0-generated",' '"completion_tokens":0}\\n' ) with tempfile.TemporaryDirectory() as tmp: diff --git a/tools/accuracy/compare_dsv2_lite_ep2_outputs.py b/tools/accuracy/compare_dsv2_lite_ep2_outputs.py index b457f754..54591a16 100755 --- a/tools/accuracy/compare_dsv2_lite_ep2_outputs.py +++ b/tools/accuracy/compare_dsv2_lite_ep2_outputs.py @@ -148,7 +148,7 @@ def classify(pairs: dict[str, dict[str, Any]]) -> str: return "all_token_text_exact" if not host_nccl_exact: return "nccl_transport_regression" - return "pegainfer_baseline_accuracy_gap" + return "openinfer_baseline_accuracy_gap" def short(text: str, width: int = 72) -> str: @@ -253,8 +253,8 @@ def context_warnings(hf: Output, host: Output, nccl: Output) -> list[str]: def main() -> int: parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--hf", required=True, help="HF JSON output") - parser.add_argument("--host-staged", required=True, help="host-staged pegainfer JSON output") - parser.add_argument("--nccl", required=True, help="NCCL pegainfer JSON output") + parser.add_argument("--host-staged", required=True, help="host-staged openinfer JSON output") + parser.add_argument("--nccl", required=True, help="NCCL openinfer JSON output") parser.add_argument("--out", help="Optional path for structured comparison JSON") parser.add_argument( "--require-all-exact", diff --git a/tools/accuracy/dump_kimi_k2_vllm_golden.py b/tools/accuracy/dump_kimi_k2_vllm_golden.py index ef97afa3..950421c8 100644 --- a/tools/accuracy/dump_kimi_k2_vllm_golden.py +++ b/tools/accuracy/dump_kimi_k2_vllm_golden.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Generate the vLLM golden fixture for the Kimi-K2 accuracy gate. 
-The gate (`pegainfer-kimi-k2/tests/vllm_golden_gate.rs`) compares pegainfer's +The gate (`openinfer-kimi-k2/tests/vllm_golden_gate.rs`) compares openinfer's greedy decisions against vLLM *without* running vLLM at test time and *without* binding to one engine's exact bit pattern. So we precompute, once, on the serving hardware: @@ -17,9 +17,9 @@ different numerical regime, and needs a fragile trust_remote_code + stubbed vision tower load. -The Rust gate replays the same sequences through pegainfer two ways: +The Rust gate replays the same sequences through openinfer two ways: * teacher-forced argmax sweep — prefill `prompt + tail[..i]`, max_tokens=1, - per position i: pegainfer's pick must sit within a logprob tie tolerance + per position i: openinfer's pick must sit within a logprob tie tolerance of vLLM's own argmax (in vLLM's logprobs — the "regret" check); * free-greedy decode parity — generate D tokens and compare against the tail, classifying any first divergence as benign tie vs real bug using @@ -28,7 +28,7 @@ Output is safetensors, not JSON: machine-only numeric data, nobody reads it, and the binary layout is ~3.5x smaller (same convention as the Qwen goldens). 
-Run on a host with 8 GPUs and the vLLM venv (the gate's pegainfer run needs +Run on a host with 8 GPUs and the vLLM venv (the gate's openinfer run needs the same GPUs, so generation and gating are sequential on one box): .venv/bin/python tools/accuracy/dump_kimi_k2_vllm_golden.py \ @@ -94,7 +94,7 @@ ("list", "Top five most spoken languages in the world:\n1."), ( "json", - '{"name": "pegainfer", "language": "Rust", "purpose":', + '{"name": "openinfer", "language": "Rust", "purpose":', ), ( "translation", diff --git a/tools/accuracy/dump_qwen35_4b_hf_golden.py b/tools/accuracy/dump_qwen35_4b_hf_golden.py index eabb2414..62a945bb 100644 --- a/tools/accuracy/dump_qwen35_4b_hf_golden.py +++ b/tools/accuracy/dump_qwen35_4b_hf_golden.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Generate the HuggingFace bf16 logprob golden for the Qwen3.5-4B gate. -The Rust gate replays these fixed token sequences through pegainfer with +The Rust gate replays these fixed token sequences through openinfer with teacher-forced decode and compares top-K logprobs against this stored HF oracle. For Qwen3.5 the HF oracle follows the same incremental shape: prefill the prompt with `use_cache=True`, then feed one fixed decode token at a time through diff --git a/tools/accuracy/dump_qwen3_4b_hf_golden.py b/tools/accuracy/dump_qwen3_4b_hf_golden.py index f098de7e..2f3146fa 100644 --- a/tools/accuracy/dump_qwen3_4b_hf_golden.py +++ b/tools/accuracy/dump_qwen3_4b_hf_golden.py @@ -1,19 +1,19 @@ #!/usr/bin/env python3 """Generate the HuggingFace bf16 logprob golden for the Qwen3-4B logits gate. -The gate (`pegainfer-qwen3-4b/tests/hf_golden_gate.rs`) compares pegainfer's +The gate (`openinfer-qwen3-4b/tests/hf_golden_gate.rs`) compares openinfer's logprobs against HF *without* running HF at test time and *without* binding to one GPU's exact bit pattern. 
So we precompute, once, on the GPU: * a seed-pinned set of fixed token sequences (`prompt + teacher-forced tail`), * HF's top-K next-token logprobs at every evaluated position. -The Rust gate replays the *same fixed sequences* through pegainfer (prefill + +The Rust gate replays the *same fixed sequences* through openinfer (prefill + teacher-forced decode) and asserts its logprobs land within a bf16 tolerance of this golden — argmax must match HF wherever HF has a clear (> a few ULP) winner, logprobs within the bf16 noise floor. -bf16 (not fp32) on purpose: it is the same precision regime as pegainfer, so the +bf16 (not fp32) on purpose: it is the same precision regime as openinfer, so the comparison is apples-to-apples, and it runs on the GPU — `device_map=auto` scales the same script to the large models. fp32 only mattered for the one-time tie *adjudication* (compare_qwen3_4b_hf_logprobs.py --dtype float32); the gate's diff --git a/tools/lora/qwen3_lora_live_parity.py b/tools/lora/qwen3_lora_live_parity.py index 233cdcf8..1a31a4b1 100644 --- a/tools/lora/qwen3_lora_live_parity.py +++ b/tools/lora/qwen3_lora_live_parity.py @@ -3,7 +3,7 @@ The script creates a deterministic PEFT-style adapter, obtains the greedy reference text from transformers+peft, loads the same adapter through -PegaInfer's live /v1/load_lora_adapter route, and compares /v1/completions. +OpenInfer's live /v1/load_lora_adapter route, and compares /v1/completions. 
""" from __future__ import annotations @@ -167,41 +167,41 @@ def encode_generated_text(model_path: Path, text: str) -> list[int]: def first_token_mismatch( hf_token_ids: list[int], - pegainfer_token_ids: list[int], + openinfer_token_ids: list[int], model_path: Path, ) -> dict | None: - if hf_token_ids == pegainfer_token_ids: + if hf_token_ids == openinfer_token_ids: return None from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - for index, (hf_token_id, pegainfer_token_id) in enumerate( - zip(hf_token_ids, pegainfer_token_ids), + for index, (hf_token_id, openinfer_token_id) in enumerate( + zip(hf_token_ids, openinfer_token_ids), start=1, ): - if hf_token_id != pegainfer_token_id: + if hf_token_id != openinfer_token_id: return { "index_1based": index, "hf_token_id": hf_token_id, - "pegainfer_token_id": pegainfer_token_id, + "openinfer_token_id": openinfer_token_id, "hf_piece": tokenizer.decode([hf_token_id]), - "pegainfer_piece": tokenizer.decode([pegainfer_token_id]), + "openinfer_piece": tokenizer.decode([openinfer_token_id]), } return { - "index_1based": min(len(hf_token_ids), len(pegainfer_token_ids)) + 1, - "hf_token_id": hf_token_ids[len(pegainfer_token_ids)] - if len(hf_token_ids) > len(pegainfer_token_ids) + "index_1based": min(len(hf_token_ids), len(openinfer_token_ids)) + 1, + "hf_token_id": hf_token_ids[len(openinfer_token_ids)] + if len(hf_token_ids) > len(openinfer_token_ids) else None, - "pegainfer_token_id": pegainfer_token_ids[len(hf_token_ids)] - if len(pegainfer_token_ids) > len(hf_token_ids) + "openinfer_token_id": openinfer_token_ids[len(hf_token_ids)] + if len(openinfer_token_ids) > len(hf_token_ids) else None, - "hf_piece": tokenizer.decode([hf_token_ids[len(pegainfer_token_ids)]]) - if len(hf_token_ids) > len(pegainfer_token_ids) + "hf_piece": tokenizer.decode([hf_token_ids[len(openinfer_token_ids)]]) + if len(hf_token_ids) > len(openinfer_token_ids) else None, - 
"pegainfer_piece": tokenizer.decode([pegainfer_token_ids[len(hf_token_ids)]]) - if len(pegainfer_token_ids) > len(hf_token_ids) + "openinfer_piece": tokenizer.decode([openinfer_token_ids[len(hf_token_ids)]]) + if len(openinfer_token_ids) > len(hf_token_ids) else None, } @@ -243,7 +243,7 @@ def wait_for_health(server_url: str, timeout_s: float, process: subprocess.Popen def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: env = os.environ.copy() - env.setdefault("PEGAINFER_CUDA_SM", "80") + env.setdefault("OPENINFER_CUDA_SM", "80") compat = "/usr/local/cuda-12.9/compat" if Path(compat).exists(): old = env.get("LD_LIBRARY_PATH") @@ -253,7 +253,7 @@ def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: "run", "--release", "-p", - "pegainfer-server", + "openinfer-server", "--", "--model-path", args.model_path, @@ -264,7 +264,7 @@ def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: str(args.port), ] log = tempfile.NamedTemporaryFile( - prefix="pegainfer-qwen3-lora-server-", + prefix="openinfer-qwen3-lora-server-", suffix=".log", mode="w+", delete=False, @@ -278,7 +278,7 @@ def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: text=True, start_new_session=True, ) - process.pegainfer_log_path = log.name # type: ignore[attr-defined] + process.openinfer_log_path = log.name # type: ignore[attr-defined] print(f"server_log={log.name}", file=sys.stderr) log.close() return process @@ -298,7 +298,7 @@ def stop_server(process: subprocess.Popen | None) -> None: def tail_server_output(process: subprocess.Popen | None) -> str: if process is None: return "" - log_path = getattr(process, "pegainfer_log_path", None) + log_path = getattr(process, "openinfer_log_path", None) if not log_path: return "" with contextlib.suppress(Exception): @@ -306,7 +306,7 @@ def tail_server_output(process: subprocess.Popen | None) -> str: return "" -def pegainfer_completion( +def 
openinfer_completion( server_url: str, model_name: str, prompt: str, @@ -335,7 +335,7 @@ def main() -> int: adapter_path.mkdir(parents=True, exist_ok=True) cleanup = contextlib.nullcontext(adapter_path) else: - cleanup = tempfile.TemporaryDirectory(prefix="pegainfer-qwen3-lora-parity-") + cleanup = tempfile.TemporaryDirectory(prefix="openinfer-qwen3-lora-parity-") process = None with cleanup as adapter_dir: @@ -359,7 +359,7 @@ def main() -> int: f"{server_url}/v1/load_lora_adapter", {"lora_name": args.lora_name, "lora_path": str(adapter_path)}, ) - completion = pegainfer_completion( + completion = openinfer_completion( server_url, model_name=args.lora_name, prompt=args.prompt, @@ -374,9 +374,9 @@ def main() -> int: choices = completion.get("choices", []) if not choices: raise RuntimeError(f"completion response has no choices: {completion}") - pegainfer_text = choices[0].get("text", "") - pegainfer_token_ids = encode_generated_text(model_path, pegainfer_text) - mismatch = first_token_mismatch(hf["token_ids"], pegainfer_token_ids, model_path) + openinfer_text = choices[0].get("text", "") + openinfer_token_ids = encode_generated_text(model_path, openinfer_text) + mismatch = first_token_mismatch(hf["token_ids"], openinfer_token_ids, model_path) summary = { "adapter_path": str(adapter_path), "hf_text": hf["text"], @@ -384,14 +384,14 @@ def main() -> int: "hf_logit_max_abs_diff_vs_base": hf["logit_max_abs_diff_vs_base"], "peft_autocast_adapter_dtype": peft_autocast_adapter_dtype, "load_response": load_response, - "pegainfer_text": pegainfer_text, - "pegainfer_token_ids": pegainfer_token_ids, + "openinfer_text": openinfer_text, + "openinfer_token_ids": openinfer_token_ids, "first_token_mismatch": mismatch, - "match": pegainfer_text == hf["text"], + "match": openinfer_text == hf["text"], } print(json.dumps(summary, indent=2, ensure_ascii=False)) - if pegainfer_text != hf["text"]: + if openinfer_text != hf["text"]: print(tail_server_output(process), file=sys.stderr) return 
1 if hf["logit_max_abs_diff_vs_base"] == 0.0: diff --git a/tools/lora/qwen3_lora_live_stress.py b/tools/lora/qwen3_lora_live_stress.py index b1f0e0da..5c5123b9 100644 --- a/tools/lora/qwen3_lora_live_stress.py +++ b/tools/lora/qwen3_lora_live_stress.py @@ -155,7 +155,7 @@ def wait_for_health(server_url: str, timeout_s: float, process: subprocess.Popen def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: assert_port_available(args.port) env = os.environ.copy() - env.setdefault("PEGAINFER_CUDA_SM", "80") + env.setdefault("OPENINFER_CUDA_SM", "80") compat = "/usr/local/cuda-12.9/compat" if Path(compat).exists(): old = env.get("LD_LIBRARY_PATH") @@ -165,7 +165,7 @@ def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: "run", "--release", "-p", - "pegainfer-server", + "openinfer-server", "--", "--model-path", args.model_path, @@ -180,7 +180,7 @@ def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: str(args.port), ] log = tempfile.NamedTemporaryFile( - prefix="pegainfer-qwen3-lora-stress-server-", + prefix="openinfer-qwen3-lora-stress-server-", suffix=".log", mode="w+", delete=False, @@ -194,7 +194,7 @@ def start_server(args: argparse.Namespace, repo_root: Path) -> subprocess.Popen: text=True, start_new_session=True, ) - process.pegainfer_log_path = log.name # type: ignore[attr-defined] + process.openinfer_log_path = log.name # type: ignore[attr-defined] print(f"server_log={log.name}", file=sys.stderr) log.close() return process @@ -223,7 +223,7 @@ def stop_server(process: subprocess.Popen | None) -> None: def tail_server_output(process: subprocess.Popen | None) -> str: if process is None: return "" - log_path = getattr(process, "pegainfer_log_path", None) + log_path = getattr(process, "openinfer_log_path", None) if not log_path: return "" with contextlib.suppress(Exception): @@ -289,7 +289,7 @@ def main() -> int: server_url = args.server_url or f"http://127.0.0.1:{args.port}" process = 
None - with tempfile.TemporaryDirectory(prefix="pegainfer-qwen3-lora-stress-") as tmp: + with tempfile.TemporaryDirectory(prefix="openinfer-qwen3-lora-stress-") as tmp: root = Path(tmp) adapters = { "stress-a": root / "stress-a",