From 388c3e24eccaeb9a6ff9c8cdc32d2fcaa1163b68 Mon Sep 17 00:00:00 2001 From: yuanzhg078 <939526371@qq.com> Date: Thu, 2 Apr 2026 12:42:37 +0800 Subject: [PATCH] [doc] Update quickstart and calculator config --- docker/Dockerfile.vllm_npu | 3 -- docs/source/_static/kv_cache_calculator.html | 38 ++++++++++--------- .../source/getting-started/quickstart_vllm.md | 5 +-- .../getting-started/quickstart_vllm_ascend.md | 4 +- 4 files changed, 25 insertions(+), 25 deletions(-) diff --git a/docker/Dockerfile.vllm_npu b/docker/Dockerfile.vllm_npu index 9498b2ec5..496072b1f 100644 --- a/docker/Dockerfile.vllm_npu +++ b/docker/Dockerfile.vllm_npu @@ -5,9 +5,6 @@ FROM ${BASE_IMAGE} ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" -# Apply the UCM monkey patch for vllm & vllm_ascend -ENV ENABLE_UCM_PATCH=1 - WORKDIR /workspace # Install unified-cache-management diff --git a/docs/source/_static/kv_cache_calculator.html b/docs/source/_static/kv_cache_calculator.html index 2011ce61e..1d1fe3d2f 100644 --- a/docs/source/_static/kv_cache_calculator.html +++ b/docs/source/_static/kv_cache_calculator.html @@ -1052,19 +1052,28 @@

"num_key_value_heads": 8 }, // GLM Models (Zhipu AI) - "zai-org/GLM-4.6": { + "zai-org/GLM-4.5":{ + "hidden_size": 6144, + "num_attention_heads": 64, + "num_hidden_layers": 78, + "num_key_value_heads": 64, + "kv_lora_rank": 512, + "qk_rope_head_dim": 64 + }, + "zai-org/GLM-4.7":{ "hidden_size": 5120, "num_attention_heads": 96, - "num_hidden_layers": 62, - "num_key_value_heads": 4 + "num_hidden_layers": 92, + "num_key_value_heads": 8, + "head_dim": 128 }, - "zai-org/GLM-4.7": { + "zai-org/GLM-4.6":{ "hidden_size": 5120, "num_attention_heads": 96, - "num_hidden_layers": 62, - "num_key_value_heads": 4 + "num_hidden_layers": 92, + "num_key_value_heads": 8, + "head_dim": 128 }, - // Kimi Models (Moonshot AI) "moonshotai/Kimi-K2-Instruct-0905": { "hidden_size": 7168, @@ -1075,18 +1084,13 @@

"qk_rope_head_dim": 64 }, // MiniMax Models - "MiniMaxAI/MiniMax-M2": { - "hidden_size": 5632, - "num_attention_heads": 44, + "MiniMaxAI/MiniMax-M2.5": { + "hidden_size": 3072, + "num_attention_heads": 48, "num_hidden_layers": 62, - "num_key_value_heads": 4 + "num_key_value_heads": 8, + "head_dim": 128 }, - "MiniMaxAI/MiniMax-M2.1": { - "hidden_size": 5632, - "num_attention_heads": 44, - "num_hidden_layers": 62, - "num_key_value_heads": 4 - } }; } diff --git a/docs/source/getting-started/quickstart_vllm.md b/docs/source/getting-started/quickstart_vllm.md index 2dbf6f245..8ded760bb 100644 --- a/docs/source/getting-started/quickstart_vllm.md +++ b/docs/source/getting-started/quickstart_vllm.md @@ -49,10 +49,9 @@ docker build -t ucm-vllm-sparse:latest -f ./docker/Dockerfile.vllm_gpu_v0110 ./ 1. Prepare vLLM Environment For the sake of environment isolation and simplicity, we recommend preparing the vLLM environment by pulling the official, pre-built vLLM Docker image. - > Note: v0.11.0 is newly supported (replace the tag with v0.11.0 if needed). ```bash - docker pull vllm/vllm-openai:v0.11.0 + docker pull vllm/vllm-openai: ``` Use the following command to run your own container: ```bash @@ -65,7 +64,7 @@ docker build -t ucm-vllm-sparse:latest -f ./docker/Dockerfile.vllm_gpu_v0110 ./ -v :/home/storage \ --entrypoint /bin/bash \ --name \ - -it vllm/vllm-openai:v0.9.2 + -it vllm/vllm-openai: ``` Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container. diff --git a/docs/source/getting-started/quickstart_vllm_ascend.md b/docs/source/getting-started/quickstart_vllm_ascend.md index 50daf53db..0fa34688e 100644 --- a/docs/source/getting-started/quickstart_vllm_ascend.md +++ b/docs/source/getting-started/quickstart_vllm_ascend.md @@ -25,7 +25,7 @@ cd .. >**Note:** For the Atlas A3 series, the `PLATFORM` variable should be set to `ascend-a3`. 
-2、Apply vLLM and vLLM-Ascend Integration Patches (Required) +2. Apply vLLM and vLLM-Ascend Integration Patches (Not required for versions >= v0.17.0rc1) To enable Unified Cache Management (UCM) integration, you need to apply patches to both vLLM and vLLM-Ascend source trees. #### Option A: Monkey Patch (Recommended) @@ -38,7 +38,7 @@ export ENABLE_UCM_PATCH=1 ``` >**Note:** Enabling ENABLE_UCM_PATCH is required to use the Prefix Caching feature with UCM. -2. Enable Sparse Attention (Optional): +2. Enable Sparse Attention (supported on v0.11.0): ```bash export ENABLE_SPARSE=1 ```