From 388c3e24eccaeb9a6ff9c8cdc32d2fcaa1163b68 Mon Sep 17 00:00:00 2001
From: yuanzhg078 <939526371@qq.com>
Date: Thu, 2 Apr 2026 12:42:37 +0800
Subject: [PATCH] [doc] Update quickstart and calculator config
---
docker/Dockerfile.vllm_npu | 3 --
docs/source/_static/kv_cache_calculator.html | 38 ++++++++++---------
.../source/getting-started/quickstart_vllm.md | 5 +--
.../getting-started/quickstart_vllm_ascend.md | 4 +-
4 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/docker/Dockerfile.vllm_npu b/docker/Dockerfile.vllm_npu
index 9498b2ec5..496072b1f 100644
--- a/docker/Dockerfile.vllm_npu
+++ b/docker/Dockerfile.vllm_npu
@@ -5,9 +5,6 @@ FROM ${BASE_IMAGE}
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
-# Apply the UCM monkey patch for vllm & vllm_ascend
-ENV ENABLE_UCM_PATCH=1
-
WORKDIR /workspace
# Install unified-cache-management
diff --git a/docs/source/_static/kv_cache_calculator.html b/docs/source/_static/kv_cache_calculator.html
index 2011ce61e..1d1fe3d2f 100644
--- a/docs/source/_static/kv_cache_calculator.html
+++ b/docs/source/_static/kv_cache_calculator.html
@@ -1052,19 +1052,28 @@
"num_key_value_heads": 8
},
// GLM Models (Zhipu AI)
- "zai-org/GLM-4.6": {
+ "zai-org/GLM-4.5":{
+ "hidden_size": 6144,
+ "num_attention_heads": 64,
+ "num_hidden_layers": 78,
+ "num_key_value_heads": 64,
+ "kv_lora_rank": 512,
+ "qk_rope_head_dim": 64
+ },
+ "zai-org/GLM-4.7":{
"hidden_size": 5120,
"num_attention_heads": 96,
- "num_hidden_layers": 62,
- "num_key_value_heads": 4
+ "num_hidden_layers": 92,
+ "num_key_value_heads": 8,
+ "head_dim": 128
},
- "zai-org/GLM-4.7": {
+ "zai-org/GLM-4.6":{
"hidden_size": 5120,
"num_attention_heads": 96,
- "num_hidden_layers": 62,
- "num_key_value_heads": 4
+ "num_hidden_layers": 92,
+ "num_key_value_heads": 8,
+ "head_dim": 128
},
-
// Kimi Models (Moonshot AI)
"moonshotai/Kimi-K2-Instruct-0905": {
"hidden_size": 7168,
@@ -1075,18 +1084,13 @@
"qk_rope_head_dim": 64
},
// MiniMax Models
- "MiniMaxAI/MiniMax-M2": {
- "hidden_size": 5632,
- "num_attention_heads": 44,
+ "MiniMaxAI/MiniMax-M2.5": {
+ "hidden_size": 3072,
+ "num_attention_heads": 48,
"num_hidden_layers": 62,
- "num_key_value_heads": 4
+ "num_key_value_heads": 8,
+ "head_dim": 128
},
- "MiniMaxAI/MiniMax-M2.1": {
- "hidden_size": 5632,
- "num_attention_heads": 44,
- "num_hidden_layers": 62,
- "num_key_value_heads": 4
- }
};
}
diff --git a/docs/source/getting-started/quickstart_vllm.md b/docs/source/getting-started/quickstart_vllm.md
index 2dbf6f245..8ded760bb 100644
--- a/docs/source/getting-started/quickstart_vllm.md
+++ b/docs/source/getting-started/quickstart_vllm.md
@@ -49,10 +49,9 @@ docker build -t ucm-vllm-sparse:latest -f ./docker/Dockerfile.vllm_gpu_v0110 ./
1. Prepare vLLM Environment
For the sake of environment isolation and simplicity, we recommend preparing the vLLM environment by pulling the official, pre-built vLLM Docker image.
- > Note: v0.11.0 is newly supported (replace the tag with v0.11.0 if needed).
```bash
- docker pull vllm/vllm-openai:v0.11.0
+   docker pull vllm/vllm-openai:<your-vllm-version>
```
Use the following command to run your own container:
```bash
@@ -65,7 +64,7 @@ docker build -t ucm-vllm-sparse:latest -f ./docker/Dockerfile.vllm_gpu_v0110 ./
-v :/home/storage \
--entrypoint /bin/bash \
--name \
- -it vllm/vllm-openai:v0.9.2
+   -it vllm/vllm-openai:<your-vllm-version>
```
Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container.
diff --git a/docs/source/getting-started/quickstart_vllm_ascend.md b/docs/source/getting-started/quickstart_vllm_ascend.md
index 50daf53db..0fa34688e 100644
--- a/docs/source/getting-started/quickstart_vllm_ascend.md
+++ b/docs/source/getting-started/quickstart_vllm_ascend.md
@@ -25,7 +25,7 @@ cd ..
>**Note:** For the Atlas A3 series, the `PLATFORM` variable should be set to `ascend-a3`.
-2、Apply vLLM and vLLM-Ascend Integration Patches (Required)
+2、Apply vLLM and vLLM-Ascend Integration Patches (Not required for versions >= v0.17.0rc1)
To enable Unified Cache Management (UCM) integration, you need to apply patches to both vLLM and vLLM-Ascend source trees.
#### Option A: Monkey Patch (Recommended)
@@ -38,7 +38,7 @@ export ENABLE_UCM_PATCH=1
```
>**Note:** Enabling ENABLE_UCM_PATCH is required to use the Prefix Caching feature with UCM.
-2. Enable Sparse Attention (Optional):
+2. Enable Sparse Attention (supported on v0.11.0):
```bash
export ENABLE_SPARSE=1
```