Merged
3 changes: 0 additions & 3 deletions docker/Dockerfile.vllm_npu

```diff
@@ -5,9 +5,6 @@ FROM ${BASE_IMAGE}
 
 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
 
-# Apply the UCM monkey patch for vllm & vllm_ascend
-ENV ENABLE_UCM_PATCH=1
-
 WORKDIR /workspace
 
 # Install unified-cache-management
```
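With the `ENV` line removed, the image no longer forces the UCM monkey patch on; the reasonable reading is that the flag becomes opt-in per container. A minimal sketch under that assumption (the `docker run` invocation is hypothetical):

```shell
# The patch flag is now passed at run time rather than baked into the image:
#   docker run -e ENABLE_UCM_PATCH=1 <image> ...   (hypothetical invocation)
# Code inside the container can then gate on the flag, defaulting to off:
if [ "${ENABLE_UCM_PATCH:-0}" = "1" ]; then
  echo "UCM patch enabled"
else
  echo "UCM patch disabled"
fi
```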
38 changes: 21 additions & 17 deletions docs/source/_static/kv_cache_calculator.html

```diff
@@ -1052,19 +1052,28 @@ <h2 class="section-title">
         "num_key_value_heads": 8
     },
     // GLM Models (Zhipu AI)
-    "zai-org/GLM-4.6": {
+    "zai-org/GLM-4.5":{
+        "hidden_size": 6144,
+        "num_attention_heads": 64,
+        "num_hidden_layers": 78,
+        "num_key_value_heads": 64,
+        "kv_lora_rank": 512,
+        "qk_rope_head_dim": 64
+    },
+    "zai-org/GLM-4.7":{
         "hidden_size": 5120,
         "num_attention_heads": 96,
-        "num_hidden_layers": 62,
-        "num_key_value_heads": 4
+        "num_hidden_layers": 92,
+        "num_key_value_heads": 8,
+        "head_dim": 128
     },
-    "zai-org/GLM-4.7": {
+    "zai-org/GLM-4.6":{
         "hidden_size": 5120,
         "num_attention_heads": 96,
-        "num_hidden_layers": 62,
-        "num_key_value_heads": 4
+        "num_hidden_layers": 92,
+        "num_key_value_heads": 8,
+        "head_dim": 128
     },
-
     // Kimi Models (Moonshot AI)
     "moonshotai/Kimi-K2-Instruct-0905": {
         "hidden_size": 7168,
@@ -1075,18 +1084,13 @@
         "qk_rope_head_dim": 64
     },
     // MiniMax Models
-    "MiniMaxAI/MiniMax-M2": {
-        "hidden_size": 5632,
-        "num_attention_heads": 44,
+    "MiniMaxAI/MiniMax-M2.5": {
+        "hidden_size": 3072,
+        "num_attention_heads": 48,
         "num_hidden_layers": 62,
-        "num_key_value_heads": 4
-    },
-    "MiniMaxAI/MiniMax-M2.1": {
-        "hidden_size": 5632,
-        "num_attention_heads": 44,
-        "num_hidden_layers": 62,
-        "num_key_value_heads": 4
+        "num_key_value_heads": 8,
+        "head_dim": 128
     }
 };
 }
 
```
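The calculator's exact formula lives elsewhere in this HTML file, but the standard per-token KV-cache arithmetic over these config fields can be sketched in shell. The function names are hypothetical; the numbers below use the updated GLM-4.6 config (92 layers, 8 KV heads, head_dim 128) and the GLM-4.5 MLA-style fields (kv_lora_rank 512, qk_rope_head_dim 64), both at fp16 (2 bytes):

```shell
# Per-token KV-cache size for grouped-query attention: K and V tensors,
# one pair per layer, each num_key_value_heads * head_dim wide.
kv_bytes_per_token() {  # args: layers kv_heads head_dim dtype_bytes
  echo $(( 2 * $1 * $2 * $3 * $4 ))
}

# MLA-style attention (kv_lora_rank present) caches a single compressed
# latent plus the rope dims per layer -- no separate K/V pair.
mla_bytes_per_token() { # args: layers kv_lora_rank qk_rope_head_dim dtype_bytes
  echo $(( $1 * ($2 + $3) * $4 ))
}

kv_bytes_per_token 92 8 128 2    # GLM-4.6 at fp16: prints 376832
mla_bytes_per_token 78 512 64 2  # GLM-4.5 at fp16: prints 89856
```

Note how the MLA layout caches roughly 4x fewer bytes per token despite GLM-4.5's larger hidden size, which is the kind of difference the calculator surfaces.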
5 changes: 2 additions & 3 deletions docs/source/getting-started/quickstart_vllm.md

````diff
@@ -49,10 +49,9 @@ docker build -t ucm-vllm-sparse:latest -f ./docker/Dockerfile.vllm_gpu_v0110 ./
 1. Prepare vLLM Environment
 
 For the sake of environment isolation and simplicity, we recommend preparing the vLLM environment by pulling the official, pre-built vLLM Docker image.
-> Note: v0.11.0 is newly supported (replace the tag with v0.11.0 if needed).
 
 ```bash
-docker pull vllm/vllm-openai:v0.11.0
+docker pull vllm/vllm-openai:<vllm_version>
 ```
 Use the following command to run your own container:
 ```bash
@@ -65,7 +64,7 @@ docker build -t ucm-vllm-sparse:latest -f ./docker/Dockerfile.vllm_gpu_v0110 ./
 -v <path_to_your_storage>:/home/storage \
 --entrypoint /bin/bash \
 --name <name_of_your_container> \
--it vllm/vllm-openai:v0.9.2
+-it vllm/vllm-openai:<vllm_version>
 ```
 Refer to [Set up using docker](https://docs.vllm.ai/en/latest/getting_started/installation/gpu.html#set-up-using-docker) for more information to run your own vLLM container.
 
````
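One concrete way to fill the `<vllm_version>` placeholder the diff introduces; pinning via a variable keeps the pull and run tags consistent. Assuming v0.11.0 as the tag (it matches the `Dockerfile.vllm_gpu_v0110` build file this quickstart references; any supported release substitutes the same way):

```shell
# Pin one release tag and reuse it for both docker pull and docker run.
VLLM_VERSION=v0.11.0
IMAGE="vllm/vllm-openai:${VLLM_VERSION}"
echo "image tag resolved to ${IMAGE}"
# docker pull "${IMAGE}"   # run on a machine with Docker and network access
```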
4 changes: 2 additions & 2 deletions docs/source/getting-started/quickstart_vllm_ascend.md

````diff
@@ -25,7 +25,7 @@ cd ..
 
 >**Note:** For the Atlas A3 series, the `PLATFORM` variable should be set to `ascend-a3`.
 
-2、Apply vLLM and vLLM-Ascend Integration Patches (Required)
+2、Apply vLLM and vLLM-Ascend Integration Patches (Not required for versions >= v0.17.0rc1)
 To enable Unified Cache Management (UCM) integration, you need to apply patches to both vLLM and vLLM-Ascend source trees.
 
 #### Option A: Monkey Patch (Recommended)
@@ -38,7 +38,7 @@ export ENABLE_UCM_PATCH=1
 ```
 >**Note:** Enabling ENABLE_UCM_PATCH is required to use the Prefix Caching feature with UCM.
 
-2. Enable Sparse Attention (Optional):
+2. Enable Sparse Attention (supported on v0.11.0):
 ```bash
 export ENABLE_SPARSE=1
 ```
````
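Taken together, the monkey-patch option in this quickstart comes down to two environment flags; a sketch of setting both before launching vLLM-Ascend (per the notes above, the patch flag is required for UCM Prefix Caching, and sparse attention is stated as supported only on v0.11.0):

```shell
# UCM monkey-patch flags from the quickstart, set before starting vLLM:
export ENABLE_UCM_PATCH=1   # required for Prefix Caching with UCM
export ENABLE_SPARSE=1      # optional; Sparse Attention (vLLM v0.11.0 only)
echo "ENABLE_UCM_PATCH=${ENABLE_UCM_PATCH} ENABLE_SPARSE=${ENABLE_SPARSE}"
```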