InternLM · CUHKSZzxy · Oct 30, 2025 · Oct 30, 2025
diff --git a/docker/install.sh b/docker/install.sh
@@ -22,9 +22,9 @@ popd >/dev/null
 if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
     apt-get install -y --no-install-recommends cuda-minimal-build-11-8
 elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-12-4
+    apt-get install -y --no-install-recommends cuda-minimal-build-12-4 cuda-cuobjdump-12-4 cuda-nvdisasm-12-4
 elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-12-8
+    apt-get install -y --no-install-recommends cuda-minimal-build-12-8 cuda-cuobjdump-12-8 cuda-nvdisasm-12-8
 fi
 
 apt-get clean -y

diff --git a/docker/prepare_wheel.sh b/docker/prepare_wheel.sh
@@ -22,13 +22,14 @@ fi
 if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
 
     if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
-        DEEP_GEMM_VERSION=03d0be3
         FLASH_MLA_VERSION=9edee0c
     else
-        DEEP_GEMM_VERSION=79f48ee
         FLASH_MLA_VERSION=c759027
     fi
 
+    # The current EP implementation uses dlblas, which is incompatible with the latest deep_gemm APIs.
+    # To ensure compatibility, we pin deep_gemm to the same older commit for every CUDA version handled here.
+    DEEP_GEMM_VERSION=03d0be3
     DEEP_EP_VERSION=26cf250
     pip install nvidia-nvshmem-cu12
 

diff --git a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
@@ -1,6 +1,13 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 try:
     from deep_ep import Buffer
+
+    from lmdeploy.pytorch.envs import env_to_int
+
+    # The default of 20 mirrors DeepEP's own default SM count; see:
+    # https://github.com/deepseek-ai/DeepEP/blob/bfded34800dfec415b71503f8205181de90b2480/deep_ep/buffer.py#L30
+    deep_ep_buffer_num_sms = env_to_int(env_var='DEEPEP_BUFFER_NUM_SMS', default=20)
+    Buffer.set_num_sms(deep_ep_buffer_num_sms)
     use_deepep = True
 except ImportError:
     use_deepep = False

diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py
@@ -126,6 +126,7 @@ def _patched_get_env(
     # we don't need to read this, it would be passed to ray workers
     # If Ray is launched from outside, it may fail to access the environment variables.
     os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
+    os.getenv('DEEPEP_BUFFER_NUM_SMS', None)
 
     # deepgemm
     os.getenv('DG_JIT_DEBUG', '0')