From cbe8ddba6a49ec9d0ac4dd0006ba076277e4c5c6 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 30 Oct 2025 10:44:52 +0800
Subject: [PATCH 1/2] Fix multi-node EP: pin deep_gemm, make DeepEP num_sms configurable

---
 docker/prepare_wheel.sh                            | 5 +++--
 lmdeploy/pytorch/backends/cuda/token_dispatcher.py | 7 +++++++
 lmdeploy/pytorch/envs.py                           | 1 +
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/docker/prepare_wheel.sh b/docker/prepare_wheel.sh
index 4250c8820a..364a8c6dd0 100755
--- a/docker/prepare_wheel.sh
+++ b/docker/prepare_wheel.sh
@@ -22,13 +22,14 @@ fi
 if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
 
     if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
-        DEEP_GEMM_VERSION=03d0be3
         FLASH_MLA_VERSION=9edee0c
     else
-        DEEP_GEMM_VERSION=79f48ee
         FLASH_MLA_VERSION=c759027
     fi
 
+    # The current EP implementation uses dlblas, which is incompatible with the latest deep_gemm APIs.
+    # To ensure compatibility, we pin deep_gemm to an older version.
+    DEEP_GEMM_VERSION=03d0be3
     DEEP_EP_VERSION=26cf250
     pip install nvidia-nvshmem-cu12
 
diff --git a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
index bdf8cfab9f..77c88ac464 100644
--- a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
+++ b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py
@@ -1,6 +1,13 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 try:
     from deep_ep import Buffer
+
+    from lmdeploy.pytorch.envs import env_to_int
+
+    # The default value (20) mirrors the default in the DeepEP code; see:
+    # https://github.com/deepseek-ai/DeepEP/blob/bfded34800dfec415b71503f8205181de90b2480/deep_ep/buffer.py#L30
+    deep_ep_buffer_num_sms = env_to_int(env_var='DEEPEP_BUFFER_NUM_SMS', default=20)
+    Buffer.set_num_sms(deep_ep_buffer_num_sms)
     use_deepep = True
 except ImportError:
     use_deepep = False
diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py
index d5d08362d9..1ac1741dd0 100644
--- a/lmdeploy/pytorch/envs.py
+++ b/lmdeploy/pytorch/envs.py
@@ -126,6 +126,7 @@ def _patched_get_env(
     # we don't need to read this, it would be passed to ray workers
     # If Ray is launched from outside, it may fail to access the environment variables.
     os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
+    os.getenv('DEEPEP_BUFFER_NUM_SMS', None)
 
     # deepgemm
     os.getenv('DG_JIT_DEBUG', '0')

From 1de8e6763eb529cf711e2ff0ca14f93e67b47279 Mon Sep 17 00:00:00 2001
From: zxy <zhou0493@e.ntu.edu.sg>
Date: Thu, 30 Oct 2025 11:02:04 +0800
Subject: [PATCH 2/2] Add deep_gemm JIT dependencies (cuobjdump, nvdisasm)

---
 docker/install.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/install.sh b/docker/install.sh
index 941ee0c144..ed281c6ff2 100755
--- a/docker/install.sh
+++ b/docker/install.sh
@@ -22,9 +22,9 @@ popd >/dev/null
 if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
     apt-get install -y --no-install-recommends cuda-minimal-build-11-8
 elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-12-4
+    apt-get install -y --no-install-recommends cuda-minimal-build-12-4 cuda-cuobjdump-12-4 cuda-nvdisasm-12-4
 elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
-    apt-get install -y --no-install-recommends cuda-minimal-build-12-8
+    apt-get install -y --no-install-recommends cuda-minimal-build-12-8 cuda-cuobjdump-12-8 cuda-nvdisasm-12-8
 fi
 
 apt-get clean -y