From cbe8ddba6a49ec9d0ac4dd0006ba076277e4c5c6 Mon Sep 17 00:00:00 2001 From: zxy Date: Thu, 30 Oct 2025 10:44:52 +0800 Subject: [PATCH 1/2] fix for multi-node ep --- docker/prepare_wheel.sh | 5 +++-- lmdeploy/pytorch/backends/cuda/token_dispatcher.py | 7 +++++++ lmdeploy/pytorch/envs.py | 1 + 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docker/prepare_wheel.sh b/docker/prepare_wheel.sh index 4250c8820a..364a8c6dd0 100755 --- a/docker/prepare_wheel.sh +++ b/docker/prepare_wheel.sh @@ -22,13 +22,14 @@ fi if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then - DEEP_GEMM_VERSION=03d0be3 FLASH_MLA_VERSION=9edee0c else - DEEP_GEMM_VERSION=79f48ee FLASH_MLA_VERSION=c759027 fi + # The current EP implementation uses dlblas, which is incompatible with the latest deep_gemm APIs. + # To ensure compatibility, we pin deep_gemm to an older version. + DEEP_GEMM_VERSION=03d0be3 DEEP_EP_VERSION=26cf250 pip install nvidia-nvshmem-cu12 diff --git a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py index bdf8cfab9f..77c88ac464 100644 --- a/lmdeploy/pytorch/backends/cuda/token_dispatcher.py +++ b/lmdeploy/pytorch/backends/cuda/token_dispatcher.py @@ -1,6 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. try: from deep_ep import Buffer + + from lmdeploy.pytorch.envs import env_to_int + + # default value refers to DeepEP code + # https://github.com/deepseek-ai/DeepEP/blob/bfded34800dfec415b71503f8205181de90b2480/deep_ep/buffer.py#L30 + deep_ep_buffer_num_sms = env_to_int(env_var='DEEPEP_BUFFER_NUM_SMS', default=20) + Buffer.set_num_sms(deep_ep_buffer_num_sms) use_deepep = True except ImportError: use_deepep = False diff --git a/lmdeploy/pytorch/envs.py b/lmdeploy/pytorch/envs.py index d5d08362d9..1ac1741dd0 100644 --- a/lmdeploy/pytorch/envs.py +++ b/lmdeploy/pytorch/envs.py @@ -126,6 +126,7 @@ def _patched_get_env( # we don't need to read this, it would be passed to ray workers # If Ray is launched from outside, it may fail to access the environment variables. os.getenv('DEEPEP_MAX_BATCH_SIZE', None) + os.getenv('DEEPEP_BUFFER_NUM_SMS', None) # deepgemm os.getenv('DG_JIT_DEBUG', '0') From 1de8e6763eb529cf711e2ff0ca14f93e67b47279 Mon Sep 17 00:00:00 2001 From: zxy Date: Thu, 30 Oct 2025 11:02:04 +0800 Subject: [PATCH 2/2] add deep_gemm jit dependencies --- docker/install.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/install.sh b/docker/install.sh index 941ee0c144..ed281c6ff2 100755 --- a/docker/install.sh +++ b/docker/install.sh @@ -22,9 +22,9 @@ popd >/dev/null if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then apt-get install -y --no-install-recommends cuda-minimal-build-11-8 elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then - apt-get install -y --no-install-recommends cuda-minimal-build-12-4 + apt-get install -y --no-install-recommends cuda-minimal-build-12-4 cuda-cuobjdump-12-4 cuda-nvdisasm-12-4 elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then - apt-get install -y --no-install-recommends cuda-minimal-build-12-8 + apt-get install -y --no-install-recommends cuda-minimal-build-12-8 cuda-cuobjdump-12-8 cuda-nvdisasm-12-8 fi apt-get clean -y