Skip to content

Commit a663f6a

Browse files
authored
[cpu][perf] Fix low CPU utilization with VLLM_CPU_OMP_THREADS_BIND on AArch64 (vllm-project#27415)
Signed-off-by: Fadi Arafeh <[email protected]>
1 parent a4fc218 commit a663f6a

File tree

3 files changed

+83
-6
lines changed

3 files changed

+83
-6
lines changed

cmake/cpu_extension.cmake

Lines changed: 17 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -212,11 +212,24 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON
212212
# Build ACL with scons
213213
include(ProcessorCount)
214214
ProcessorCount(_NPROC)
215+
set(_scons_cmd
216+
scons -j${_NPROC}
217+
Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
218+
arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
219+
multi_isa=1 openmp=1 cppthreads=0
220+
)
221+
222+
# locate PyTorch's libgomp (e.g. site-packages/torch.libs/libgomp-947d5fa1.so.1.0.0)
223+
# and create a local shim dir with it
224+
include("${CMAKE_CURRENT_LIST_DIR}/utils.cmake")
225+
vllm_prepare_torch_gomp_shim(VLLM_TORCH_GOMP_SHIM_DIR)
226+
227+
if(NOT VLLM_TORCH_GOMP_SHIM_DIR STREQUAL "")
228+
list(APPEND _scons_cmd extra_link_flags=-L${VLLM_TORCH_GOMP_SHIM_DIR})
229+
endif()
230+
215231
execute_process(
216-
COMMAND scons -j${_NPROC}
217-
Werror=0 debug=0 neon=1 examples=0 embed_kernels=0 os=linux
218-
arch=armv8.2-a build=native benchmark_examples=0 fixed_format_kernels=1
219-
multi_isa=1 openmp=1 cppthreads=0
232+
COMMAND ${_scons_cmd}
220233
WORKING_DIRECTORY "$ENV{ACL_ROOT_DIR}"
221234
RESULT_VARIABLE _acl_rc
222235
)

cmake/utils.cmake

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -129,6 +129,44 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
129129
set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
130130
endfunction()
131131

132+
# Find libgomp that gets shipped with PyTorch wheel and create a shim dir with:
133+
# libgomp.so -> libgomp-<hash>.so...
134+
# libgomp.so.1 -> libgomp-<hash>.so...
135+
# OUTPUT: TORCH_GOMP_SHIM_DIR ("" if not found)
136+
function(vllm_prepare_torch_gomp_shim TORCH_GOMP_SHIM_DIR)
  # Locate the libgomp shipped inside the PyTorch wheel and expose it through a
  # shim directory containing the conventional linker names:
  #   <shim>/libgomp.so   -> <torch.libs>/libgomp-<hash>.so...
  #   <shim>/libgomp.so.1 -> <torch.libs>/libgomp-<hash>.so...
  #
  # TORCH_GOMP_SHIM_DIR names the caller's output variable. It receives the shim
  # directory path on success and "" when libgomp is not found or a symlink
  # could not be created.

  # Default the output to "" so that every early return reports "no shim".
  set(${TORCH_GOMP_SHIM_DIR} "" PARENT_SCOPE)

  # Probe <site-packages>/torch.libs for the vendored libgomp. The Python
  # snippet is deliberately best-effort: any failure (torch missing, no match)
  # prints an empty line instead of raising. The matches are sorted so that the
  # same file is selected on every configure run when several candidates exist.
  # NOTE(review): run_python is defined elsewhere in this file; presumably it
  # stores the stripped stdout in the first argument -- confirm.
  run_python(_VLLM_TORCH_GOMP_PATH
    "
import os, glob
try:
  import torch
  torch_pkg = os.path.dirname(torch.__file__)
  site_root = os.path.dirname(torch_pkg)
  torch_libs = os.path.join(site_root, 'torch.libs')
  print(sorted(glob.glob(os.path.join(torch_libs, 'libgomp-*.so*')))[0])
except:
  print('')
"
    "failed to probe torch.libs for libgomp")

  # Quote the expansion: an unquoted, unset variable would be compared as the
  # literal text of its own name rather than as an empty string.
  if("${_VLLM_TORCH_GOMP_PATH}" STREQUAL "" OR NOT EXISTS "${_VLLM_TORCH_GOMP_PATH}")
    return()
  endif()

  # Create shim under the build tree
  set(_shim "${CMAKE_BINARY_DIR}/gomp_shim")
  file(MAKE_DIRECTORY "${_shim}")

  foreach(_link_name IN ITEMS libgomp.so libgomp.so.1)
    # Drop any stale link left by a previous configure run before re-creating it.
    execute_process(COMMAND ${CMAKE_COMMAND} -E rm -f "${_shim}/${_link_name}")
    execute_process(
      COMMAND ${CMAKE_COMMAND} -E create_symlink
              "${_VLLM_TORCH_GOMP_PATH}" "${_shim}/${_link_name}"
      RESULT_VARIABLE _link_rc)
    if(NOT _link_rc EQUAL 0)
      # Do not advertise a shim directory that lacks the expected links; the
      # output variable keeps the "" assigned at the top of the function.
      message(WARNING
        "vllm_prepare_torch_gomp_shim: could not create "
        "${_shim}/${_link_name} -> ${_VLLM_TORCH_GOMP_PATH}")
      return()
    endif()
  endforeach()

  set(${TORCH_GOMP_SHIM_DIR} "${_shim}" PARENT_SCOPE)
endfunction()
169+
132170
# Macro for converting a `gencode` version number to a cmake version number.
133171
macro(string_to_ver OUT_VER IN_STR)
134172
string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})

vllm/platforms/cpu.py

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33

4+
import glob
45
import json
56
import os
67
import platform
@@ -301,8 +302,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
301302
os.environ["VLLM_DISABLE_SHARED_EXPERTS_STREAM"] = "0"
302303

303304
# Intel OpenMP setting
304-
ld_prealod_str = os.getenv("LD_PRELOAD", "")
305-
if "libiomp5.so" in ld_prealod_str:
305+
ld_preload_str = os.getenv("LD_PRELOAD", "")
306+
if "libiomp5.so" in ld_preload_str:
306307
# The time(milliseconds) that a thread should wait after
307308
# completing the execution of a parallel region, before sleeping.
308309
os.environ["KMP_BLOCKTIME"] = "1"
@@ -313,6 +314,31 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
313314
os.environ["KMP_PLAIN_BARRIER_PATTERN"] = "dist,dist"
314315
os.environ["KMP_REDUCTION_BARRIER_PATTERN"] = "dist,dist"
315316

317+
if (
318+
platform.system() == "Linux"
319+
and Platform.get_cpu_architecture() == CpuArchEnum.ARM
320+
and not ("libomp" in ld_preload_str or "libgomp" in ld_preload_str)
321+
):
322+
# We need to LD_PRELOAD PyTorch's libgomp, otherwise only
323+
# one core will be properly utilized when we thread-bind
324+
# See: https://github.com/vllm-project/vllm/issues/27369
325+
# TODO: Remove once:
326+
# https://github.com/pytorch/pytorch/issues/166087 is fixed
327+
328+
# We need to find the location of PyTorch's libgomp
329+
torch_pkg = os.path.dirname(torch.__file__)
330+
site_root = os.path.dirname(torch_pkg)
331+
torch_libs = os.path.join(site_root, "torch.libs")
332+
pytorch_libgomp_so_candidates = glob.glob(
333+
os.path.join(torch_libs, "libgomp-*.so*")
334+
)
335+
if pytorch_libgomp_so_candidates:
336+
pytorch_libgomp_so = pytorch_libgomp_so_candidates[0]
337+
if ld_preload_str:
338+
ld_preload_str += ":"
339+
ld_preload_str += pytorch_libgomp_so
340+
os.environ["LD_PRELOAD"] = ld_preload_str
341+
316342
# Hint IPEX to use shared-memory-based AllReduce
317343
os.environ["LOCAL_WORLD_SIZE"] = str(
318344
vllm_config.parallel_config.tensor_parallel_size

0 commit comments

Comments
 (0)