11# SPDX-License-Identifier: Apache-2.0
22# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33
4+ import glob
45import json
56import os
67import platform
@@ -301,8 +302,8 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
301302 os .environ ["VLLM_DISABLE_SHARED_EXPERTS_STREAM" ] = "0"
302303
303304 # Intel OpenMP setting
304- ld_prealod_str = os .getenv ("LD_PRELOAD" , "" )
305- if "libiomp5.so" in ld_prealod_str :
305+ ld_preload_str = os .getenv ("LD_PRELOAD" , "" )
306+ if "libiomp5.so" in ld_preload_str :
306307 # The time(milliseconds) that a thread should wait after
307308 # completing the execution of a parallel region, before sleeping.
308309 os .environ ["KMP_BLOCKTIME" ] = "1"
@@ -313,6 +314,31 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
313314 os .environ ["KMP_PLAIN_BARRIER_PATTERN" ] = "dist,dist"
314315 os .environ ["KMP_REDUCTION_BARRIER_PATTERN" ] = "dist,dist"
315316
# On Linux/ARM, LD_PRELOAD PyTorch's bundled libgomp; otherwise only one
# core is properly utilized once threads are bound.
# See: https://github.com/vllm-project/vllm/issues/27369
# TODO: Remove once https://github.com/pytorch/pytorch/issues/166087 is fixed.
# Skip when the user already preloads an OpenMP runtime (libomp/libgomp).
if (
    platform.system() == "Linux"
    and Platform.get_cpu_architecture() == CpuArchEnum.ARM
    and "libomp" not in ld_preload_str
    and "libgomp" not in ld_preload_str
):
    # PyPI torch wheels vendor libgomp under <site-packages>/torch.libs
    # (a sibling of the torch package directory).
    torch_pkg = os.path.dirname(torch.__file__)
    site_root = os.path.dirname(torch_pkg)
    torch_libs = os.path.join(site_root, "torch.libs")
    # glob.glob order is OS/filesystem-dependent; sort so the chosen
    # library is deterministic when several versions match.
    candidates = sorted(glob.glob(os.path.join(torch_libs, "libgomp-*.so*")))
    if candidates:
        # Append to any user-supplied LD_PRELOAD instead of clobbering it.
        if ld_preload_str:
            ld_preload_str += ":"
        ld_preload_str += candidates[0]
        os.environ["LD_PRELOAD"] = ld_preload_str
316342 # To hint IPEX uses shared memory based AllReduce
317343 os .environ ["LOCAL_WORLD_SIZE" ] = str (
318344 vllm_config .parallel_config .tensor_parallel_size
0 commit comments