From 64e03e909fd292a7679379258a23fd23ab8525ba Mon Sep 17 00:00:00 2001
From: Antoni Baum
Date: Thu, 13 Jun 2024 16:06:49 -0700
Subject: [PATCH] Add `cuda_device_count_stateless` (#5473)

---
 .buildkite/test-pipeline.yaml                  |  1 +
 tests/conftest.py                              | 17 ++-------
 tests/distributed/test_utils.py                | 31 ++++++++++++++++
 vllm/config.py                                 |  6 ++--
 .../device_communicators/custom_all_reduce.py  |  3 +-
 .../custom_all_reduce_utils.py                 |  3 +-
 vllm/executor/multiproc_gpu_executor.py        |  6 ++--
 vllm/utils.py                                  | 35 +++++++++++++++++++
 8 files changed, 79 insertions(+), 23 deletions(-)
 create mode 100644 tests/distributed/test_utils.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6b12d19ba611..6a2932db9f2d 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -48,6 +48,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - pytest -v -s spec_decode/e2e/test_integration_dist.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
 - label: Distributed Tests (Multiple Groups)
   #mirror_hardwares: [amd]
diff --git a/tests/conftest.py b/tests/conftest.py
index f1c3f0e67931..2c72094f5005 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,8 +2,6 @@
 import gc
 import logging
 import os
-import subprocess
-import sys
 from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import pytest
@@ -24,7 +22,7 @@
 from vllm.multimodal import MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu
+from vllm.utils import cuda_device_count_stateless, is_cpu
 
 logger = init_logger(__name__)
 
@@ -769,18 +767,7 @@ def num_gpus_available():
     """Get number of GPUs without initializing the CUDA context
     in current process."""
 
-    try:
-        out = subprocess.run([
-            sys.executable, "-c",
-            "import torch; print(torch.cuda.device_count())"
-        ],
-                             capture_output=True,
-                             check=True,
-                             text=True)
-    except subprocess.CalledProcessError as e:
-        logger.warning("Failed to get number of GPUs.", exc_info=e)
-        return 0
-    return int(out.stdout.strip())
+    return cuda_device_count_stateless()
 
 
 @pytest.fixture(scope="session")
diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py
new file mode 100644
index 000000000000..b7ec59c7a2cc
--- /dev/null
+++ b/tests/distributed/test_utils.py
@@ -0,0 +1,31 @@
+import os
+
+import ray
+
+from vllm.utils import cuda_device_count_stateless
+
+
+@ray.remote
+class _CUDADeviceCountStatelessTestActor():
+
+    def get_count(self):
+        return cuda_device_count_stateless()
+
+    def set_cuda_visible_devices(self, cuda_visible_devices: str):
+        os.environ["CUDA_VISIBLE_DEVICES"] = cuda_visible_devices
+
+    def get_cuda_visible_devices(self):
+        return os.environ["CUDA_VISIBLE_DEVICES"]
+
+
+def test_cuda_device_count_stateless():
+    """Test that cuda_device_count_stateless changes return value if
+    CUDA_VISIBLE_DEVICES is changed."""
+
+    actor = _CUDADeviceCountStatelessTestActor.options(num_gpus=2).remote()
+    assert ray.get(actor.get_cuda_visible_devices.remote()) == "0,1"
+    assert ray.get(actor.get_count.remote()) == 2
+    ray.get(actor.set_cuda_visible_devices.remote("0"))
+    assert ray.get(actor.get_count.remote()) == 1
+    ray.get(actor.set_cuda_visible_devices.remote(""))
+    assert ray.get(actor.get_count.remote()) == 0
diff --git a/vllm/config.py b/vllm/config.py
index 2d72deb7f722..403959cb79d2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -11,7 +11,8 @@
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 from vllm.model_executor.models import ModelRegistry
 from vllm.transformers_utils.config import get_config, get_hf_text_config
-from vllm.utils import get_cpu_memory, is_cpu, is_hip, is_neuron, is_tpu
+from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
+                        is_hip, is_neuron, is_tpu)
 
 if TYPE_CHECKING:
     from ray.util.placement_group import PlacementGroup
@@ -637,12 +638,11 @@ def __init__(
         if self.distributed_executor_backend is None and self.world_size > 1:
             # We use multiprocessing by default if world_size fits on the
             # current node and we aren't in a ray placement group.
-            from torch.cuda import device_count
 
             from vllm.executor import ray_utils
             backend = "mp"
             ray_found = ray_utils.ray is not None
-            if device_count() < self.world_size:
+            if cuda_device_count_stateless() < self.world_size:
                 if not ray_found:
                     raise ValueError("Unable to load Ray which is "
                                      "required for multi-node inference")
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 9a2b47594916..b0cb21a02278 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -11,6 +11,7 @@
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import is_in_the_same_node
 from vllm.logger import init_logger
+from vllm.utils import cuda_device_count_stateless
 
 try:
     import pynvml
@@ -144,7 +145,7 @@ def __init__(self,
         if cuda_visible_devices:
             device_ids = list(map(int, cuda_visible_devices.split(",")))
         else:
-            device_ids = list(range(torch.cuda.device_count()))
+            device_ids = list(range(cuda_device_count_stateless()))
 
         physical_device_id = device_ids[device.index]
         tensor = torch.tensor([physical_device_id],
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 1fd0058f617f..c9573edb08f3 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -12,6 +12,7 @@
 
 import vllm.envs as envs
 from vllm.logger import init_logger
+from vllm.utils import cuda_device_count_stateless
 
 logger = init_logger(__name__)
 
@@ -152,7 +153,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
 
     is_distributed = dist.is_initialized()
 
-    num_dev = torch.cuda.device_count()
+    num_dev = cuda_device_count_stateless()
     cuda_visible_devices = envs.CUDA_VISIBLE_DEVICES
     if cuda_visible_devices is None:
         cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index 99c9e52034cc..8385e56f88b3 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -9,7 +9,8 @@
                                                    ResultHandler, WorkerMonitor)
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+from vllm.utils import (cuda_device_count_stateless,
+                        get_distributed_init_method, get_ip, get_open_port,
                         get_vllm_instance_id, make_async)
 
 logger = init_logger(__name__)
@@ -33,8 +34,7 @@ def _init_executor(self) -> None:
         # Disable torch async compiling which won't work with daemonic processes
         os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
 
-        from torch.cuda import device_count
-        assert world_size <= device_count(), (
+        assert world_size <= cuda_device_count_stateless(), (
             "please set tensor_parallel_size to less than max local gpu count")
 
         distributed_init_method = get_distributed_init_method(
diff --git a/vllm/utils.py b/vllm/utils.py
index 8f33f11f5ac1..ef0602987a9e 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -693,3 +693,38 @@ def inner(*args, **kwargs):
             return inner  # type: ignore
 
     return wrapper
+
+
+@lru_cache(maxsize=8)
+def _cuda_device_count_stateless(
+        cuda_visible_devices: Optional[str] = None) -> int:
+    # Note: cuda_visible_devices is not used, but we keep it as an argument for
+    # LRU Cache purposes.
+
+    # Code below is based on
+    # https://github.com/pytorch/pytorch/blob/
+    # c1cd946818442aca8c7f812b16d187ce1586c3bc/
+    # torch/cuda/__init__.py#L831C1-L831C17
+    import torch.cuda
+    import torch.version
+
+    if not torch.cuda._is_compiled():
+        return 0
+    # bypass _device_count_nvml() if rocm (not supported)
+    nvml_count = -1 if torch.version.hip else torch.cuda._device_count_nvml()
+    r = torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count
+    return r
+
+
+def cuda_device_count_stateless() -> int:
+    """Get number of CUDA devices, caching based on the value of
+    CUDA_VISIBLE_DEVICES at the time of call.
+
+    This should be used instead of torch.cuda.device_count()
+    unless CUDA_VISIBLE_DEVICES has already been set to the desired
+    value."""
+
+    # This can be removed and simply replaced with torch.cuda.get_device_count
+    # after https://github.com/pytorch/pytorch/pull/122815 is released.
+
+    return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)