diff --git a/.gitignore b/.gitignore index 09ba832..07fd616 100644 --- a/.gitignore +++ b/.gitignore @@ -61,6 +61,7 @@ pyrightconfig.json *.swp *~ .DS_Store +.qoder/ # Ignore uv lockfile uv.lock diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index afd67b6..83339f4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: - fastapi - httpx - numpy - - tinker + - tinker>=0.18.2 - typer - uvicorn - pytest diff --git a/src/tuft/backends/sampling_backend.py b/src/tuft/backends/sampling_backend.py index 950f75f..e22753a 100644 --- a/src/tuft/backends/sampling_backend.py +++ b/src/tuft/backends/sampling_backend.py @@ -250,17 +250,16 @@ def _create_colocated_engine(self, config: ModelConfig): def _create_standalone_engine(self, config: ModelConfig): import ray - from ray.util.placement_group import placement_group - from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from trinity.common.config import InferenceModelConfig from trinity.common.models.vllm_model import vLLMRolloutModel - # create a placement group for this model - pg = placement_group( - [{"CPU": 1, "GPU": 1} for _ in range(config.tensor_parallel_size)], - strategy="PACK", - ) - ray.get(pg.ready(), timeout=120) + # Assign tensor_parallel_size GPUs to the actor itself + # so that Ray populates CUDA_VISIBLE_DEVICES correctly. vLLM then + # creates its own placement group inside the EngineCore process where + # the GPUs are visible. + num_gpus = config.tensor_parallel_size + bundle_indices = ",".join(str(i) for i in range(config.tensor_parallel_size)) + if not self._worker_venv_path or not self._worker_venv_path.strip(): _runtime_env = {} else: @@ -277,11 +276,7 @@ def _create_standalone_engine(self, config: ModelConfig): ray.remote(vLLMRolloutModel) .options( name="sampling_model_" + self.base_model, - num_gpus=0 if config.tensor_parallel_size > 1 else 1, - scheduling_strategy=PlacementGroupSchedulingStrategy( - placement_group=pg, - placement_group_capture_child_tasks=True, - ), + num_gpus=num_gpus, runtime_env=_runtime_env, ) .remote( @@ -307,6 +302,7 @@ def _create_standalone_engine(self, config: ModelConfig): "max_loras": config.max_loras, }, gpu_memory_utilization=config.sampling_memory_fraction, + bundle_indices=bundle_indices, ) ) )