diff --git a/.gitignore b/.gitignore
index 09ba832..07fd616 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,6 +61,7 @@ pyrightconfig.json
 *.swp
 *~
 .DS_Store
+.qoder/
 
 # Ignore uv lockfile
 uv.lock
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index afd67b6..83339f4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -14,7 +14,7 @@ repos:
           - fastapi
           - httpx
           - numpy
-          - tinker
+          - tinker>=0.18.2
           - typer
           - uvicorn
           - pytest
diff --git a/src/tuft/backends/sampling_backend.py b/src/tuft/backends/sampling_backend.py
index 950f75f..e22753a 100644
--- a/src/tuft/backends/sampling_backend.py
+++ b/src/tuft/backends/sampling_backend.py
@@ -250,17 +250,16 @@ def _create_colocated_engine(self, config: ModelConfig):
 
     def _create_standalone_engine(self, config: ModelConfig):
         import ray
-        from ray.util.placement_group import placement_group
-        from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
         from trinity.common.config import InferenceModelConfig
         from trinity.common.models.vllm_model import vLLMRolloutModel
 
-        # create a placement group for this model
-        pg = placement_group(
-            [{"CPU": 1, "GPU": 1} for _ in range(config.tensor_parallel_size)],
-            strategy="PACK",
-        )
-        ray.get(pg.ready(), timeout=120)
+        # Assign tensor_parallel_size GPUs to the actor itself
+        # so that Ray populates CUDA_VISIBLE_DEVICES correctly.  vLLM then
+        # creates its own placement group inside the EngineCore process where
+        # the GPUs are visible.
+        num_gpus = config.tensor_parallel_size
+        bundle_indices = ",".join(str(i) for i in range(config.tensor_parallel_size))
+
         if not self._worker_venv_path or not self._worker_venv_path.strip():
             _runtime_env = {}
         else:
@@ -277,11 +276,7 @@ def _create_standalone_engine(self, config: ModelConfig):
             ray.remote(vLLMRolloutModel)
             .options(
                 name="sampling_model_" + self.base_model,
-                num_gpus=0 if config.tensor_parallel_size > 1 else 1,
-                scheduling_strategy=PlacementGroupSchedulingStrategy(
-                    placement_group=pg,
-                    placement_group_capture_child_tasks=True,
-                ),
+                num_gpus=num_gpus,
                 runtime_env=_runtime_env,
             )
             .remote(
@@ -307,6 +302,7 @@ def _create_standalone_engine(self, config: ModelConfig):
                         "max_loras": config.max_loras,
                     },
                     gpu_memory_utilization=config.sampling_memory_fraction,
+                    bundle_indices=bundle_indices,
                 )
             )
         )