diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index 1a7d11d0d4bc..fe9b2fac1117 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -240,6 +240,7 @@ def _smallest_positive_value(self) -> float:
         """
         return torch.finfo(self.probs_dtype).tiny
 
 
+# torch.multinomial forces a GPU<->CPU sync.
 # Therefore, we use an optimized implementation instead that skips the sync.
 # Note that we always sample with replacement.
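
For reference, a minimal sketch of the kind of sync-free sampling the added comment refers to: instead of torch.multinomial (which forces a GPU<->CPU sync), categorical sampling with replacement can be done entirely on-device with the exponential-race (Gumbel-max) trick. The function name `_multinomial_no_sync` and its signature below are illustrative assumptions, not the exact code in rejection_sampler.py.

```python
import torch


def _multinomial_no_sync(probs: torch.Tensor, num_samples: int = 1) -> torch.Tensor:
    """Sample with replacement from each row of `probs` without torch.multinomial.

    Dividing (unnormalized) probabilities by i.i.d. Exponential(1) noise and
    taking the argmax yields a categorical sample proportional to `probs`,
    using only device-side ops, so no GPU<->CPU sync is triggered.
    """
    if num_samples > 1:
        # Repeat each row num_samples times via expand + reshape
        # (torch.repeat_interleave would also force a sync).
        probs = probs[:, None, :].expand(
            probs.shape[0], num_samples, probs.shape[-1]
        ).reshape(-1, probs.shape[-1])
    # Exponential(1) noise, one draw per candidate token.
    q = torch.empty_like(probs).exponential_(1.0)
    return (probs / q).argmax(dim=-1).view(-1, num_samples)
```

The correctness argument: `probs[i] / q[i]` is maximal exactly when `q[i] / probs[i]` is minimal, and `q[i] / probs[i] ~ Exponential(probs[i])`, so the argmax lands on index `i` with probability proportional to `probs[i]`, which is what multinomial sampling with replacement requires.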