diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 4e98cd2d73d..b18e6c156c4 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -573,4 +573,4 @@ def dispatch_bgmv_low_level( names_and_values_to_update[k] = hint_on_error(v) names_and_values.update(names_and_values_to_update) -del names_and_values_to_update, names_and_values, v, k, fn_type \ No newline at end of file +del names_and_values_to_update, names_and_values, v, k, fn_type diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 07f1264b1db..48128155470 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -62,8 +62,8 @@ def get_attn_backend( ROCmFlashAttentionBackend) return ROCmFlashAttentionBackend elif backend == _Backend.TORCH_SDPA: - assert is_cpu(), RuntimeError( - "Torch SDPA backend is only used for the CPU device.") + # assert is_cpu(), RuntimeError( + # "Torch SDPA backend is only used for the CPU device.") logger.info("Using Torch SDPA backend.") from vllm.attention.backends.torch_sdpa import TorchSDPABackend return TorchSDPABackend diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index 56e6901f760..2d9ad9a3719 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -67,7 +67,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker": typical_acceptance_sampler_posterior_threshold=speculative_config. typical_acceptance_sampler_posterior_threshold, typical_acceptance_sampler_posterior_alpha=speculative_config. - typical_acceptance_sampler_posterior_alpha + typical_acceptance_sampler_posterior_alpha, cpu_draft_worker=speculative_config.cpu_draft_worker) return spec_decode_worker @@ -132,8 +132,8 @@ def create_worker( draft_tp = draft_parallel_config.tensor_parallel_size target_tp = scorer_worker.parallel_config.tensor_parallel_size - if draft_tp == 1: - draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner + # if draft_tp == 1: + # draft_worker_kwargs["model_runner_cls"] = TP1DraftModelRunner if cpu_draft_worker: cpu_draft_worker_kwargs = copy.deepcopy(draft_worker_kwargs) from vllm.executor.cpu_executor import (