From 550da5361d6a67cb96e24aa2a2c9dc05499428ad Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath
Date: Tue, 17 Dec 2024 15:22:38 +0000
Subject: [PATCH] fix profile

Signed-off-by: Varun Sundar Rabindranath
---
 vllm/v1/worker/gpu_model_runner.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index dadd696de48a1..7f7cd9be2b1c4 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -630,7 +630,6 @@ def profile_run(self) -> None:
             for _ in range(self.num_attn_layers)
         ]
 
-<<<<<<< HEAD
         # Profile with multimodal encoder & encoder cache.
         # TODO (ywang96): generalize this beyond image modality since
         # mm_input_mapper only supports image inputs.
@@ -681,16 +680,7 @@ def profile_run(self) -> None:
             # Cache the dummy encoder outputs.
             self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
-        # Trigger compilation for general shape.
-        hidden_states = self._dummy_run(self.model, self.max_num_tokens,
-                                        dummy_kv_caches)
-        logits = self.model.compute_logits(hidden_states, None)
-        logits = logits[:self.max_num_tokens]
-        # TODO(woosuk): Consider the memory usage of the sampler.
-        torch.cuda.synchronize()
-        del hidden_states, logits
-        self.encoder_cache.clear()
-=======
+        # TODO (varun): Reconcile text-only with multi-modal
         # compute num tokens per request. For profile, have maximum num_reqs and
         # that collectively have maximum num_tokens.
         num_reqs = self.scheduler_config.max_num_seqs
@@ -715,9 +705,8 @@ def profile_run(self) -> None:
         logits = self.model.compute_logits(hidden_states, None)
         # TODO(woosuk): Consider the memory usage of the sampler.
         torch.cuda.synchronize()
-        del hidden_states, logits
->>>>>>> 90bb4d2c3 (Add lora support)
+        self.encoder_cache.clear()
         gc.collect()
 
     def capture_model(self) -> None:
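
Note on the change (not part of the diff above): the three hunks remove merge-conflict markers (<<<<<<< HEAD, =======, >>>>>>> 90bb4d2c3 (Add lora support)) that had been committed during the LoRA merge, keep the multimodal-encoder profiling path, and restore the self.encoder_cache.clear() call before gc.collect(). As an illustration only, a minimal standalone checker for leftover conflict markers of this kind could look like the sketch below; the script name, marker set, and command-line interface are assumptions made for this example and are not part of vLLM or of this patch.

#!/usr/bin/env python3
"""Hypothetical helper (illustrative sketch, not part of this patch):
scan files for leftover git merge-conflict markers such as the ones
removed by the hunks above."""
import sys

# git writes these at the start of a line while a merge is unresolved.
MARKERS = ("<<<<<<< ", "=======", ">>>>>>> ")


def find_conflict_markers(path: str) -> list[tuple[int, str]]:
    """Return (line_number, line) pairs for lines that begin with a marker."""
    hits: list[tuple[int, str]] = []
    with open(path, encoding="utf-8", errors="replace") as f:
        for lineno, line in enumerate(f, start=1):
            if line.startswith(MARKERS):
                hits.append((lineno, line.rstrip("\n")))
    return hits


if __name__ == "__main__":
    # Example usage (hypothetical script name):
    #   python check_conflict_markers.py vllm/v1/worker/gpu_model_runner.py
    status = 0
    for path in sys.argv[1:]:
        for lineno, line in find_conflict_markers(path):
            print(f"{path}:{lineno}: {line}")
            status = 1
    sys.exit(status)

The check only matches markers at column 0, which is where git places them, so a non-zero exit status is a strong hint that a conflict was committed half-resolved, as happened here.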