
Commit

fix profile
Signed-off-by: Varun Sundar Rabindranath <[email protected]>
Varun Sundar Rabindranath committed Dec 17, 2024
1 parent 569fb69 commit 550da53
Showing 1 changed file with 2 additions and 13 deletions.
vllm/v1/worker/gpu_model_runner.py
@@ -630,7 +630,6 @@ def profile_run(self) -> None:
             for _ in range(self.num_attn_layers)
         ]
 
-<<<<<<< HEAD
         # Profile with multimodal encoder & encoder cache.
         # TODO (ywang96): generalize this beyond image modality since
         # mm_input_mapper only supports image inputs.
@@ -681,16 +680,7 @@ def profile_run(self) -> None:
         # Cache the dummy encoder outputs.
         self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
-        # Trigger compilation for general shape.
-        hidden_states = self._dummy_run(self.model, self.max_num_tokens,
-                                        dummy_kv_caches)
-        logits = self.model.compute_logits(hidden_states, None)
-        logits = logits[:self.max_num_tokens]
-        # TODO(woosuk): Consider the memory usage of the sampler.
-        torch.cuda.synchronize()
-        del hidden_states, logits
-        self.encoder_cache.clear()
-=======
+        # TODO (varun): Reconcile text-only with multi-modal
         # compute num tokens per request. For profile, have maximum num_reqs and
         # that collectively have maximum num_tokens.
         num_reqs = self.scheduler_config.max_num_seqs
@@ -715,9 +705,8 @@ def profile_run(self) -> None:
         logits = self.model.compute_logits(hidden_states, None)
         # TODO(woosuk): Consider the memory usage of the sampler.
         torch.cuda.synchronize()
-
         del hidden_states, logits
->>>>>>> 90bb4d2c3 (Add lora support)
+        self.encoder_cache.clear()
         gc.collect()
 
     def capture_model(self) -> None:
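The fix above resolves merge-conflict markers (<<<<<<< HEAD / ======= / >>>>>>> 90bb4d2c3) that had been left behind in profile_run: the duplicated "Trigger compilation" block from HEAD is removed, the profiling path from the LoRA commit is kept, and self.encoder_cache.clear() is restored ahead of gc.collect(). For illustration, below is a minimal, self-contained sketch of the dummy-run profiling pattern the surviving code follows: run one worst-case forward pass, synchronize, read the CUDA allocator's high-water mark, then release everything. DummyModel, profile_peak_memory, and hidden_size are hypothetical names for this sketch, not vLLM's API.

import gc

import torch


class DummyModel(torch.nn.Module):
    """Stand-in model; one linear layer is enough to exercise the pattern."""

    def __init__(self, hidden_size: int = 1024) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)


def profile_peak_memory(model: torch.nn.Module, max_num_tokens: int,
                        hidden_size: int = 1024) -> int:
    """Run one worst-case forward pass and report peak allocator usage."""
    device = next(model.parameters()).device
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)

    # Dummy input sized for the largest token count the runner will see.
    dummy_input = torch.zeros(max_num_tokens, hidden_size, device=device)
    hidden_states = model(dummy_input)

    if device.type == "cuda":
        # Wait for all kernels to finish before reading memory stats.
        torch.cuda.synchronize(device)
    peak = (torch.cuda.max_memory_allocated(device)
            if device.type == "cuda" else 0)

    # Drop references and collect, mirroring the cleanup in profile_run.
    del hidden_states, dummy_input
    gc.collect()
    return peak


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = DummyModel().to(device)
    print("peak bytes:", profile_peak_memory(model, max_num_tokens=8192))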
