
Commit

fix profile
Signed-off-by: Varun Sundar Rabindranath <[email protected]>
Varun Sundar Rabindranath committed Dec 17, 2024
1 parent 569fb69 commit 550da53
Showing 1 changed file with 2 additions and 13 deletions.
vllm/v1/worker/gpu_model_runner.py
@@ -630,7 +630,6 @@ def profile_run(self) -> None:
             for _ in range(self.num_attn_layers)
         ]
 
-<<<<<<< HEAD
         # Profile with multimodal encoder & encoder cache.
         # TODO (ywang96): generalize this beyond image modality since
         # mm_input_mapper only supports image inputs.
@@ -681,16 +680,7 @@ def profile_run(self) -> None:
         # Cache the dummy encoder outputs.
         self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
-        # Trigger compilation for general shape.
-        hidden_states = self._dummy_run(self.model, self.max_num_tokens,
-                                        dummy_kv_caches)
-        logits = self.model.compute_logits(hidden_states, None)
-        logits = logits[:self.max_num_tokens]
-        # TODO(woosuk): Consider the memory usage of the sampler.
-        torch.cuda.synchronize()
-        del hidden_states, logits
-        self.encoder_cache.clear()
-=======
+        # TODO (varun): Reconcile text-only with multi-modal
         # compute num tokens per request. For profile, have maximum num_reqs and
         # that collectively have maximum num_tokens.
         num_reqs = self.scheduler_config.max_num_seqs
@@ -715,9 +705,8 @@ def profile_run(self) -> None:
         logits = self.model.compute_logits(hidden_states, None)
         # TODO(woosuk): Consider the memory usage of the sampler.
         torch.cuda.synchronize()
-
         del hidden_states, logits
->>>>>>> 90bb4d2c3 (Add lora support)
+        self.encoder_cache.clear()
         gc.collect()
 
     def capture_model(self) -> None:
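The fix above resolves merge-conflict markers (<<<<<<< HEAD / ======= / >>>>>>> 90bb4d2c3) that had been left behind in profile_run: the duplicated "Trigger compilation" block from HEAD is removed, the profiling path from the LoRA commit is kept, and self.encoder_cache.clear() is restored ahead of gc.collect(). For illustration, below is a minimal, self-contained sketch of the dummy-run profiling pattern the surviving code follows: run one worst-case forward pass, synchronize, read the CUDA allocator's high-water mark, then release everything. DummyModel, profile_peak_memory, and hidden_size are hypothetical names for this sketch, not vLLM's API.

import gc

import torch


class DummyModel(torch.nn.Module):
    """Stand-in model; one linear layer is enough to exercise the pattern."""

    def __init__(self, hidden_size: int = 1024) -> None:
        super().__init__()
        self.proj = torch.nn.Linear(hidden_size, hidden_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.proj(x)


def profile_peak_memory(model: torch.nn.Module, max_num_tokens: int,
                        hidden_size: int = 1024) -> int:
    """Run one worst-case forward pass and report peak allocator usage."""
    device = next(model.parameters()).device
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats(device)

    # Dummy input sized for the largest token count the runner will see.
    dummy_input = torch.zeros(max_num_tokens, hidden_size, device=device)
    hidden_states = model(dummy_input)

    if device.type == "cuda":
        # Wait for all kernels to finish before reading memory stats.
        torch.cuda.synchronize(device)
    peak = (torch.cuda.max_memory_allocated(device)
            if device.type == "cuda" else 0)

    # Drop references and collect, mirroring the cleanup in profile_run.
    del hidden_states, dummy_input
    gc.collect()
    return peak


if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = DummyModel().to(device)
    print("peak bytes:", profile_peak_memory(model, max_num_tokens=8192))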
