21 changes: 11 additions & 10 deletions fastdeploy/config.py
@@ -1391,19 +1391,20 @@ def __init__(
         self.cache_config: CacheConfig = cache_config  # type: ignore
         self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
         self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
-        # Initialize cuda graph capture list
-        if self.graph_opt_config.cudagraph_capture_sizes is None:
-            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
 
-        if self.graph_opt_config.cudagraph_only_prefill:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
-        elif self.speculative_config is not None and self.speculative_config.method == "mtp":
+        # Initialize cuda graph capture list
+        max_shape = self.scheduler_config.max_num_seqs
+        if self.speculative_config is not None and self.speculative_config.method == "mtp":
             max_shape = self.scheduler_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1)
-            if max_shape % 2 == 1:
-                max_shape = max_shape + 1
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=min(512, max_shape))
+            assert max_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios."
+        if self.graph_opt_config.cudagraph_only_prefill:
+            max_shape = 512
         else:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
+            max_shape = min(512, max_shape)
+
+        if self.graph_opt_config.cudagraph_capture_sizes is None:
+            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=max_shape)
+        self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_shape)
 
         self.tokenizer = tokenizer
         self.ips = ips
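
For illustration, the new capture-size selection can be read as one standalone function. This is a sketch, not code from the PR: the function name and the example values below are hypothetical, and the real code reads self.scheduler_config, self.speculative_config, and self.graph_opt_config instead of plain arguments.

def compute_max_capture_size(
    max_num_seqs: int,
    mtp: bool = False,
    num_speculative_tokens: int = 0,
    cudagraph_only_prefill: bool = False,
) -> int:
    # Mirrors the new logic above: derive one shared max_shape from the
    # scheduler, adjust it for MTP speculative decoding, then pin it to 512
    # (prefill-only) or cap it at 512 (general case).
    max_shape = max_num_seqs
    if mtp:
        # Each request carries the base token plus its speculative tokens.
        max_shape = max_num_seqs * (num_speculative_tokens + 1)
        assert max_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios."
    if cudagraph_only_prefill:
        max_shape = 512
    else:
        max_shape = min(512, max_shape)
    return max_shape

# Worked examples (illustrative values):
assert compute_max_capture_size(64) == 64
assert compute_max_capture_size(64, mtp=True, num_speculative_tokens=1) == 128
assert compute_max_capture_size(300, mtp=True, num_speculative_tokens=1) == 512  # capped

Note the behavioral change: the old code silently rounded an odd MTP token count up to the next even number, while the new code asserts evenness instead, so an odd max_num_seqs * (num_speculative_tokens + 1) now fails fast at startup.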
@@ -171,7 +171,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
         for n in range(entry.num_finished_warmup, self.warm_up_size):
             entry.num_finished_warmup += 1
             entry.runnable(**kwargs)
-            logger.debug(
+            logger.info(
                 f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
                 f"finished ({n + 1}/{entry.num_finished_warmup}) times"
             )
@@ -207,7 +207,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:

         # For CUDAGraph debug
         # self._save_cudagrpah_dot_files(entry)
-        logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
+        logger.info(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
 
         # Replay
         entry.cuda_graph.replay()
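
For background, the capture-then-replay pattern this backend wraps looks roughly like the sketch below. It assumes Paddle exposes CUDAGraph with capture_begin()/capture_end()/replay() under paddle.device.cuda.graphs; treat the API names and shapes as assumptions, and note a CUDA-enabled Paddle build is required.

# Minimal sketch, assuming paddle.device.cuda.graphs.CUDAGraph provides
# capture_begin()/capture_end()/replay(); requires a CUDA build of Paddle.
import paddle
from paddle.device.cuda.graphs import CUDAGraph

x = paddle.ones([8, 8])  # static input buffer; reused across replays
graph = CUDAGraph()
graph.capture_begin()
y = x * 2.0              # kernels are recorded into the graph, not run eagerly
graph.capture_end()

graph.replay()           # launches the recorded kernels as one unit
print(float(y[0][0]))    # 2.0 after replay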
@@ -224,7 +224,7 @@ def _create_entry_dict(self):
         for shape in self.cudagraph_capture_sizes:
             self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
 
-        logger.debug(
+        logger.info(
             f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
             "Created all real shape entry."
         )
@@ -254,3 +254,9 @@ def _save_cudagrpah_dot_files(self, entry):
f"{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}",
1 << 0,
)

def check_capture_successful(self):
"""Check whether the shapes are captured or not"""
for shape, entry in self.concrete_size_entries.items():
if not entry.captured:
raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.")
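
A self-contained sketch of the bookkeeping behind the new check. ConcreteSizeEntry here is a minimal stand-in mirroring the field used above, and BackendSketch is a hypothetical stand-in for the backend class, whose full definition this diff does not show.

from dataclasses import dataclass
from typing import Dict

@dataclass
class ConcreteSizeEntry:
    real_shape: int
    captured: bool = False  # flipped to True once a graph is captured for this shape

class BackendSketch:
    """Illustrative stand-in for the backend that owns concrete_size_entries."""

    def __init__(self, capture_sizes):
        self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {
            s: ConcreteSizeEntry(real_shape=s) for s in capture_sizes
        }

    def check_capture_successful(self):
        """Raise if any configured shape was never captured."""
        for shape, entry in self.concrete_size_entries.items():
            if not entry.captured:
                raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.")

backend = BackendSketch([1, 2, 4])
backend.concrete_size_entries[1].captured = True
try:
    backend.check_capture_successful()  # shapes 2 and 4 were never captured
except ValueError as e:
    print(e)

Calling this once after the capture phase turns a silently missing graph (which would otherwise fall back to eager execution or fail later at replay) into an immediate, attributable error.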