Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 16 additions & 15 deletions fastdeploy/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ def __init__(
only to the layer where CUDA graph functionality is required.
"""
self.cudagraph_splitting_ops: list[str] = []
self.cudagraph_only_prefill: bool = False
""" Whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
Expand All @@ -601,13 +602,13 @@ def __init__(

self.check_legality_parameters()

def init_with_cudagrpah_size(self, max_num_seqs: int = 0) -> None:
def init_with_cudagrpah_size(self, max_capture_size: int = 0) -> None:
"""
Initialize cuda graph capture sizes and
pre-compute the mapping from batch size to padded graph size
"""
# Regular capture sizes
self.cudagraph_capture_sizes = [size for size in self.cudagraph_capture_sizes if size <= max_num_seqs]
self.cudagraph_capture_sizes = [size for size in self.cudagraph_capture_sizes if size <= max_capture_size]
dedup_sizes = list(set(self.cudagraph_capture_sizes))
if len(dedup_sizes) < len(self.cudagraph_capture_sizes):
logger.info(
Expand All @@ -631,7 +632,7 @@ def init_with_cudagrpah_size(self, max_num_seqs: int = 0) -> None:
self.real_shape_to_captured_size[bs] = end
self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size

def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
def _set_cudagraph_sizes(self, max_capture_size: int = 0):
"""
Calculate a series of candidate capture sizes,
and then extract a portion of them as the capture list for the CUDA graph based on user input.
Expand All @@ -643,7 +644,7 @@ def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
# Shape [256, 288, ... 992, 1024]
draft_capture_sizes += [32 * i for i in range(17, 33)]

draft_capture_sizes.append(max_num_seqs)
draft_capture_sizes.append(max_capture_size)
self.cudagraph_capture_sizes = sorted(draft_capture_sizes)

def to_json_string(self):
Expand Down Expand Up @@ -1148,20 +1149,20 @@ def __init__(
self.cache_config: CacheConfig = cache_config # type: ignore
self.moba_attention_config: Optional[MobaAttentionConfig] = moba_attention_config
# Initialize cuda graph capture list
if self.graph_opt_config.cudagraph_capture_sizes is None:
self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs)

max_capture_shape = self.parallel_config.max_num_seqs
if self.speculative_config is not None and self.speculative_config.method == "mtp":
max_shape = self.parallel_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1)
if max_shape % 2 == 1:
max_shape = max_shape + 1
self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=min(512, max_shape))
max_capture_shape = self.parallel_config.max_num_seqs * (
self.speculative_config.num_speculative_tokens + 1
)
assert max_capture_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios."
if self.graph_opt_config.cudagraph_only_prefill:
max_capture_shape = 512
else:
self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=self.parallel_config.max_num_seqs)
max_capture_shape = min(512, max_capture_shape)

# TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
if self.graph_opt_config.graph_opt_level == 2:
self.graph_opt_config.graph_opt_level = 1
if self.graph_opt_config.cudagraph_capture_sizes is None:
self.graph_opt_config._set_cudagraph_sizes(max_capture_size=max_capture_shape)
self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_capture_shape)

self.tokenizer = tokenizer
self.max_num_batched_tokens = max_num_batched_tokens
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def __call__(self, **kwargs):
for n in range(entry.num_finished_warmup, self.warm_up_size):
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
logger.debug(
logger.info(
f"[CUDA GRAPH] [ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
)
Expand Down Expand Up @@ -199,7 +199,7 @@ def __call__(self, **kwargs):

# For CUDAGraph debug
# self._save_cudagrpah_dot_files(entry)
logger.debug(f"[CUDA GRAPH] [ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
logger.info(f"[CUDA GRAPH] [ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")

# Replay
entry.cuda_graph.replay()
Expand Down Expand Up @@ -243,3 +243,9 @@ def _save_cudagrpah_dot_files(self, entry):
f"./{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}_time{time.perf_counter()}",
1 << 0,
)

def check_capture_successful(self):
    """Verify that every registered concrete shape has a captured CUDA graph.

    Scans ``self.concrete_size_entries`` (mapping shape -> entry) and raises
    ``ValueError`` for the first entry whose ``captured`` flag is not set.
    Returns ``None`` when all shapes were captured successfully.
    """
    failed_shapes = [shape for shape, entry in self.concrete_size_entries.items() if not entry.captured]
    if failed_shapes:
        shape = failed_shapes[0]
        raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.")
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

from paddle.jit import sot
from paddle.jit.dy2static.utils import Backend as ToStaticBackend
from paddleformers.utils.log import logger
from typing_extensions import ParamSpec

from fastdeploy.config import FDConfig
Expand All @@ -35,6 +34,9 @@
from fastdeploy.model_executor.graph_optimization.utils import (
in_sot_warmup_mode as in_warmup_mode,
)
from fastdeploy.utils import get_logger

logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")

P = ParamSpec("P")
T = TypeVar("T")
Expand Down Expand Up @@ -116,6 +118,9 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
self.fd_config = fd_config

self.max_captre_size = fd_config.graph_opt_config.cudagraph_capture_sizes[0]
self._debug_count_cudagraph_replay = 0
self._debug_count_total_step = 0

if self.fd_config.graph_opt_config.graph_opt_level > 0:
# 1. Prepare cuda grpah input buffers (contain output of subgraphs)

Expand All @@ -130,6 +135,7 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
).__get__(self.runnable.__self__)

def __call__(self, **kwargs):
self._debug_count_total_step += 1
if not self.fd_config.graph_opt_config.use_cudagraph:
return self.runnable(**kwargs)
if self.cudagraph_piecewise_backend is None:
Expand All @@ -143,6 +149,10 @@ def __call__(self, **kwargs):
if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.max_captre_size):
return self.runnable(**kwargs)
else:
self._debug_count_cudagraph_replay += 1
logger.debug(
f"[CUDA GRAPH][ID:{id(self.cudagraph_piecewise_backend)}] Total step count: {self._debug_count_total_step}, CUDAGraph replay count: {self._debug_count_cudagraph_replay}"
)
return self.cudagraph_piecewise_backend.__call__(**kwargs)

def clear_cudagraph_piecewise_backend(self):
Expand Down
4 changes: 2 additions & 2 deletions tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,8 +156,8 @@ def test_cuda_graph_subgraph(self):
parallel_config.max_num_seqs = 8
cache_config = CacheConfig({})
# Initialize cuda graph capture list
graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
graph_opt_config.init_with_cudagrpah_size(max_num_seqs=parallel_config.max_num_seqs)
graph_opt_config._set_cudagraph_sizes(max_capture_size=parallel_config.max_num_seqs)
graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
fd_config = FDConfig(
graph_opt_config=graph_opt_config,
parallel_config=parallel_config,
Expand Down
4 changes: 2 additions & 2 deletions tests/graph_optimization/test_cuda_graph_spec_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ def test_cuda_graph_spec_decode(self):
parallel_config.max_num_seqs = 1
cache_config = CacheConfig({})
# Initialize cuda graph capture list
graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
graph_opt_config.init_with_cudagrpah_size(max_num_seqs=parallel_config.max_num_seqs)
graph_opt_config._set_cudagraph_sizes(max_capture_size=parallel_config.max_num_seqs)
graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)
fd_config = FDConfig(
graph_opt_config=graph_opt_config,
parallel_config=parallel_config,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,9 @@ def test(self):
# Set FastDeploy config
graph_opt_config = GraphOptimizationConfig({"use_cudagraph": True, "graph_opt_level": 1})
parallel_config = ParallelConfig({"max_num_seqs": 1})
graph_opt_config._set_cudagraph_sizes(max_num_seqs=parallel_config.max_num_seqs)
graph_opt_config.init_with_cudagrpah_size(max_num_seqs=parallel_config.max_num_seqs)
graph_opt_config._set_cudagraph_sizes(max_capture_size=parallel_config.max_num_seqs)
graph_opt_config.init_with_cudagrpah_size(max_capture_size=parallel_config.max_num_seqs)

cache_config = CacheConfig({})

fd_config = FDConfig(
Expand Down
Loading