3434from fastdeploy .model_executor .graph_optimization .utils import (
3535 in_sot_warmup_mode as in_warmup_mode ,
3636)
37+ from fastdeploy .utils import get_logger
38+
# Module-level logger for the CUDA graph piecewise backend wrapper.
# The logger name now matches the spelling used for the log file
# ("cudagraph", previously misspelled "cudagrpah"), so name-based lookup
# and filtering work as expected.
logger = get_logger("cudagraph_piecewise_backend", "cudagraph_piecewise_backend.log")
40+
3741
3842P = ParamSpec ("P" )
3943T = TypeVar ("T" )
@@ -105,6 +109,9 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
105109 self .dy_runnable = self .runnable
106110 self .fd_config = fd_config
107111 self .max_captre_size = fd_config .graph_opt_config .cudagraph_capture_sizes [0 ]
112+ self ._debug_count_cudagraph_replay = 0
113+ self ._debug_count_total_step = 0
114+
108115 if self .fd_config .graph_opt_config .graph_opt_level > 0 :
109116 # 1. Prepare cuda graph input buffers (contain output of subgraphs)
110117
@@ -123,6 +130,7 @@ def __init__(self, runnable: Callable, fd_config: FDConfig):
123130 )
124131
125132 def __call__ (self , ** kwargs ):
133+ self ._debug_count_total_step += 1
126134 if not self .fd_config .graph_opt_config .use_cudagraph :
127135 return self .runnable (** kwargs )
128136 if self .cudagraph_piecewise_backend is None :
@@ -136,6 +144,10 @@ def __call__(self, **kwargs):
136144 if (not kwargs ["forward_meta" ].step_use_cudagraph ) or (real_shape > self .cudagraph_switch_threshold ):
137145 return self .dy_runnable (** kwargs )
138146 else :
147+ self ._debug_count_cudagraph_replay += 1
148+ logger .debug (
149+ f"[CUDA GRAPH][ID:{ id (self .cudagraph_piecewise_backend )} ] Total step count: { self ._debug_count_total_step } , CUDAGraph replay count: { self ._debug_count_cudagraph_replay } "
150+ )
139151 return self .cudagraph_piecewise_backend .__call__ (** kwargs )
140152
141153 def clear_cudagraph_piecewise_backend (self ):
0 commit comments