fix bug and refine code

gongshaotian · gongshaotian · commit ebd08421bbe9 · 2025-10-28T14:31:08.000+08:00
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
@@ -1391,19 +1391,20 @@ def __init__(
         self.cache_config: CacheConfig = cache_config  # type: ignore
         self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config
         self.structured_outputs_config: StructuredOutputsConfig = structured_outputs_config
-        # Initialize cuda graph capture list
-        if self.graph_opt_config.cudagraph_capture_sizes is None:
-            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.scheduler_config.max_num_seqs)
 
-        if self.graph_opt_config.cudagraph_only_prefill:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=512)
-        elif self.speculative_config is not None and self.speculative_config.method == "mtp":
+        # Initialize cuda graph capture list
+        max_shape = self.scheduler_config.max_num_seqs
+        if self.speculative_config is not None and self.speculative_config.method == "mtp":
             max_shape = self.scheduler_config.max_num_seqs * (self.speculative_config.num_speculative_tokens + 1)
-            if max_shape % 2 == 1:
-                max_shape = max_shape + 1
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=min(512, max_shape))
+            assert max_shape % 2 == 0, "CUDAGraph only supports capturing even token nums in MTP scenarios."
+        if self.graph_opt_config.cudagraph_only_prefill:
+            max_shape = 512
         else:
-            self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=self.scheduler_config.max_num_seqs)
+            max_shape = min(512, max_shape)
+
+        if self.graph_opt_config.cudagraph_capture_sizes is None:
+            self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=max_shape)
+        self.graph_opt_config.init_with_cudagrpah_size(max_capture_size=max_shape)
 
         self.tokenizer = tokenizer
         self.ips = ips
diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -171,7 +171,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
             for n in range(entry.num_finished_warmup, self.warm_up_size):
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
-                logger.debug(
+                logger.info(
                     f"[CUDA GRAPH][ID:{id(self)}] Warm up for real shape {padding_real_shape}, "
                     f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
@@ -207,7 +207,7 @@ def __call__(self, **kwargs) -> List[paddle.Tensor] | paddle.Tensor:
 
             # For CUDAGraph debug
             # self._save_cudagrpah_dot_files(entry)
-            logger.debug(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
+            logger.info(f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph captured for real shape {padding_real_shape}")
 
         # Replay
         entry.cuda_graph.replay()
@@ -224,7 +224,7 @@ def _create_entry_dict(self):
         for shape in self.cudagraph_capture_sizes:
             self.concrete_size_entries[shape] = ConcreteSizeEntry(real_shape=shape)
 
-        logger.debug(
+        logger.info(
             f"[CUDA GRAPH][ID:{id(self)}] CUDAGraph capture list {self.cudagraph_capture_sizes}, "
             "Created all real shape entry."
         )
@@ -254,3 +254,9 @@ def _save_cudagrpah_dot_files(self, entry):
                 f"{log_dir}/GraphDotFiles/backend{id(self)}_shape{entry.real_shape}",
                 1 << 0,
             )
+
+    def check_capture_successful(self):
+        """Raise ValueError for the first shape in concrete_size_entries whose entry is not captured."""
+        for shape, entry in self.concrete_size_entries.items():
+            if not entry.captured:
+                raise ValueError(f"[CUDA GRAPH][ID:{id(self)}] Shape {shape} capture failed.")