
Commit 15a0df8

Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy into suport_spoling

2 parents a6a9483 + fc5cd1a

File tree: 5 files changed (+15 -7 lines)

fastdeploy/engine/engine.py

Lines changed: 10 additions & 3 deletions
@@ -503,7 +503,6 @@ def _start_worker_service(self):
             f" --tensor_parallel_size {self.cfg.parallel_config.tensor_parallel_size}"
             f" --engine_worker_queue_port {ports}"
             f" --pod_ip {self.cfg.master_ip}"
-            f" --total_block_num {self.cfg.cache_config.total_block_num}"
             f" --block_size {self.cfg.cache_config.block_size}"
             f" --enc_dec_block_num {self.cfg.cache_config.enc_dec_block_num}"
             f" --eos_tokens_lens {self.engine.data_processor.eos_token_id_len}"
@@ -538,7 +537,7 @@ def _start_worker_service(self):
         if self.cfg.structured_outputs_config.logits_processors is not None:
             arguments += f" --logits-processors {' '.join(self.cfg.structured_outputs_config.logits_processors)}"

-        worker_append_flag = {
+        worker_store_true_flag = {
             "enable_expert_parallel": self.cfg.parallel_config.enable_expert_parallel,
             "enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching,
             "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
@@ -549,9 +548,17 @@ def _start_worker_service(self):
             "enable_logprob": self.cfg.model_config.enable_logprob,
             "lm_head_fp32": self.cfg.model_config.lm_head_fp32,
         }
-        for worker_flag, value in worker_append_flag.items():
+        for worker_flag, value in worker_store_true_flag.items():
             if value:
                 arguments = arguments + f" --{worker_flag}"
+
+        worker_default_none_flag = {
+            "num_gpu_blocks_override": self.cfg.cache_config.num_gpu_blocks_override,
+        }
+        for worker_flag, value in worker_default_none_flag.items():
+            if value:
+                arguments = arguments + f" --{worker_flag} {value}"
+
         if self.cfg.nnode > 1:
             pd_cmd = pd_cmd + f" --ips {ips} --nnodes {len(self.cfg.ips)}"
         pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log"
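
The hunk above splits worker-flag emission into two groups: store_true flags, appended bare when truthy, and default-None value flags, appended with their value only when explicitly set, so --num_gpu_blocks_override reaches the worker only when the user overrides it. A minimal standalone sketch of that pattern, with the self.cfg plumbing replaced by plain parameters (the function name and parameter list here are illustrative, not part of the diff):

from typing import Optional

def build_worker_arguments(enable_prefix_caching: bool,
                           enable_chunked_prefill: bool,
                           num_gpu_blocks_override: Optional[int]) -> str:
    arguments = ""

    # Boolean options become bare store_true flags: present or absent.
    worker_store_true_flag = {
        "enable_prefix_caching": enable_prefix_caching,
        "enable_chunked_prefill": enable_chunked_prefill,
    }
    for worker_flag, value in worker_store_true_flag.items():
        if value:
            arguments += f" --{worker_flag}"

    # Default-None options are forwarded with a value, and only when set.
    worker_default_none_flag = {
        "num_gpu_blocks_override": num_gpu_blocks_override,
    }
    for worker_flag, value in worker_default_none_flag.items():
        if value:
            arguments += f" --{worker_flag} {value}"

    return arguments

# build_worker_arguments(True, False, 1024)
# -> " --enable_prefix_caching --num_gpu_blocks_override 1024"

Note that the truthiness test means an explicit override of 0 would also be skipped; the pattern relies on 0 not being a meaningful block count.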

fastdeploy/worker/worker_process.py

Lines changed: 2 additions & 1 deletion
@@ -480,7 +480,7 @@ def parse_args():
         help="model dir",
     )
     parser.add_argument("-mbs", "--max_num_seqs", type=int, default=34, help="max batch size")
-    parser.add_argument("--total_block_num", type=int, default=2000)
+    parser.add_argument("--num_gpu_blocks_override", type=int, default=None)
     parser.add_argument("--block_size", type=int, default=64)
     parser.add_argument("--pod_ip", type=str, default="127.0.0.1")
     parser.add_argument("--engine_worker_queue_port", type=str, default="9923")
@@ -715,6 +715,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
     parallel_config = ParallelConfig(vars(args))
     cache_config = CacheConfig(vars(args))
     scheduler_config = SchedulerConfig(vars(args))
+
     parallel_config.tensor_parallel_rank = local_rank % parallel_config.tensor_parallel_size
     parallel_config.data_parallel_rank = local_rank // parallel_config.tensor_parallel_size
     # config for EP
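
On the worker side, the hard-coded --total_block_num default of 2000 gives way to --num_gpu_blocks_override defaulting to None, which lets the worker tell "no override requested" apart from an explicit value. A self-contained sketch of just that behavior, assuming only the two arguments visible in the hunk (the real parser defines many more):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_gpu_blocks_override", type=int, default=None)
parser.add_argument("--block_size", type=int, default=64)

# Launcher passed the flag: the worker uses the overridden block count.
args = parser.parse_args(["--num_gpu_blocks_override", "1024"])
assert args.num_gpu_blocks_override == 1024

# Launcher omitted the flag: the worker sees None and can fall back to
# determining the block count itself (e.g. from available GPU memory).
args = parser.parse_args([])
assert args.num_gpu_blocks_override is None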

tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ def test_cuda_graph_subgraph(self):
         model_config = Mock()
         model_config.max_model_len = 512
         # Initialize cuda graph capture list
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,

tests/graph_optimization/test_cuda_graph_spec_decode.py

Lines changed: 1 addition & 1 deletion
@@ -108,7 +108,7 @@ def test_cuda_graph_spec_decode(self):
         model_config = Mock()
         model_config.max_model_len = 512
         # Initialize cuda graph capture list
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         fd_config = FDConfig(
             graph_opt_config=graph_opt_config,

tests/graph_optimization/test_static_graph_cuda_graph_split.py

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ def test(self):
         # Set FastDeploy config
         graph_opt_config = GraphOptimizationConfig({"use_cudagraph": True, "graph_opt_level": 1})
         scheduler_config = SchedulerConfig({"max_num_seqs": 1})
-        graph_opt_config._set_cudagraph_sizes(max_num_seqs=scheduler_config.max_num_seqs)
+        graph_opt_config._set_cudagraph_sizes(max_capture_size=scheduler_config.max_num_seqs)
         graph_opt_config.init_with_cudagrpah_size(max_capture_size=scheduler_config.max_num_seqs)
         cache_config = CacheConfig({})
         parallel_config = ParallelConfig(args={})
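
All three tests change only the keyword passed to _set_cudagraph_sizes, from max_num_seqs to max_capture_size, aligning it with the keyword init_with_cudagrpah_size already takes. A hypothetical stand-in for the renamed parameter (the sketch class and its capture-size logic are illustrative, not the real GraphOptimizationConfig):

class GraphOptConfigSketch:
    def _set_cudagraph_sizes(self, max_capture_size: int) -> None:
        # Keep only capture sizes that fit under the requested maximum.
        self.cudagraph_capture_sizes = [
            s for s in (1, 2, 4, 8, 16, 32) if s <= max_capture_size
        ]

cfg = GraphOptConfigSketch()
cfg._set_cudagraph_sizes(max_capture_size=1)  # new keyword: accepted
assert cfg.cudagraph_capture_sizes == [1]

try:
    cfg._set_cudagraph_sizes(max_num_seqs=1)  # old keyword: TypeError
except TypeError:
    pass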
