From 7716866f6df8ba4dbe5e55ceb18ff24607fe5d13 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 22 Sep 2025 20:33:37 +0800 Subject: [PATCH 01/20] support qwen3-embedding --- .../model_executor/layers/embeddings.py | 178 +++++++++++++++++- fastdeploy/model_executor/layers/lm_head.py | 9 + fastdeploy/model_executor/layers/utils.py | 19 ++ tests/pooling/test_embedding.py | 157 +++++++++------ 4 files changed, 300 insertions(+), 63 deletions(-) diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index 43fbd76a848..7b97c53e5b1 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -14,6 +14,7 @@ # limitations under the License. """ +from dataclasses import dataclass from typing import Dict import numpy as np @@ -22,9 +23,73 @@ from paddle.distributed import fleet from fastdeploy.config import FDConfig -from fastdeploy.model_executor.utils import set_weight_attrs +from fastdeploy.model_executor.utils import set_weight_attrs, slice_fn -from .utils import get_tensor +from .utils import ( + DEFAULT_VOCAB_PADDING_SIZE, + get_tensor, + pad_vocab_size, + vocab_range_from_global_vocab_size, +) + + +@dataclass +class VocabParallelEmbeddingShardIndices: + """Indices for a shard of a vocab parallel embedding.""" + + padded_org_vocab_start_index: int + padded_org_vocab_end_index: int + padded_added_vocab_start_index: int + padded_added_vocab_end_index: int + + org_vocab_start_index: int + org_vocab_end_index: int + added_vocab_start_index: int + added_vocab_end_index: int + + @property + def num_org_elements(self) -> int: + return self.org_vocab_end_index - self.org_vocab_start_index + + @property + def num_added_elements(self) -> int: + return self.added_vocab_end_index - self.added_vocab_start_index + + @property + def num_org_elements_padded(self) -> int: + return self.padded_org_vocab_end_index - self.padded_org_vocab_start_index + + @property + def num_added_elements_padded(self) -> int: + return self.padded_added_vocab_end_index - self.padded_added_vocab_start_index + + @property + def num_org_vocab_padding(self) -> int: + return self.num_org_elements_padded - self.num_org_elements + + @property + def num_added_vocab_padding(self) -> int: + return self.num_added_elements_padded - self.num_added_elements + + @property + def num_elements_padded(self) -> int: + return self.num_org_elements_padded + self.num_added_elements_padded + + def __post_init__(self): + # sanity checks + assert self.padded_org_vocab_start_index <= self.padded_org_vocab_end_index + assert self.padded_added_vocab_start_index <= self.padded_added_vocab_end_index + + assert self.org_vocab_start_index <= self.org_vocab_end_index + assert self.added_vocab_start_index <= self.added_vocab_end_index + + assert self.org_vocab_start_index <= self.padded_org_vocab_start_index + assert self.added_vocab_start_index <= self.padded_added_vocab_start_index + assert self.org_vocab_end_index <= self.padded_org_vocab_end_index + assert self.added_vocab_end_index <= self.padded_added_vocab_end_index + + assert self.num_org_elements <= self.num_org_elements_padded + assert self.num_added_elements <= self.num_added_elements_padded class VocabParallelEmbedding(nn.Layer): @@ -39,6 +104,7 @@ def __init__( embedding_dim: int = 768, params_dtype: str = "bfloat16", prefix="", + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, ) -> None: """ Initialize the VocabParallelEmbedding layer for the model. 
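For reference, a minimal standalone sketch (illustrative only, not part of this diff) of the padding and shard-range math that the new VocabParallelEmbeddingShardIndices relies on. The helpers below mirror the pad_vocab_size() and vocab_range_from_global_vocab_size() functions added to layers/utils.py later in this patch; the concrete numbers are hypothetical:

def pad_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
    # Round the vocab size up to the next multiple of pad_to (default 64),
    # so the padded vocab divides evenly across tensor-parallel ranks.
    return ((vocab_size + pad_to - 1) // pad_to) * pad_to

def vocab_range_for_rank(padded_vocab_size: int, rank: int, world_size: int):
    # Contiguous [start, end) slice of the padded vocab owned by `rank`.
    per_partition = padded_vocab_size // world_size
    start = rank * per_partition
    return start, start + per_partition

padded = pad_vocab_size(1000)              # 1000 -> 1024
print(vocab_range_for_rank(padded, 0, 2))  # (0, 512)
print(vocab_range_for_rank(padded, 1, 2))  # (512, 1024)

Entries beyond the original vocab size (indices 1000..1023 in this hypothetical case) are the padding rows that weight_loader() below zero-fills on each rank after copying its shard.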
@@ -65,10 +131,32 @@ def __init__( self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings self.params_dtype: str = params_dtype + self.padding_size = padding_size + + self.org_vocab_size = num_embeddings + self.num_embeddings = num_embeddings + num_added_embeddings = num_embeddings - self.org_vocab_size + + self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size, self.padding_size) + self.num_embeddings_padded = pad_vocab_size( + self.org_vocab_size_padded + num_added_embeddings, self.padding_size + ) + assert self.org_vocab_size_padded <= self.num_embeddings_padded + self.shard_indices = self._get_indices( + self.num_embeddings_padded, + self.org_vocab_size_padded, + self.num_embeddings, + self.org_vocab_size, + self.tensor_parallel_rank, + self.world_size, + ) + + if num_embeddings % self.world_size != 0: + self.num_embeddings_padded = pad_vocab_size(num_embeddings, self.padding_size) if not self.column_cut: self.embeddings = fleet.meta_parallel.VocabParallelEmbedding( - num_embeddings, + self.num_embeddings_padded, embedding_dim, mp_group=self.tp_group, weight_attr=paddle.ParamAttr( @@ -76,7 +164,7 @@ def __init__( ), ) if self.world_size > 1: - set_weight_attrs(self.embeddings.weight, {"output_dim": False}) + set_weight_attrs(self.embeddings.weight, {"output_dim": False, "weight_loader": self.weight_loader}) else: # column cut embedding self.embeddings = nn.Embedding( @@ -106,6 +194,88 @@ def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): self.embeddings.weight.set_value(weight_tensor) + @classmethod + def _get_indices( + cls, + vocab_size_paded: int, + org_vocab_size_padded: int, + vocab_size: int, + org_vocab_size: int, + tp_rank: int, + tp_size: int, + ) -> VocabParallelEmbeddingShardIndices: + """Get start and end indices for vocab parallel embedding, following the + layout outlined in the class docstring, based on the given tp_rank and + tp_size.""" + + num_added_embeddings_padded = vocab_size_paded - org_vocab_size_padded + padded_org_vocab_start_index, padded_org_vocab_end_index = vocab_range_from_global_vocab_size( + org_vocab_size_padded, tp_rank, tp_size + ) + + padded_added_vocab_start_index, padded_added_vocab_end_index = vocab_range_from_global_vocab_size( + num_added_embeddings_padded, tp_rank, tp_size, offset=org_vocab_size + ) + # remove padding + org_vocab_start_index = min(padded_org_vocab_start_index, org_vocab_size) + org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size) + added_vocab_start_index = min(padded_added_vocab_start_index, vocab_size) + added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size) + return VocabParallelEmbeddingShardIndices( + padded_org_vocab_start_index, + padded_org_vocab_end_index, + padded_added_vocab_start_index, + padded_added_vocab_end_index, + org_vocab_start_index, + org_vocab_end_index, + added_vocab_start_index, + added_vocab_end_index, + ) + + def weight_loader(self, param, loaded_weight, shard_id=None): + output_dim = getattr(param, "output_dim", None) + packed_dim = getattr(param, "packed_dim", None) + + loaded_weight = get_tensor(loaded_weight) + if param.dtype != loaded_weight.dtype: + if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn: + loaded_weight = loaded_weight.cast(param.dtype) + else: + loaded_weight = loaded_weight.cast(param.dtype) + + if output_dim is None: + assert ( + param.shape == loaded_weight.shape + ), f"Shape 
mismatch: param {param.shape} vs loaded_weight {loaded_weight.shape}" + param.set_value(loaded_weight) + return + + start_idx = self.shard_indices.org_vocab_start_index + end_idx = self.shard_indices.org_vocab_end_index + shard_size = self.shard_indices.org_vocab_end_index - start_idx + + # If param packed on the same dim we are sharding on, then + # need to adjust offsets of loaded weight by pack_factor. + if packed_dim is not None and packed_dim == output_dim: + packed_factor = getattr(param, "packed_factor", getattr(param, "pack_factor", 1)) + assert loaded_weight.shape[output_dim] == (self.org_vocab_size // packed_factor) + start_idx = start_idx // packed_factor + shard_size = shard_size // packed_factor + else: + assert loaded_weight.shape[output_dim] == self.org_vocab_size, ( + f"Loaded weight dim {output_dim} size {loaded_weight.shape[output_dim]} " + f"!= org_vocab_size {self.org_vocab_size}" + ) + + shard_weight = slice_fn(loaded_weight, output_dim, start_idx, end_idx) + + if output_dim == 0: + param[: shard_weight.shape[0]].copy_(shard_weight, False) + param[shard_weight.shape[0] :].fill_(0) + else: + param[:, : shard_weight.shape[1]].copy_(shard_weight, False) + param[:, shard_weight.shape[1] :].fill_(0) + def forward(self, ids_remove_padding=None) -> paddle.Tensor: """ Defines the forward computation of the layer. diff --git a/fastdeploy/model_executor/layers/lm_head.py b/fastdeploy/model_executor/layers/lm_head.py index 57131b00a27..ff1bdaa9217 100644 --- a/fastdeploy/model_executor/layers/lm_head.py +++ b/fastdeploy/model_executor/layers/lm_head.py @@ -22,6 +22,10 @@ from paddle.distributed import fleet from fastdeploy.config import FDConfig +from fastdeploy.model_executor.layers.utils import ( + DEFAULT_VOCAB_PADDING_SIZE, + pad_vocab_size, +) from fastdeploy.model_executor.utils import ( default_weight_loader, set_weight_attrs, @@ -44,6 +48,7 @@ def __init__( prefix: str = "", with_bias: bool = False, dtype: str = None, + padding_size: int = DEFAULT_VOCAB_PADDING_SIZE, ) -> None: """ Parallelized LMhead. @@ -68,6 +73,10 @@ def __init__( self.column_cut = True self.nranks = fd_config.parallel_config.tensor_parallel_size self.fd_config = fd_config + self.padding_size = padding_size + + if num_embeddings % self.nranks != 0: + num_embeddings = pad_vocab_size(num_embeddings, self.padding_size) ColumnParallelLinear = fleet.meta_parallel.ColumnParallelLinear RowParallelLinear = fleet.meta_parallel.RowParallelLinear diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index 85de8ec4c14..27bc770e88d 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -45,6 +45,14 @@ c8_state_dict = paddle.load(cache_params, return_numpy=True) +DEFAULT_VOCAB_PADDING_SIZE = 64 + + +def pad_vocab_size(vocab_size: int, pad_to: int = DEFAULT_VOCAB_PADDING_SIZE) -> int: + """Pad the vocab size to the given value.""" + return ((vocab_size + pad_to - 1) // pad_to) * pad_to + + def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]: """ Only used in deep_gemm block wise quant weight. @@ -372,3 +380,14 @@ def create_empty_tensor(shape: Tuple[int, ...], dtype: Union[paddle.dtype, str]) paddle.Tensor: An empty tensor with the specified shape and data type. 
""" return paddle.empty(list(shape), dtype=dtype) + + +def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int, rank: int, offset: int = 0): + index_f = rank * per_partition_vocab_size + index_l = index_f + per_partition_vocab_size + return index_f + offset, index_l + offset + + +def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int, offset: int = 0): + per_partition_vocab_size = divide(global_vocab_size, world_size) + return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank, offset=offset) diff --git a/tests/pooling/test_embedding.py b/tests/pooling/test_embedding.py index d609726e235..a548494dcad 100644 --- a/tests/pooling/test_embedding.py +++ b/tests/pooling/test_embedding.py @@ -27,7 +27,9 @@ ModelConfig, ParallelConfig, ) +from fastdeploy.model_executor.models.adapters import as_embedding_model from fastdeploy.model_executor.models.model_base import ModelRegistry +from fastdeploy.scheduler import SchedulerConfig current_dir = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(current_dir, "..")) @@ -36,58 +38,103 @@ from tests.model_loader.utils import get_torch_model_path +test_model_configs = { + "Qwen3-0.6B": { + "tensor_parallel_size": 2, + "max_model_len": 8192, + "baseline_suffix": "standard", + }, + "Qwen3-Embedding-0.6B": { + "tensor_parallel_size": 2, + "max_model_len": 8192, + "baseline_suffix": "embedding", + }, +} + class TestModelLoader: @pytest.fixture(scope="session", autouse=True) def setup_paddle(self): if not paddle.is_compiled_with_cuda(): - print("CUDA not available, using CPU") - paddle.set_device("cpu") - else: - print("Using CUDA device") - paddle.set_device("gpu") + raise AssertionError("CUDA not available") + paddle.set_device("gpu") yield - @pytest.fixture(scope="session") - def model_path(self): + @pytest.fixture(scope="session", params=list(test_model_configs.keys())) + def model_info(self, request): + model_name = request.param try: - torch_model_path = get_torch_model_path("Qwen3-0.6B") - if os.path.exists(torch_model_path): - return torch_model_path + torch_model_path = get_torch_model_path(model_name) + if not os.path.exists(torch_model_path): + raise AssertionError(f"Model path does not exist: {torch_model_path}") + return {"name": model_name, "path": torch_model_path, "config": test_model_configs[model_name]} except Exception as e: - print(f"Could not get torch model path: {e}") + raise AssertionError(f"Could not get torch model path for {model_name}: {e}") @pytest.fixture - def model_config(self, model_path): + def model_config(self, model_info): + if model_info is None: + raise AssertionError("model_info is None") + model_args = { - "model": model_path, + "model": model_info["path"], "dtype": "bfloat16", - "max_model_len": 8192, - "tensor_parallel_size": 1, + "max_model_len": model_info["config"]["max_model_len"], + "tensor_parallel_size": model_info["config"]["tensor_parallel_size"], "runner": "auto", "convert": "auto", } try: - return ModelConfig(model_args) + config = ModelConfig(model_args) + return config + except Exception as e: + raise AssertionError(f"Could not create ModelConfig: {e}") + + @pytest.fixture + def scheduler_config(self): + scheduler_args = { + "name": "local", + "max_num_seqs": 256, + "max_num_batched_tokens": 8192, + "splitwise_role": "mixed", + "max_size": -1, + "ttl": 900, + "max_model_len": 8192, + "enable_chunked_prefill": False, + "max_num_partial_prefills": 1, + "max_long_partial_prefills": 1, + 
"long_prefill_token_threshold": 0, + } + + try: + config = SchedulerConfig(scheduler_args) + return config except Exception as e: - print(f"Could not create ModelConfig: {e}") + raise AssertionError(f"Could not create SchedulerConfig: {e}") @pytest.fixture - def fd_config(self, model_config): + def fd_config(self, model_info, model_config, scheduler_config): + if model_config is None: + raise AssertionError("ModelConfig is None") + if scheduler_config is None: + raise AssertionError("SchedulerConfig is None") + try: + tensor_parallel_size = model_info["config"]["tensor_parallel_size"] + cache_args = { "block_size": 64, "gpu_memory_utilization": 0.9, "cache_dtype": "bfloat16", "model_cfg": model_config, - "tensor_parallel_size": 1, + "tensor_parallel_size": tensor_parallel_size, } cache_config = CacheConfig(cache_args) parallel_args = { - "tensor_parallel_size": 1, + "tensor_parallel_size": tensor_parallel_size, "data_parallel_size": 1, } parallel_config = ParallelConfig(parallel_args) @@ -95,88 +142,80 @@ def fd_config(self, model_config): load_args = {} load_config = LoadConfig(load_args) - graph_opt_args = { - "enable_cudagraph": False, - "cudagraph_capture_sizes": None, - } + graph_opt_args = {} graph_opt_config = GraphOptimizationConfig(graph_opt_args) - return FDConfig( + fd_config = FDConfig( model_config=model_config, cache_config=cache_config, parallel_config=parallel_config, + scheduler_config=scheduler_config, load_config=load_config, graph_opt_config=graph_opt_config, test_mode=True, ) + return fd_config + except Exception as e: - print(f"Could not create FDConfig: {e}") + raise AssertionError(f"Could not create FDConfig: {e}") @pytest.fixture - def model_json_config(self, model_path): - config_path = os.path.join(model_path, "config.json") - if os.path.exists(config_path): - with open(config_path, "r", encoding="utf-8") as f: - return json.load(f) - return None + def model_json_config(self, model_info): + if model_info is None: + raise AssertionError("model_info is None") - def test_embedding_with_none_convert_type(self, fd_config, model_json_config): - if model_json_config is None: - pytest.skip("Model config not available") + config_path = os.path.join(model_info["path"], "config.json") + if not os.path.exists(config_path): + raise AssertionError(f"Config file does not exist: {config_path}") - if fd_config is None: - pytest.skip("FDConfig not available") + with open(config_path, "r", encoding="utf-8") as f: + return json.load(f) - print("=" * 60) - print("Testing initialize_model with convert_type='none'") - print("=" * 60) + def test_embedding_with_none_convert_type(self, model_info, fd_config, model_json_config): + if any(x is None for x in [model_info, fd_config, model_json_config]): + raise AssertionError("Required configs not available") architectures = model_json_config.get("architectures", []) if not architectures: - pytest.skip("No architectures found in model config") + raise AssertionError("No architectures found in model config") fd_config.model_config.convert_type = "none" try: - model_cls = ModelRegistry.get_class(architectures) + model_cls = ModelRegistry.get_class(architectures[0]) if hasattr(model_cls, "__name__"): assert ( "ForEmbedding" not in model_cls.__name__ ), f"Standard model should not have 'ForEmbedding' in name, but got: {model_cls.__name__}" - print(f"Confirmed standard model type (no ForEmbedding): {model_cls.__name__}") standard_methods = set(dir(model_cls)) assert "_init_pooler" not in standard_methods, "Standard model should not have 
_init_pooler method" except Exception as e: - print(f"Error in none: {e}") + raise AssertionError(f"Error in none convert type test: {e}") - def test_embedding_with_embed_convert_type(self, fd_config, model_json_config): - if model_json_config is None: - pytest.skip("Model config not available") - - if fd_config is None: - pytest.skip("FDConfig not available") - - print("=" * 60) - print("Testing embedding with convert_type='embed'") - print("=" * 60) + def test_embedding_with_embed_convert_type(self, model_info, fd_config, model_json_config): + if any(x is None for x in [model_info, fd_config, model_json_config]): + raise AssertionError("Required configs not available") architectures = model_json_config.get("architectures", []) if not architectures: - pytest.skip("No architectures found in model config") + raise AssertionError("No architectures found in model config") fd_config.model_config.convert_type = "embed" try: - model_cls = ModelRegistry.get_class(architectures) + model_cls = ModelRegistry.get_class(architectures[0]) + model_cls = as_embedding_model(model_cls) + if hasattr(model_cls, "__name__"): - assert "ForEmbedding" in model_cls.__name__, "Embedding model should have 'ForEmbedding' in name" - print(f"Confirmed embedding model type: {model_cls.__name__}") + assert ( + "ForEmbedding" in model_cls.__name__ + ), f"Embedding model should have 'ForEmbedding' in name, but got: {model_cls.__name__}" embedding_methods = set(dir(model_cls)) assert "_init_pooler" in embedding_methods, "Embedding model should have _init_pooler method" except Exception as e: - print(f"Error in convert embed: {e}") + raise AssertionError(f"Error in embed convert type test: {e}") From 85d14ba33319cb9a85ead2b20fcd8c11fca7b21f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Fri, 24 Oct 2025 21:53:45 +0800 Subject: [PATCH 02/20] support qwen3-embedding-0.6b --- custom_ops/gpu_ops/cpp_extensions.cc | 1442 ++++++++++------- custom_ops/gpu_ops/update_inputs_v1.cu | 277 ++-- fastdeploy/engine/common_engine.py | 36 +- fastdeploy/model_executor/layers/pooler.py | 24 +- .../model_executor/pre_and_post_process.py | 131 +- fastdeploy/output/stream_transfer_data.py | 2 +- fastdeploy/worker/gpu_model_runner.py | 440 +++-- tests/pooling/test_Qwen3-Embedding_serving.py | 308 ++++ 8 files changed, 1720 insertions(+), 940 deletions(-) create mode 100644 tests/pooling/test_Qwen3-Embedding_serving.py diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 45c1882057e..32f13aff46b 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -18,14 +18,14 @@ namespace py = pybind11; // 自定义异常类,用于处理CUDA错误 class CudaError : public std::exception { -public: + public: explicit CudaError(cudaError_t error) : error_(error) {} const char *what() const noexcept override { return cudaGetErrorString(error_); } -private: + private: cudaError_t error_; }; @@ -50,12 +50,16 @@ void cuda_host_free(uintptr_t ptr) { } std::vector AppendAttention( - const paddle::Tensor &qkv, const paddle::Tensor &key_cache, - const paddle::Tensor &value_cache, const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &qkv, + const paddle::Tensor &key_cache, + const paddle::Tensor &value_cache, + const paddle::Tensor &seq_lens_encoder, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &batch_id_per_token, const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &block_tables, const paddle::Tensor &encoder_batch_ids, 
+ const paddle::Tensor &batch_id_per_token, + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &block_tables, + const paddle::Tensor &encoder_batch_ids, const paddle::Tensor &encoder_tile_ids_per_batch, const paddle::Tensor &encoder_num_blocks, const paddle::Tensor &kv_batch_ids, @@ -79,27 +83,38 @@ std::vector AppendAttention( const paddle::optional &out_linear_smooths, const paddle::optional &mask_offset, const paddle::optional &kv_signal_data, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const paddle::optional& sinks, + const paddle::optional &q_norm_weight, + const paddle::optional &k_norm_weight, + const paddle::optional &sinks, const float rms_norm_eps, - const std::string &compute_dtype, const std::string &cache_quant_type_str, - const bool use_neox_rotary_style, const bool rope_3d, - const int max_input_length, const float quant_max_bound, - const float quant_min_bound, const float out_linear_in_scale, - const int encoder_block_shape_q, const int decoder_block_shape_q, - const int max_partition_size, const int encoder_max_partition_size, - const int speculate_max_draft_token_num, const bool causal, + const std::string &compute_dtype, + const std::string &cache_quant_type_str, + const bool use_neox_rotary_style, + const bool rope_3d, + const int max_input_length, + const float quant_max_bound, + const float quant_min_bound, + const float out_linear_in_scale, + const int encoder_block_shape_q, + const int decoder_block_shape_q, + const int max_partition_size, + const int encoder_max_partition_size, + const int speculate_max_draft_token_num, + const bool causal, const bool speculate_decoder, const int sliding_window); std::vector AppendAttentionWithOutput( - const paddle::Tensor &qkv, const paddle::Tensor &key_cache, - const paddle::Tensor &value_cache, const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &qkv, + const paddle::Tensor &key_cache, + const paddle::Tensor &value_cache, + const paddle::Tensor &seq_lens_encoder, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &batch_id_per_token, const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &block_tables, const paddle::Tensor &encoder_batch_ids, + const paddle::Tensor &batch_id_per_token, + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &block_tables, + const paddle::Tensor &encoder_batch_ids, const paddle::Tensor &encoder_tile_ids_per_batch, const paddle::Tensor &encoder_num_blocks, const paddle::Tensor &kv_batch_ids, @@ -124,31 +139,44 @@ std::vector AppendAttentionWithOutput( const paddle::optional &out_linear_smooths, const paddle::optional &mask_offset, const paddle::optional &kv_signal_data, - const paddle::optional& q_norm_weight, - const paddle::optional& k_norm_weight, - const paddle::optional& sinks, + const paddle::optional &q_norm_weight, + const paddle::optional &k_norm_weight, + const paddle::optional &sinks, const float rms_norm_eps, - const std::string &compute_dtype, const std::string &cache_quant_type_str, - const bool use_neox_rotary_style, const bool rope_3d, - const int max_input_length, const float quant_max_bound, - const float quant_min_bound, const float out_linear_in_scale, - const int encoder_block_shape_q, const int decoder_block_shape_q, - const int max_partition_size, const int encoder_max_partition_size, - const int speculate_max_draft_token_num, const bool causal, + const std::string &compute_dtype, + const std::string &cache_quant_type_str, + const bool 
use_neox_rotary_style, + const bool rope_3d, + const int max_input_length, + const float quant_max_bound, + const float quant_min_bound, + const float out_linear_in_scale, + const int encoder_block_shape_q, + const int decoder_block_shape_q, + const int max_partition_size, + const int encoder_max_partition_size, + const int speculate_max_draft_token_num, + const bool causal, const bool speculate_decoder, const int sliding_window); std::vector GQARopeWriteCacheKernel( - const paddle::Tensor &qkv, const paddle::Tensor &key_cache, - const paddle::Tensor &value_cache, const paddle::Tensor &cu_seqlens_q, - const paddle::Tensor &cu_seqlens_k, const paddle::Tensor &rotary_embs, + const paddle::Tensor &qkv, + const paddle::Tensor &key_cache, + const paddle::Tensor &value_cache, + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &cu_seqlens_k, + const paddle::Tensor &rotary_embs, const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &seq_lens_encoder, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &batch_id_per_token, - const paddle::Tensor &block_tables, const paddle::Tensor &kv_batch_ids, - const paddle::Tensor &kv_tile_ids, const paddle::Tensor &kv_num_blocks, - const paddle::Tensor &cache_batch_ids, const paddle::Tensor &cache_tile_ids, + const paddle::Tensor &block_tables, + const paddle::Tensor &kv_batch_ids, + const paddle::Tensor &kv_tile_ids, + const paddle::Tensor &kv_num_blocks, + const paddle::Tensor &cache_batch_ids, + const paddle::Tensor &cache_tile_ids, const paddle::Tensor &cache_num_blocks, const paddle::optional &cache_k_quant_scales, const paddle::optional &cache_v_quant_scales, @@ -157,150 +185,177 @@ std::vector GQARopeWriteCacheKernel( const paddle::optional &cache_k_zp, const paddle::optional &cache_v_zp, const paddle::optional &kv_signal_data, - const int kv_token_num, const int max_seq_len, + const int kv_token_num, + const int max_seq_len, const std::string &cache_quant_type, const bool rope_3d); -std::vector -PreCacheLenConcat(const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &seq_lens_this_time, - const int max_dec_len, const int block_size); +std::vector PreCacheLenConcat( + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &seq_lens_this_time, + const int max_dec_len, + const int block_size); paddle::Tensor FusedExpertMoeFunc( - const paddle::Tensor &input, const paddle::Tensor &gate_weight, - const paddle::Tensor &up_gate_proj_weight, const paddle::Tensor &down_proj_weight, + const paddle::Tensor &input, + const paddle::Tensor &gate_weight, + const paddle::Tensor &up_gate_proj_weight, + const paddle::Tensor &down_proj_weight, const paddle::optional &up_gate_proj_bias, const paddle::optional &up_gate_proj_scale, const paddle::optional &down_proj_bias, const paddle::optional &down_proj_scale, - const std::string &quant_method, const int moe_topk, - const bool norm_topk_prob, const bool group_moe); + const std::string &quant_method, + const int moe_topk, + const bool norm_topk_prob, + const bool group_moe); std::vector MacheteMMKernel( - paddle::Tensor const& A, paddle::Tensor const& B, - paddle::optional const& maybe_group_scales, - paddle::optional const& maybe_group_zeros, - paddle::optional const& maybe_channel_scales, - paddle::optional const& maybe_token_scales, - std::string const& b_type_str, - std::string const& maybe_out_type_str, - int64_t const& maybe_group_size, - std::string const& maybe_schedule); + paddle::Tensor const &A, + paddle::Tensor const &B, + paddle::optional const &maybe_group_scales, + 
paddle::optional const &maybe_group_zeros, + paddle::optional const &maybe_channel_scales, + paddle::optional const &maybe_token_scales, + std::string const &b_type_str, + std::string const &maybe_out_type_str, + int64_t const &maybe_group_size, + std::string const &maybe_schedule); std::vector MachetePrepackBKernel( - paddle::Tensor const& B, std::string const& a_type_str, std::string const& b_type_str, - std::string const& maybe_group_scales_type_str); + paddle::Tensor const &B, + std::string const &a_type_str, + std::string const &b_type_str, + std::string const &maybe_group_scales_type_str); std::vector MacheteSupportedSchedules( - std::string const& a_type_str, std::string const& b_type_str); + std::string const &a_type_str, std::string const &b_type_str); std::vector MoeExpertDispatch( - const paddle::Tensor &input, const paddle::Tensor &gating_output, + const paddle::Tensor &input, + const paddle::Tensor &gating_output, const paddle::optional &gating_correction_bias, - const paddle::optional &w4a8_in_scale, const int moe_topk, - const bool group_moe, const std::string &moe_quant_type, const bool topk_only_mode); - -std::vector -MoETopKSelectKernel(const paddle::Tensor &gating_logits, - const paddle::optional &bias, - const int moe_topk, const bool apply_norm_weight, - const bool enable_softmax_top_k_fused); - -std::vector -MoERedundantTopKSelectKernel(const paddle::Tensor &gating_logits, - const paddle::Tensor &expert_id_to_ep_rank_array, - const paddle::Tensor &expert_in_rank_num_list, - paddle::Tensor &tokens_per_expert_stats_list, - const paddle::optional &bias, - const int moe_topk, const bool apply_norm_weight, - const bool enable_softmax_top_k_fused, - const int redundant_ep_rank_num_plus_one); - -std::vector -EPMoeExpertDispatch(const paddle::Tensor &input, const paddle::Tensor &topk_ids, - const paddle::Tensor &topk_weights, - const paddle::optional &up_gate_proj_in_scale, - const std::vector &token_nums_per_expert, - const int token_nums_this_rank, - const std::string &moe_quant_type); + const paddle::optional &w4a8_in_scale, + const int moe_topk, + const bool group_moe, + const std::string &moe_quant_type, + const bool topk_only_mode); + +std::vector MoETopKSelectKernel( + const paddle::Tensor &gating_logits, + const paddle::optional &bias, + const int moe_topk, + const bool apply_norm_weight, + const bool enable_softmax_top_k_fused); + +std::vector MoERedundantTopKSelectKernel( + const paddle::Tensor &gating_logits, + const paddle::Tensor &expert_id_to_ep_rank_array, + const paddle::Tensor &expert_in_rank_num_list, + paddle::Tensor &tokens_per_expert_stats_list, + const paddle::optional &bias, + const int moe_topk, + const bool apply_norm_weight, + const bool enable_softmax_top_k_fused, + const int redundant_ep_rank_num_plus_one); + +std::vector EPMoeExpertDispatch( + const paddle::Tensor &input, + const paddle::Tensor &topk_ids, + const paddle::Tensor &topk_weights, + const paddle::optional &up_gate_proj_in_scale, + const std::vector &token_nums_per_expert, + const int token_nums_this_rank, + const std::string &moe_quant_type); std::vector EPMoeExpertDispatchFP8( - const paddle::Tensor &input, const paddle::Tensor &scale, - const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights, + const paddle::Tensor &input, + const paddle::Tensor &scale, + const paddle::Tensor &topk_ids, + const paddle::Tensor &topk_weights, const paddle::Tensor &token_nums_per_expert, const paddle::Tensor &token_nums_per_expert_padded, - const bool use_in_ep, const int 
token_nums_this_rank_padded); + const bool use_in_ep, + const int token_nums_this_rank_padded); std::vector PerTokenQuant(paddle::Tensor &input, const int block_size); std::vector PerTokenQuantPadding(paddle::Tensor &input, const int block_size); -std::vector -MaskedPerTokenQuant(paddle::Tensor &input, paddle::Tensor &recv_expert_count, - const int block_size); +std::vector MaskedPerTokenQuant( + paddle::Tensor &input, + paddle::Tensor &recv_expert_count, + const int block_size); std::vector EPMoeExpertCombine( - const paddle::Tensor &ffn_out, const paddle::Tensor &expert_scales_float, + const paddle::Tensor &ffn_out, + const paddle::Tensor &expert_scales_float, const paddle::Tensor &permute_indices_per_token, const paddle::Tensor &top_k_indices, const paddle::optional &down_proj_bias, - const bool norm_topk_prob, const float routed_scaling_factor); + const bool norm_topk_prob, + const float routed_scaling_factor); std::vector> GetExpertTokenNum(const paddle::Tensor &topk_ids, const int num_experts); paddle::Tensor MoeExpertFFNFunc( - const paddle::Tensor& permute_input, - const paddle::Tensor& tokens_expert_prefix_sum, - const paddle::Tensor& up_gate_proj_weight, - const paddle::Tensor& down_proj_weight, - const paddle::optional& up_gate_proj_bias, - const paddle::optional& up_gate_proj_scale, - const paddle::optional& down_proj_scale, - const paddle::optional& down_proj_in_scale, - const paddle::optional& expert_idx_per_token, - const std::string& quant_method, + const paddle::Tensor &permute_input, + const paddle::Tensor &tokens_expert_prefix_sum, + const paddle::Tensor &up_gate_proj_weight, + const paddle::Tensor &down_proj_weight, + const paddle::optional &up_gate_proj_bias, + const paddle::optional &up_gate_proj_scale, + const paddle::optional &down_proj_scale, + const paddle::optional &down_proj_in_scale, + const paddle::optional &expert_idx_per_token, + const std::string &quant_method, const bool used_in_ep_low_latency, const int estimate_total_token_nums, const int hadamard_block_size, - const std::string& activation); + const std::string &activation); paddle::Tensor MoeExpertFFNWint2Func( - const paddle::Tensor& permute_input, - const paddle::Tensor& tokens_expert_prefix_sum, - const paddle::Tensor& up_gate_proj_weight, - const paddle::Tensor& down_proj_weight, - const paddle::optional& up_gate_proj_bias, - const paddle::optional& up_gate_proj_scale, - const paddle::optional& down_proj_scale, - const paddle::optional& up_gate_proj_local_scale, - const paddle::optional& up_gate_proj_code_scale, - const paddle::optional& up_gate_proj_code_zp, - const paddle::optional& down_proj_local_scale, - const paddle::optional& down_proj_code_scale, - const paddle::optional& down_proj_code_zp, + const paddle::Tensor &permute_input, + const paddle::Tensor &tokens_expert_prefix_sum, + const paddle::Tensor &up_gate_proj_weight, + const paddle::Tensor &down_proj_weight, + const paddle::optional &up_gate_proj_bias, + const paddle::optional &up_gate_proj_scale, + const paddle::optional &down_proj_scale, + const paddle::optional &up_gate_proj_local_scale, + const paddle::optional &up_gate_proj_code_scale, + const paddle::optional &up_gate_proj_code_zp, + const paddle::optional &down_proj_local_scale, + const paddle::optional &down_proj_code_scale, + const paddle::optional &down_proj_code_zp, const bool used_in_ep_low_latency); paddle::Tensor MoeExpertReduceFunc( - const paddle::Tensor &ffn_out, const paddle::Tensor &top_k_weight, + const paddle::Tensor &ffn_out, + const paddle::Tensor &top_k_weight, 
const paddle::Tensor &permute_indices_per_token, const paddle::Tensor &top_k_indices, const paddle::optional &down_proj_bias, - const bool norm_topk_prob, const float routed_scaling_factor); + const bool norm_topk_prob, + const float routed_scaling_factor); void InitKVSignalPerQuery(const paddle::Tensor &seq_lens_encoder_tensor, const paddle::Tensor &seq_lens_this_time_tensor, const paddle::Tensor &seq_lens_decoder_tensor, - const int rank, const int num_layers); + const int rank, + const int num_layers); -void GetOutputKVSignal(const paddle::Tensor &x, int64_t rank_id, +void GetOutputKVSignal(const paddle::Tensor &x, + int64_t rank_id, bool wait_flag); paddle::Tensor DequantInt8Func(const paddle::Tensor &input, const paddle::Tensor &out_scale, std::string dtype); -paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, const int device_id, +paddle::Tensor OpenShmAndGetMetaSignalFunc(const int rank, + const int device_id, const bool keep_pd_step_flag); paddle::Tensor InitSignalLayerwiseFunc(const paddle::Tensor &kv_signal_metadata, @@ -310,18 +365,18 @@ void GetBlockShapeAndSplitKVBlock( const paddle::Tensor &seq_lens_encoder, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_this_time, - paddle::Tensor &decoder_batch_ids, // Inplace - paddle::Tensor &decoder_tile_ids_per_batch, // Inplace - paddle::Tensor &decoder_num_blocks_cpu, // Inplace, Pinned Memory - paddle::Tensor &decoder_num_blocks_device, // Inplace - paddle::Tensor &decoder_chunk_size_device, // Inplace - paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory - paddle::Tensor &encoder_batch_ids, // Inplace - paddle::Tensor &encoder_tile_ids_per_batch, // Inplace - paddle::Tensor &encoder_num_blocks_x_cpu, // Inplace, Pinned Memory - paddle::Tensor &kv_batch_ids, // Inplace - paddle::Tensor &kv_tile_ids_per_batch, // Inplace - paddle::Tensor &kv_num_blocks_x_cpu, // Inplace, Pinned Memory + paddle::Tensor &decoder_batch_ids, // Inplace + paddle::Tensor &decoder_tile_ids_per_batch, // Inplace + paddle::Tensor &decoder_num_blocks_cpu, // Inplace, Pinned Memory + paddle::Tensor &decoder_num_blocks_device, // Inplace + paddle::Tensor &decoder_chunk_size_device, // Inplace + paddle::Tensor &max_len_tensor_cpu, // Inplace, Pinned Memory + paddle::Tensor &encoder_batch_ids, // Inplace + paddle::Tensor &encoder_tile_ids_per_batch, // Inplace + paddle::Tensor &encoder_num_blocks_x_cpu, // Inplace, Pinned Memory + paddle::Tensor &kv_batch_ids, // Inplace + paddle::Tensor &kv_tile_ids_per_batch, // Inplace + paddle::Tensor &kv_num_blocks_x_cpu, // Inplace, Pinned Memory const int encoder_block_shape_q, const int decoder_block_shape_q, const int group_size, @@ -342,8 +397,8 @@ void SetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all, const paddle::Tensor &stop_flags); paddle::Tensor RebuildPaddingFunc( - const paddle::Tensor &tmp_out, // [token_num, dim_embed] - const paddle::Tensor &cum_offsets, // [bsz, 1] + const paddle::Tensor &tmp_out, // [token_num, dim_embed] + const paddle::Tensor &cum_offsets, // [bsz, 1] const paddle::Tensor &seq_len_this_time, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &seq_lens_encoder, @@ -363,9 +418,8 @@ void GetStopFlagsMulti(const paddle::Tensor &topk_ids, const paddle::Tensor &stop_seqs_len, const bool beam_search); - void UpdateInputes(const paddle::Tensor &stop_flags, - const paddle::Tensor ¬_need_stop, // only on cpu + const paddle::Tensor ¬_need_stop, // only on cpu const paddle::Tensor &seq_lens_this_time, const paddle::Tensor &seq_lens_encoder, 
const paddle::Tensor &seq_lens_decoder, @@ -375,42 +429,46 @@ void UpdateInputes(const paddle::Tensor &stop_flags, const paddle::Tensor &is_block_step); void UpdateInputesV1(const paddle::Tensor &stop_flags, - const paddle::Tensor ¬_need_stop, // only on cpu - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &step_seq_lens_decoder, - const paddle::Tensor &prompt_lens, - const paddle::Tensor &topk_ids, - const paddle::Tensor &input_ids, - const paddle::Tensor &block_tables, - const paddle::Tensor &stop_nums, - const paddle::Tensor &next_tokens, - const paddle::Tensor &is_block_step, - const int block_size); + const paddle::Tensor ¬_need_stop, // only on cpu + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &step_seq_lens_decoder, + const paddle::Tensor &prompt_lens, + const paddle::Tensor &topk_ids, + const paddle::Tensor &input_ids, + const paddle::Tensor &block_tables, + const paddle::Tensor &stop_nums, + const paddle::Tensor &next_tokens, + const paddle::Tensor &is_block_step, + const int block_size, + const bool is_pooling_task); + +void RecoverDecodeTask( + const paddle::Tensor &stop_flags, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &step_seq_lens_decoder, + const paddle::Tensor &block_tables, + const paddle::Tensor &is_block_step, + const paddle::optional &draft_tokens, + const paddle::optional &step_draft_tokens, + const paddle::optional &step_seq_lens_this_time, + const int block_size, + const int max_draft_tokens); -void RecoverDecodeTask(const paddle::Tensor &stop_flags, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &step_seq_lens_decoder, - const paddle::Tensor &block_tables, - const paddle::Tensor &is_block_step, - const paddle::optional &draft_tokens, - const paddle::optional &step_draft_tokens, - const paddle::optional &step_seq_lens_this_time, - const int block_size, - const int max_draft_tokens); - -paddle::Tensor -GroupSwigluWithMasked(const paddle::Tensor &fc1_out_tensor, - const paddle::Tensor &token_nums_per_expert); +paddle::Tensor GroupSwigluWithMasked( + const paddle::Tensor &fc1_out_tensor, + const paddle::Tensor &token_nums_per_expert); std::vector ExtractTextTokenOutput( - const paddle::Tensor &max_seq_len, const paddle::Tensor &max_seq_len_index, + const paddle::Tensor &max_seq_len, + const paddle::Tensor &max_seq_len_index, const paddle::Tensor &mm_token_num_len, const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &cu_seqlens_q, const paddle::Tensor &hidden_states); + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &hidden_states); std::vector MoEDeepGEMMPermute(const paddle::Tensor &x, const paddle::Tensor &topk_idx, @@ -419,97 +477,99 @@ std::vector MoEDeepGEMMPermute(const paddle::Tensor &x, std::vector MoEDeepGEMMDePermute( const paddle::Tensor - &ffn_out, // [num_experts, max_tokens_per_expert, hidden] - const paddle::Tensor &permute_indices_per_token, // [token_num, topk}] - const paddle::Tensor &topk_idx, const paddle::Tensor &topk_weights); + &ffn_out, // [num_experts, max_tokens_per_expert, hidden] + const paddle::Tensor &permute_indices_per_token, // [token_num, topk}] + const paddle::Tensor &topk_idx, + const 
paddle::Tensor &topk_weights); void TextImageIndexOut(const paddle::Tensor &token_type_ids, - paddle::Tensor &text_input, - paddle::Tensor &image_input); + paddle::Tensor &text_input, + paddle::Tensor &image_input); -void TextImageGatherScatter(paddle::Tensor &input, paddle::Tensor &text_input, +void TextImageGatherScatter(paddle::Tensor &input, + paddle::Tensor &text_input, paddle::Tensor &image_input, paddle::Tensor &token_type_ids, paddle::Tensor &text_index, - paddle::Tensor &image_index, const bool is_scatter); + paddle::Tensor &image_index, + const bool is_scatter); paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids, int64_t num_experts); void GetPositionIdsAndMaskEncoderBatch( - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& position_ids, - const paddle::Tensor& mask_encoder_batch); + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &position_ids, + const paddle::Tensor &mask_encoder_batch); std::vector DecodeMLAWriteCacheKernel( - const paddle::Tensor& kv_nope, - const paddle::Tensor& kv_pe, - const paddle::Tensor& kv_cache, - const paddle::Tensor& seq_lens, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const std::string& cache_quant_type_str, + const paddle::Tensor &kv_nope, + const paddle::Tensor &kv_pe, + const paddle::Tensor &kv_cache, + const paddle::Tensor &seq_lens, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &batch_id_per_token, + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &block_tables, + const std::string &cache_quant_type_str, const int max_seq_len, const bool speculate_decoder); - std::vector PrefillMLAWriteCacheKernel( - const paddle::Tensor& kv_nope, - const paddle::Tensor& kv_pe, - const paddle::Tensor& kv_cache, - const paddle::Tensor& seq_lens, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& cu_seqlens_q, - const paddle::Tensor& block_tables, - const std::string& cache_quant_type_str, +std::vector PrefillMLAWriteCacheKernel( + const paddle::Tensor &kv_nope, + const paddle::Tensor &kv_pe, + const paddle::Tensor &kv_cache, + const paddle::Tensor &seq_lens, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &batch_id_per_token, + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &block_tables, + const std::string &cache_quant_type_str, const int max_seq_len); - void FusedRotaryPositionEncoding( - paddle::Tensor& query, // [num_tokens, num_heads, head_size] or + paddle::Tensor &query, // [num_tokens, num_heads, head_size] or // [num_tokens, num_heads * head_size] - paddle::Tensor& key, + paddle::Tensor &key, // [num_tokens, num_kv_heads, head_size] or [num_tokens, num_kv_heads * // head_size] - const paddle::Tensor& position_ids, // [num_tokens] - const paddle::Tensor& cos_sin_cache, // [max_position, rot_dim] + const paddle::Tensor &position_ids, // [num_tokens] + const paddle::Tensor &cos_sin_cache, // [max_position, rot_dim] int head_size, bool is_neox); std::vector MultiHeadLatentAttention( - const paddle::Tensor& query, - const paddle::Tensor& key_cache, - const paddle::Tensor& value_cache, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& 
cu_seqlens_q, - const paddle::Tensor& batch_id_per_token, - const paddle::Tensor& block_tables, - const paddle::Tensor& kv_batch_ids, - const paddle::Tensor& kv_tile_ids_per_batch, - const paddle::Tensor& kv_num_blocks, - const paddle::Tensor& decoder_batch_ids, - const paddle::Tensor& decoder_tile_ids_per_batch, - const paddle::Tensor& decoder_num_blocks_device, - const paddle::Tensor& decoder_chunk_size_device, - const paddle::Tensor& max_dec_len_this_time, - const paddle::Tensor& max_len_kv, - const paddle::optional& attn_mask, - const paddle::optional& query_bias, - const paddle::optional& query_out_scales, - const paddle::optional& cache_k_quant_scales, - const paddle::optional& cache_v_quant_scales, - const paddle::optional& cache_k_dequant_scales, - const paddle::optional& cache_v_dequant_scales, - const paddle::optional& cache_k_zp, - const paddle::optional& cache_v_zp, - const paddle::optional& out_linear_shifts, - const paddle::optional& out_linear_smooths, - const std::string& compute_dtype, - const std::string& cache_quant_type_str, + const paddle::Tensor &query, + const paddle::Tensor &key_cache, + const paddle::Tensor &value_cache, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &cu_seqlens_q, + const paddle::Tensor &batch_id_per_token, + const paddle::Tensor &block_tables, + const paddle::Tensor &kv_batch_ids, + const paddle::Tensor &kv_tile_ids_per_batch, + const paddle::Tensor &kv_num_blocks, + const paddle::Tensor &decoder_batch_ids, + const paddle::Tensor &decoder_tile_ids_per_batch, + const paddle::Tensor &decoder_num_blocks_device, + const paddle::Tensor &decoder_chunk_size_device, + const paddle::Tensor &max_dec_len_this_time, + const paddle::Tensor &max_len_kv, + const paddle::optional &attn_mask, + const paddle::optional &query_bias, + const paddle::optional &query_out_scales, + const paddle::optional &cache_k_quant_scales, + const paddle::optional &cache_v_quant_scales, + const paddle::optional &cache_k_dequant_scales, + const paddle::optional &cache_v_dequant_scales, + const paddle::optional &cache_k_zp, + const paddle::optional &cache_v_zp, + const paddle::optional &out_linear_shifts, + const paddle::optional &out_linear_smooths, + const std::string &compute_dtype, + const std::string &cache_quant_type_str, const int nope_size, const int max_input_length, const float softmax_scale, @@ -520,29 +580,30 @@ std::vector MultiHeadLatentAttention( const bool causal, const bool speculate_decoder); - -std::vector tritonmoe_preprocess_kernel(const paddle::Tensor& topk_ids, int64_t num_experts, int64_t GEMM_BLOCK_SIZE_M); - +std::vector tritonmoe_preprocess_kernel( + const paddle::Tensor &topk_ids, + int64_t num_experts, + int64_t GEMM_BLOCK_SIZE_M); std::vector MoeWna16MarlinGemmApi( - const paddle::Tensor& a, - const paddle::optional& c_or_none, - const paddle::Tensor& b_q_weight, - const paddle::Tensor& b_scales, - const paddle::optional& global_scale_or_none, - const paddle::optional& b_zeros_or_none, - const paddle::optional& g_idx_or_none, - const paddle::optional& perm_or_none, - const paddle::Tensor& workspace, - const paddle::Tensor& sorted_token_ids, - const paddle::Tensor& expert_ids, - const paddle::Tensor& num_tokens_post_padded, - const paddle::Tensor& topk_weights, + const paddle::Tensor &a, + const paddle::optional &c_or_none, + const paddle::Tensor &b_q_weight, + const paddle::Tensor &b_scales, + const paddle::optional &global_scale_or_none, + const paddle::optional &b_zeros_or_none, + const 
paddle::optional &g_idx_or_none, + const paddle::optional &perm_or_none, + const paddle::Tensor &workspace, + const paddle::Tensor &sorted_token_ids, + const paddle::Tensor &expert_ids, + const paddle::Tensor &num_tokens_post_padded, + const paddle::Tensor &topk_weights, int64_t moe_block_size, int64_t top_k, bool mul_topk_weights, bool is_ep, - const std::string& b_q_type_str, + const std::string &b_q_type_str, int64_t size_m, int64_t size_n, int64_t size_k, @@ -550,84 +611,93 @@ std::vector MoeWna16MarlinGemmApi( bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float); -void CutlassScaledMm(paddle::Tensor &c, paddle::Tensor const &a, - paddle::Tensor const &b, paddle::Tensor const &a_scales, +void CutlassScaledMm(paddle::Tensor &c, + paddle::Tensor const &a, + paddle::Tensor const &b, + paddle::Tensor const &a_scales, paddle::Tensor const &b_scales, paddle::optional const &bias); -void CutlassScaledMmAzp(paddle::Tensor& c, paddle::Tensor const& a, - paddle::Tensor const& b, - paddle::Tensor const& a_scales, - paddle::Tensor const& b_scales, - paddle::Tensor const& azp_adj, - paddle::optional const& azp, - paddle::optional const& bias); - -void StaticScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input, +void CutlassScaledMmAzp(paddle::Tensor &c, + paddle::Tensor const &a, + paddle::Tensor const &b, + paddle::Tensor const &a_scales, + paddle::Tensor const &b_scales, + paddle::Tensor const &azp_adj, + paddle::optional const &azp, + paddle::optional const &bias); + +void StaticScaledFp8Quant(paddle::Tensor &out, + paddle::Tensor const &input, paddle::Tensor const &scale); -void DynamicScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input, +void DynamicScaledFp8Quant(paddle::Tensor &out, + paddle::Tensor const &input, paddle::Tensor &scale); void DynamicPerTokenScaledFp8Quant(paddle::Tensor &out, paddle::Tensor const &input, - paddle::Tensor &scales, float scale_ub); + paddle::Tensor &scales, + float scale_ub); -std::vector NoauxTc( - paddle::Tensor& scores, - paddle::Tensor& scores_with_bias, - int n_group, - int topk_group, - int topk, - bool renormalize, - float routed_scaling_factor); +std::vector NoauxTc(paddle::Tensor &scores, + paddle::Tensor &scores_with_bias, + int n_group, + int topk_group, + int topk, + bool renormalize, + float routed_scaling_factor); #ifdef ENABLE_FP8 paddle::Tensor cutlass_fp8_fp8_half_gemm_func( - const paddle::Tensor& x, - const paddle::Tensor& y, - const paddle::optional& bias, + const paddle::Tensor &x, + const paddle::Tensor &y, + const paddle::optional &bias, bool trans_x, bool trans_y, float scale, // only support per-tensor quantization std::string output_dtype, std::string activation_type); -paddle::Tensor MoeFusedHadamardQuantFp8Func( - const paddle::Tensor &input, - const paddle::Tensor &scale, - const paddle::Tensor &topk_ids, - const int top_k, - const int intermediate_size, - const bool tiled); - -paddle::Tensor FusedHadamardQuantFp8Func( - const paddle::Tensor &input, - const float scale); +paddle::Tensor MoeFusedHadamardQuantFp8Func(const paddle::Tensor &input, + const paddle::Tensor &scale, + const paddle::Tensor &topk_ids, + const int top_k, + const int intermediate_size, + const bool tiled); + +paddle::Tensor FusedHadamardQuantFp8Func(const paddle::Tensor &input, + const float scale); #endif -int64_t init_custom_all_reduce(const std::vector& fake_ipc_ptrs, - paddle::Tensor& rank_data, int64_t rank, bool full_nvlink); +int64_t init_custom_all_reduce(const std::vector &fake_ipc_ptrs, + paddle::Tensor &rank_data, + 
int64_t rank, + bool full_nvlink); -void all_reduce(paddle::Tensor& inp, paddle::Tensor& out, int64_t _fa, - int64_t reg_buffer, int64_t reg_buffer_sz_bytes); +void all_reduce(paddle::Tensor &inp, + paddle::Tensor &out, + int64_t _fa, + int64_t reg_buffer, + int64_t reg_buffer_sz_bytes); void dispose(int64_t _fa); int64_t meta_size(); -void register_buffer(int64_t _fa, const std::vector& fake_ipc_ptrs); +void register_buffer(int64_t _fa, const std::vector &fake_ipc_ptrs); -std::tuple, std::vector> get_graph_buffer_ipc_meta(int64_t _fa); +std::tuple, std::vector> +get_graph_buffer_ipc_meta(int64_t _fa); void register_graph_buffers(int64_t _fa, - const std::vector>& handles, - const std::vector>& offsets); + const std::vector> &handles, + const std::vector> &offsets); std::tuple allocate_shared_buffer_and_handle( int64_t size); -int64_t open_mem_handle(paddle::Tensor& mem_handle); +int64_t open_mem_handle(paddle::Tensor &mem_handle); void free_shared_buffer(int64_t buffer); @@ -635,39 +705,38 @@ void clear_ipc_handles(int64_t _fa); // speculative decoding Kernel std::vector SpeculateGetPaddingOffset( - const paddle::Tensor& input_ids, - const paddle::Tensor& draft_tokens, - const paddle::Tensor& cum_offsets, - const paddle::Tensor& token_num, - const paddle::Tensor& seq_len, - const paddle::Tensor& seq_lens_encoder); + const paddle::Tensor &input_ids, + const paddle::Tensor &draft_tokens, + const paddle::Tensor &cum_offsets, + const paddle::Tensor &token_num, + const paddle::Tensor &seq_len, + const paddle::Tensor &seq_lens_encoder); std::vector SpeculateGetSeqLensOutput( - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder); + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder); std::vector SpeculateGetOutputPaddingOffset( - const paddle::Tensor& output_cum_offsets_tmp, - const paddle::Tensor& out_token_num, - const paddle::Tensor& seq_lens_output, + const paddle::Tensor &output_cum_offsets_tmp, + const paddle::Tensor &out_token_num, + const paddle::Tensor &seq_lens_output, const int max_seq_len); - void SpecTokenPenaltyMultiScores(const paddle::Tensor &pre_ids, - const paddle::Tensor &logits, - const paddle::Tensor &penalty_scores, - const paddle::Tensor &frequency_scores, - const paddle::Tensor &presence_scores, - const paddle::Tensor &temperatures, - const paddle::Tensor &bad_tokens, - const paddle::Tensor &cur_len, - const paddle::Tensor &min_len, - const paddle::Tensor &eos_token_id, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &output_padding_offset, - const paddle::Tensor &output_cum_offsets, - const int max_seq_len); + const paddle::Tensor &logits, + const paddle::Tensor &penalty_scores, + const paddle::Tensor &frequency_scores, + const paddle::Tensor &presence_scores, + const paddle::Tensor &temperatures, + const paddle::Tensor &bad_tokens, + const paddle::Tensor &cur_len, + const paddle::Tensor &min_len, + const paddle::Tensor &eos_token_id, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &output_padding_offset, + const paddle::Tensor &output_cum_offsets, + const int max_seq_len); void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens, const paddle::Tensor &accept_num, @@ -679,32 +748,40 @@ void SpecGetStopFlagsMultiSeqs(const paddle::Tensor &accept_tokens, const paddle::Tensor &stop_seqs_len, const paddle::Tensor &end_ids); - -void SpeculateVerify( - const paddle::Tensor 
&accept_tokens, const paddle::Tensor &accept_num, - const paddle::Tensor &step_idx, const paddle::Tensor &stop_flags, - const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &draft_tokens, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &verify_tokens, const paddle::Tensor &verify_scores, - const paddle::Tensor &max_dec_len, const paddle::Tensor &end_tokens, - const paddle::Tensor &is_block_step, - const paddle::Tensor &output_cum_offsets, - const paddle::Tensor &actual_candidate_len, - const paddle::Tensor &actual_draft_token_nums, const paddle::Tensor &topp, - int max_seq_len, int verify_window, bool enable_topp, bool benchmark_mode, bool accept_all_drafts); +void SpeculateVerify(const paddle::Tensor &accept_tokens, + const paddle::Tensor &accept_num, + const paddle::Tensor &step_idx, + const paddle::Tensor &stop_flags, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &draft_tokens, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &verify_tokens, + const paddle::Tensor &verify_scores, + const paddle::Tensor &max_dec_len, + const paddle::Tensor &end_tokens, + const paddle::Tensor &is_block_step, + const paddle::Tensor &output_cum_offsets, + const paddle::Tensor &actual_candidate_len, + const paddle::Tensor &actual_draft_token_nums, + const paddle::Tensor &topp, + int max_seq_len, + int verify_window, + bool enable_topp, + bool benchmark_mode, + bool accept_all_drafts); void SpeculateUpdate(const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor ¬_need_stop, - const paddle::Tensor &draft_tokens, - const paddle::Tensor &actual_draft_token_nums, - const paddle::Tensor &accept_tokens, - const paddle::Tensor &accept_num, - const paddle::Tensor &stop_flags, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &is_block_step, - const paddle::Tensor &stop_nums); + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor ¬_need_stop, + const paddle::Tensor &draft_tokens, + const paddle::Tensor &actual_draft_token_nums, + const paddle::Tensor &accept_tokens, + const paddle::Tensor &accept_num, + const paddle::Tensor &stop_flags, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &is_block_step, + const paddle::Tensor &stop_nums); void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all, const paddle::Tensor &accept_tokens, @@ -715,18 +792,17 @@ void SpeculateSetValueByFlagsAndIdx(const paddle::Tensor &pre_ids_all, const paddle::Tensor &seq_lens_decoder, const paddle::Tensor &step_idx); -void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor& accept_tokens, - const paddle::Tensor& accept_num, - const paddle::Tensor& not_need_stop, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& prompt_lens, +void SpeculateSaveWithOutputMsgStatic(const paddle::Tensor &accept_tokens, + const paddle::Tensor &accept_num, + const paddle::Tensor ¬_need_stop, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &prompt_lens, int64_t rank_id, bool save_each_rank, bool skip_prefill); - -void SpeculateClearAcceptNums(const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder); +void SpeculateClearAcceptNums(const paddle::Tensor &accept_num, + const paddle::Tensor &seq_lens_decoder); void SpeculateScheduleCache(const paddle::Tensor &draft_tokens, const paddle::Tensor &block_tables, @@ -747,100 +823,94 @@ void SpeculateScheduleCache(const 
paddle::Tensor &draft_tokens, const int max_draft_tokens); void NgramMatch(const paddle::Tensor &input_ids, - const paddle::Tensor &input_ids_len, - const paddle::Tensor &pre_ids, - const paddle::Tensor &step_idx, - const paddle::Tensor &draft_token_num, - const paddle::Tensor &draft_tokens, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &max_dec_len, - const int max_ngram_size, - const int max_draft_tokens); - + const paddle::Tensor &input_ids_len, + const paddle::Tensor &pre_ids, + const paddle::Tensor &step_idx, + const paddle::Tensor &draft_token_num, + const paddle::Tensor &draft_tokens, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &max_dec_len, + const int max_ngram_size, + const int max_draft_tokens); void HybridMtpNgram(const paddle::Tensor &input_ids, - const paddle::Tensor &input_ids_len, - const paddle::Tensor &pre_ids, - const paddle::Tensor &step_idx, - const paddle::Tensor &draft_token_num, - const paddle::Tensor &draft_tokens, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &max_dec_len, - const int max_ngram_size, - const int min_ngram_size, - const int max_draft_tokens); - + const paddle::Tensor &input_ids_len, + const paddle::Tensor &pre_ids, + const paddle::Tensor &step_idx, + const paddle::Tensor &draft_token_num, + const paddle::Tensor &draft_tokens, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &max_dec_len, + const int max_ngram_size, + const int min_ngram_size, + const int max_draft_tokens); // MTP -void DraftModelPostprocess(const paddle::Tensor& base_model_draft_tokens, - const paddle::Tensor& base_model_seq_lens_this_time, - const paddle::Tensor& base_model_seq_lens_encoder, - const paddle::Tensor& base_model_stop_flags); - - -void DraftModelPreprocess(const paddle::Tensor& draft_tokens, - const paddle::Tensor& input_ids, - const paddle::Tensor& stop_flags, - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& step_idx, - const paddle::Tensor& not_need_stop, - const paddle::Tensor& is_block_step, - const paddle::Tensor& batch_drop, - const paddle::Tensor& pre_ids, - const paddle::Tensor& accept_tokens, - const paddle::Tensor& accept_num, - const paddle::Tensor& base_model_seq_lens_this_time, - const paddle::Tensor& base_model_seq_lens_encoder, - const paddle::Tensor& base_model_seq_lens_decoder, - const paddle::Tensor& base_model_step_idx, - const paddle::Tensor& base_model_stop_flags, - const paddle::Tensor& base_model_is_block_step, - const paddle::Tensor& base_model_draft_tokens, +void DraftModelPostprocess(const paddle::Tensor &base_model_draft_tokens, + const paddle::Tensor &base_model_seq_lens_this_time, + const paddle::Tensor &base_model_seq_lens_encoder, + const paddle::Tensor &base_model_stop_flags); + +void DraftModelPreprocess(const paddle::Tensor &draft_tokens, + const paddle::Tensor &input_ids, + const paddle::Tensor &stop_flags, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &step_idx, + const paddle::Tensor ¬_need_stop, + const paddle::Tensor &is_block_step, + const paddle::Tensor &batch_drop, + const paddle::Tensor 
&pre_ids, + const paddle::Tensor &accept_tokens, + const paddle::Tensor &accept_num, + const paddle::Tensor &base_model_seq_lens_this_time, + const paddle::Tensor &base_model_seq_lens_encoder, + const paddle::Tensor &base_model_seq_lens_decoder, + const paddle::Tensor &base_model_step_idx, + const paddle::Tensor &base_model_stop_flags, + const paddle::Tensor &base_model_is_block_step, + const paddle::Tensor &base_model_draft_tokens, const int max_draft_token, const bool truncate_first_token, const bool splitwise_prefill, const bool kvcache_scheduler_v1); - -void DraftModelUpdate(const paddle::Tensor& inter_next_tokens, - const paddle::Tensor& draft_tokens, - const paddle::Tensor& pre_ids, - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& step_idx, - const paddle::Tensor& output_cum_offsets, - const paddle::Tensor& stop_flags, - const paddle::Tensor& not_need_stop, - const paddle::Tensor& max_dec_len, - const paddle::Tensor& end_ids, - const paddle::Tensor& base_model_draft_tokens, +void DraftModelUpdate(const paddle::Tensor &inter_next_tokens, + const paddle::Tensor &draft_tokens, + const paddle::Tensor &pre_ids, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &step_idx, + const paddle::Tensor &output_cum_offsets, + const paddle::Tensor &stop_flags, + const paddle::Tensor ¬_need_stop, + const paddle::Tensor &max_dec_len, + const paddle::Tensor &end_ids, + const paddle::Tensor &base_model_draft_tokens, const int max_seq_len, const int substep); - - std::vector EagleGetHiddenStates( - const paddle::Tensor& input, - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& seq_lens_encoder, - const paddle::Tensor& seq_lens_decoder, - const paddle::Tensor& stop_flags, - const paddle::Tensor& accept_nums, - const paddle::Tensor& base_model_seq_lens_this_time, - const paddle::Tensor& base_model_seq_lens_encoder, - const int actual_draft_token_num); + const paddle::Tensor &input, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &stop_flags, + const paddle::Tensor &accept_nums, + const paddle::Tensor &base_model_seq_lens_this_time, + const paddle::Tensor &base_model_seq_lens_encoder, + const int actual_draft_token_num); std::vector EagleGetSelfHiddenStates( - const paddle::Tensor& input, - const paddle::Tensor& last_seq_lens_this_time, - const paddle::Tensor& seq_lens_this_time, - const paddle::Tensor& step_idx); + const paddle::Tensor &input, + const paddle::Tensor &last_seq_lens_this_time, + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &step_idx); void MTPStepPaddle( const paddle::Tensor &base_model_stop_flags, @@ -885,38 +955,38 @@ void SpeculateStepPaddle( const int encoder_decoder_block_num, const int max_draft_tokens); -void MergePrefillDecodeOutput( - const paddle::Tensor &encoder_res, - const paddle::Tensor &decoder_res, - const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &cu_seq_q, - const int head_num, - const int head_dim, - const int max_token); +void MergePrefillDecodeOutput(const paddle::Tensor &encoder_res, + const paddle::Tensor &decoder_res, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor 
&seq_lens_this_time, + const paddle::Tensor &cu_seq_q, + const int head_num, + const int head_dim, + const int max_token); -std::vector TopPSamplingReject(const paddle::Tensor &probs, - const paddle::Tensor &top_p, - const paddle::optional &top_k, - int64_t seed); +std::vector TopPSamplingReject( + const paddle::Tensor &probs, + const paddle::Tensor &top_p, + const paddle::optional &top_k, + int64_t seed); std::vector TopKRenorm(const paddle::Tensor &probs, const paddle::Tensor &top_k); std::vector MinPSamplingFromProbs(const paddle::Tensor &probs, - const paddle::Tensor &min_p); + const paddle::Tensor &min_p); -void SaveOutMmsgStatic(const paddle::Tensor& x, - const paddle::Tensor& not_need_stop, +void SaveOutMmsgStatic(const paddle::Tensor &x, + const paddle::Tensor ¬_need_stop, int64_t rank_id, bool save_each_rank); void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens, - const paddle::Tensor &max_think_lens, - const paddle::Tensor &step_idx, - const paddle::Tensor &limit_think_status, - const int64_t think_end_id); + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id); void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, const paddle::Tensor &max_think_lens, @@ -926,21 +996,21 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, const int64_t line_break_id); void SpeculateLimitThinkingContentLengthV1( - const paddle::Tensor& next_tokens, - const paddle::Tensor& max_think_lens, - const paddle::Tensor& step_idx, - const paddle::Tensor& limit_think_status, - const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder, + const paddle::Tensor &next_tokens, + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const paddle::Tensor &accept_num, + const paddle::Tensor &seq_lens_decoder, const int64_t think_end_id); void SpeculateLimitThinkingContentLengthV2( - const paddle::Tensor& next_tokens, - const paddle::Tensor& max_think_lens, - const paddle::Tensor& step_idx, - const paddle::Tensor& limit_think_status, - const paddle::Tensor& accept_num, - const paddle::Tensor& seq_lens_decoder, + const paddle::Tensor &next_tokens, + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const paddle::Tensor &accept_num, + const paddle::Tensor &seq_lens_decoder, const int64_t think_end_id, const int64_t line_break_id); @@ -971,19 +1041,25 @@ void SpeculateGetTargetLogits(const paddle::Tensor &target_logits, const paddle::Tensor &accept_num); PYBIND11_MODULE(fastdeploy_ops, m) { - - m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), - py::arg("num_experts"), "get expert token num"); + m.def("get_expert_token_num", + &GetExpertTokenNum, + py::arg("topk_ids"), + py::arg("num_experts"), + "get expert token num"); /** * moe/fused_moe/moe_redundant_topk_select.cu * moe_redundant_topk_select */ - m.def("moe_redundant_topk_select", &MoERedundantTopKSelectKernel, - py::arg("gating_logits"), py::arg("expert_id_to_ep_rank_array"), + m.def("moe_redundant_topk_select", + &MoERedundantTopKSelectKernel, + py::arg("gating_logits"), + py::arg("expert_id_to_ep_rank_array"), py::arg("expert_in_rank_num_list"), - py::arg("tokens_per_expert_stats_list"), py::arg("bias"), - py::arg("moe_topk"), py::arg("apply_norm_weight"), + py::arg("tokens_per_expert_stats_list"), + py::arg("bias"), + py::arg("moe_topk"), + 
py::arg("apply_norm_weight"), py::arg("enable_softmax_top_k_fused"), py::arg("redundant_ep_rank_num_plus_one"), "moe export RedundantTopKSelect function"); @@ -992,49 +1068,62 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * open_shm_and_get_meta_signal.cc * InitKVSignalPerQuery */ - m.def("init_kv_signal_per_query", &InitKVSignalPerQuery, + m.def("init_kv_signal_per_query", + &InitKVSignalPerQuery, py::arg("seq_lens_encoder_tensor"), py::arg("seq_lens_this_time_tensor"), - py::arg("seq_lens_decoder_tensor"), py::arg("rank"), - py::arg("num_layers"), "init_kv_signal_per_query function"); + py::arg("seq_lens_decoder_tensor"), + py::arg("rank"), + py::arg("num_layers"), + "init_kv_signal_per_query function"); /** * GetOutputKVSignal */ - m.def("get_output_kv_signal", &GetOutputKVSignal, py::arg("x"), - py::arg("rank_id"), py::arg("wait_flag"), + m.def("get_output_kv_signal", + &GetOutputKVSignal, + py::arg("x"), + py::arg("rank_id"), + py::arg("wait_flag"), "get_output_kv_signal function"); m.def("moe_deepgemm_permute", &MoEDeepGEMMPermute, "MoEDeepGEMMPermute"); - m.def("moe_deepgemm_depermute", &MoEDeepGEMMDePermute, - "MoEDeepGEMMDePermute"); + m.def( + "moe_deepgemm_depermute", &MoEDeepGEMMDePermute, "MoEDeepGEMMDePermute"); /** * alloc_cache_pinned.cc * cuda_host_alloc * cuda_host_free */ - m.def("cuda_host_alloc", &cuda_host_alloc, "Allocate pinned memory", - py::arg("size"), py::arg("flags") = cudaHostAllocDefault); - m.def("cuda_host_free", &cuda_host_free, "Free pinned memory", - py::arg("ptr")); + m.def("cuda_host_alloc", + &cuda_host_alloc, + "Allocate pinned memory", + py::arg("size"), + py::arg("flags") = cudaHostAllocDefault); + m.def( + "cuda_host_free", &cuda_host_free, "Free pinned memory", py::arg("ptr")); py::register_exception(m, "CudaError"); /** * append_attention.cu * append_attention */ m.def("append_attention", &AppendAttention, "append attention function"); - m.def("append_attention_with_output", &AppendAttentionWithOutput, "append attention with output function"); + m.def("append_attention_with_output", + &AppendAttentionWithOutput, + "append attention with output function"); /** * gqa_rope_write_cache.cu * gqa_rope_write_cache */ - m.def("gqa_rope_write_cache", &GQARopeWriteCacheKernel, + m.def("gqa_rope_write_cache", + &GQARopeWriteCacheKernel, "gqa rope write cache function"); /** * pre_cache_len_concat.cu * pre_cache_len_concat */ - m.def("pre_cache_len_concat", &PreCacheLenConcat, + m.def("pre_cache_len_concat", + &PreCacheLenConcat, "pre_cache len concat function"); /** * moe/fused_moe/fused_moe.cu @@ -1052,66 +1141,108 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * moe/fused_moe/moe_dispatch.cu * moe_expert_dispatch */ - m.def("moe_expert_dispatch", &MoeExpertDispatch, py::arg("input"), - py::arg("gating_output"), py::arg("gating_correction_bias"), - py::arg("w4a8_in_scale"), py::arg("moe_topk"), py::arg("group_moe"), - py::arg("moe_quant_type"), py::arg("topk_only_mode"), "moe export dispatch function"); + m.def("moe_expert_dispatch", + &MoeExpertDispatch, + py::arg("input"), + py::arg("gating_output"), + py::arg("gating_correction_bias"), + py::arg("w4a8_in_scale"), + py::arg("moe_topk"), + py::arg("group_moe"), + py::arg("moe_quant_type"), + py::arg("topk_only_mode"), + "moe export dispatch function"); /** * moe/fused_moe/ep_moe_prefill_func.cu * ep_moe_dispatch */ - m.def("ep_moe_expert_dispatch", &EPMoeExpertDispatch, py::arg("input"), - py::arg("topk_ids"), py::arg("topk_weights"), py::arg("up_gate_proj_in_scale"), - py::arg("token_nums_per_expert"), 
py::arg("token_nums_this_rank"), - py::arg("moe_quant_type"), "ep moe export dispatch function"); + m.def("ep_moe_expert_dispatch", + &EPMoeExpertDispatch, + py::arg("input"), + py::arg("topk_ids"), + py::arg("topk_weights"), + py::arg("up_gate_proj_in_scale"), + py::arg("token_nums_per_expert"), + py::arg("token_nums_this_rank"), + py::arg("moe_quant_type"), + "ep moe export dispatch function"); m.def("ep_moe_expert_dispatch_fp8", &EPMoeExpertDispatchFP8); - m.def("ep_moe_expert_combine", &EPMoeExpertCombine, py::arg("ffn_out"), - py::arg("expert_scales_float"), py::arg("permute_indices_per_token"), - py::arg("top_k_indices"), py::arg("down_proj_bias"), - py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"), + m.def("ep_moe_expert_combine", + &EPMoeExpertCombine, + py::arg("ffn_out"), + py::arg("expert_scales_float"), + py::arg("permute_indices_per_token"), + py::arg("top_k_indices"), + py::arg("down_proj_bias"), + py::arg("norm_topk_prob"), + py::arg("routed_scaling_factor"), "ep moe export combine function"); - m.def("per_token_quant", &PerTokenQuant, py::arg("input"), - py::arg("block_size"), "per token per block quant"); + m.def("per_token_quant", + &PerTokenQuant, + py::arg("input"), + py::arg("block_size"), + "per token per block quant"); - m.def("per_token_quant_padding", &PerTokenQuantPadding, py::arg("input"), + m.def("per_token_quant_padding", + &PerTokenQuantPadding, + py::arg("input"), py::arg("block_size"), "per token per block quant and padding transpose scale"); - m.def("masked_per_token_quant", &MaskedPerTokenQuant, py::arg("input"), - py::arg("recv_expert_count"), py::arg("block_size"), + m.def("masked_per_token_quant", + &MaskedPerTokenQuant, + py::arg("input"), + py::arg("recv_expert_count"), + py::arg("block_size"), "per token per block quant"); #ifdef ENABLE_MACHETE /*machete/machete_mm.cu * machete_mm */ - m.def("machete_mm", &MacheteMMKernel, py::arg("A"), py::arg("B"), py::arg("maybe_group_scale"), - py::arg("maybe_group_zeros"), py::arg("maybe_channel_scales"), py::arg("maybe_token_scales"), - py::arg("b_type_str"), py::arg("maybe_out_type_str"), py::arg("maybe_group_size"), + m.def("machete_mm", + &MacheteMMKernel, + py::arg("A"), + py::arg("B"), + py::arg("maybe_group_scale"), + py::arg("maybe_group_zeros"), + py::arg("maybe_channel_scales"), + py::arg("maybe_token_scales"), + py::arg("b_type_str"), + py::arg("maybe_out_type_str"), + py::arg("maybe_group_size"), py::arg("maybe_schedule"), "machete mm function"); /*machete/machete_prepack_B.cu * machete_prepack_B */ - m.def("machete_prepack_B", &MachetePrepackBKernel, "machete prepacked B function"); + m.def("machete_prepack_B", + &MachetePrepackBKernel, + "machete prepacked B function"); /*machete/machete_supported_schedules.cu * machete_supported_schedules */ - m.def("machete_supported_schedules", &MacheteSupportedSchedules, "machete supported schedules function"); + m.def("machete_supported_schedules", + &MacheteSupportedSchedules, + "machete supported schedules function"); #endif /** * moe/fused_moe/moe_topk_select.cu * moe_topk_select */ - m.def("moe_topk_select", &MoETopKSelectKernel, py::arg("gating_logits"), - py::arg("bias"), py::arg("moe_topk"), py::arg("apply_norm_weight"), + m.def("moe_topk_select", + &MoETopKSelectKernel, + py::arg("gating_logits"), + py::arg("bias"), + py::arg("moe_topk"), + py::arg("apply_norm_weight"), py::arg("enable_softmax_top_k_fused"), "moe export TopKSelect function"); @@ -1125,16 +1256,23 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * moe/fused_moe/moe_expert_ffn_wint2.cu 
* moe_expert_ffn_wint2 */ - m.def("moe_expert_ffn_wint2", &MoeExpertFFNWint2Func, "moe export ffn wint2 function"); + m.def("moe_expert_ffn_wint2", + &MoeExpertFFNWint2Func, + "moe export ffn wint2 function"); /** * moe/fused_moe/moe_expert_reduce.cu * moe_expert_reduce */ - m.def("moe_expert_reduce", &MoeExpertReduceFunc, py::arg("ffn_out"), - py::arg("top_k_weight"), py::arg("permute_indices_per_token"), - py::arg("top_k_indices"), py::arg("down_proj_bias"), - py::arg("norm_topk_prob"), py::arg("routed_scaling_factor"), + m.def("moe_expert_reduce", + &MoeExpertReduceFunc, + py::arg("ffn_out"), + py::arg("top_k_weight"), + py::arg("permute_indices_per_token"), + py::arg("top_k_indices"), + py::arg("down_proj_bias"), + py::arg("norm_topk_prob"), + py::arg("routed_scaling_factor"), "moe export reduce function"); /** @@ -1147,14 +1285,16 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * init_signal_layerwise.cc * init_signal_layerwise */ - m.def("init_signal_layerwise", &InitSignalLayerwiseFunc, + m.def("init_signal_layerwise", + &InitSignalLayerwiseFunc, "init_signal_layerwise function"); /** * open_shm_and_get_meta_signal.cc * open_shm_and_get_meta_signal */ - m.def("open_shm_and_get_meta_signal", &OpenShmAndGetMetaSignalFunc, + m.def("open_shm_and_get_meta_signal", + &OpenShmAndGetMetaSignalFunc, "open_shm_and_get_meta_signal function"); /** @@ -1162,7 +1302,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * get_block_shape_and_split_kv_block */ m.def("get_block_shape_and_split_kv_block", - &GetBlockShapeAndSplitKVBlock, "get_block_shape_and_split_kv_block function"); + &GetBlockShapeAndSplitKVBlock, + "get_block_shape_and_split_kv_block function"); /** * get_padding_offset.cu @@ -1174,7 +1315,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * get_padding_offset.cu * get_padding_offset */ - m.def("set_value_by_flags_and_idx", &SetValueByFlagsAndIdx, + m.def("set_value_by_flags_and_idx", + &SetValueByFlagsAndIdx, "SetValueByFlagsAndIdx"); /** @@ -1187,50 +1329,77 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * stop_generation_multi_ends.cu * set_stop_value_multi_ends */ - m.def("set_stop_value_multi_ends", &GetStopFlagsMulti, + m.def("set_stop_value_multi_ends", + &GetStopFlagsMulti, "update_inputs function"); - /** * update_inputs.cu * update_inputs */ m.def("update_inputs", &UpdateInputes, "update_inputs function"); - /** + /** * update_inputs_v1.cu * update_inputs_v1 */ - m.def("update_inputs_v1", &UpdateInputesV1, "update inputs for scheduler v1 function"); + m.def("update_inputs_v1", + &UpdateInputesV1, + "update inputs for scheduler v1 function"); - /** + /** * recover_decode_task.cu * recover_decode_task */ - m.def("recover_decode_task", &RecoverDecodeTask, "recover decode task for scheduler v1 function"); + m.def("recover_decode_task", + &RecoverDecodeTask, + "recover decode task for scheduler v1 function"); - m.def("group_swiglu_with_masked", &GroupSwigluWithMasked, + m.def("group_swiglu_with_masked", + &GroupSwigluWithMasked, "group_swiglu_with_masked function"); - m.def("text_image_index_out", &TextImageIndexOut, + m.def("text_image_index_out", + &TextImageIndexOut, "text_image_index_out function"); - m.def("text_image_gather_scatter", &TextImageGatherScatter, + m.def("text_image_gather_scatter", + &TextImageGatherScatter, "text_image_gather_scatter function"); m.def("count_tokens_per_expert_func", &count_tokens_per_expert_func); m.def("tritonmoe_preprocess_func", &tritonmoe_preprocess_kernel); - m.def("MoeWna16MarlinGemmApi", &MoeWna16MarlinGemmApi, - py::arg("a"), py::arg("c_or_none"), 
py::arg("b_q_weight"), - py::arg("b_scales"), py::arg("global_scale_or_none"), py::arg("b_zeros_or_none"), - py::arg("g_idx_or_none"), py::arg("perm_or_none"), py::arg("workspace"), py::arg("sorted_token_ids"), - py::arg("expert_ids"), py::arg("num_tokens_post_padded"), py::arg("topk_weights"), py::arg("moe_block_size"), - py::arg("top_k"), py::arg("mul_topk_weights"), py::arg("is_ep"), py::arg("b_q_type_str"), - py::arg("size_m"), py::arg("size_n"), py::arg("size_k"), py::arg("is_k_full"), py::arg("use_atomic_add"), - py::arg("use_fp32_reduce"), py::arg("is_zp_float")); - - m.def("get_position_ids_and_mask_encoder_batch", &GetPositionIdsAndMaskEncoderBatch, + m.def("MoeWna16MarlinGemmApi", + &MoeWna16MarlinGemmApi, + py::arg("a"), + py::arg("c_or_none"), + py::arg("b_q_weight"), + py::arg("b_scales"), + py::arg("global_scale_or_none"), + py::arg("b_zeros_or_none"), + py::arg("g_idx_or_none"), + py::arg("perm_or_none"), + py::arg("workspace"), + py::arg("sorted_token_ids"), + py::arg("expert_ids"), + py::arg("num_tokens_post_padded"), + py::arg("topk_weights"), + py::arg("moe_block_size"), + py::arg("top_k"), + py::arg("mul_topk_weights"), + py::arg("is_ep"), + py::arg("b_q_type_str"), + py::arg("size_m"), + py::arg("size_n"), + py::arg("size_k"), + py::arg("is_k_full"), + py::arg("use_atomic_add"), + py::arg("use_fp32_reduce"), + py::arg("is_zp_float")); + + m.def("get_position_ids_and_mask_encoder_batch", + &GetPositionIdsAndMaskEncoderBatch, "get_position_ids_and_mask_encoder_batch function"); /** @@ -1239,7 +1408,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * cutlass_scaled_mm_azp */ m.def("cutlass_scaled_mm", &CutlassScaledMm, "cutlass_scaled_mm function"); - m.def("cutlass_scaled_mm_azp", &CutlassScaledMmAzp, "cutlass_scaled_mm_azp function"); + m.def("cutlass_scaled_mm_azp", + &CutlassScaledMmAzp, + "cutlass_scaled_mm_azp function"); /** * quantization/common.cu @@ -1247,39 +1418,76 @@ PYBIND11_MODULE(fastdeploy_ops, m) { * dynamic_scaled_fp8_quant * dynamic_per_token_scaled_fp8_quant */ - m.def("static_scaled_fp8_quant", &StaticScaledFp8Quant, "static_scaled_fp8_quant function", - py::arg("out"), py::arg("input"), py::arg("scale")); - - m.def("dynamic_scaled_fp8_quant", &DynamicScaledFp8Quant, + m.def("static_scaled_fp8_quant", + &StaticScaledFp8Quant, + "static_scaled_fp8_quant function", + py::arg("out"), + py::arg("input"), + py::arg("scale")); + + m.def("dynamic_scaled_fp8_quant", + &DynamicScaledFp8Quant, "dynamic_scaled_fp8_quant function", - py::arg("out"), py::arg("input"), py::arg("scale")); + py::arg("out"), + py::arg("input"), + py::arg("scale")); - m.def("dynamic_per_token_scaled_fp8_quant", &DynamicPerTokenScaledFp8Quant, + m.def("dynamic_per_token_scaled_fp8_quant", + &DynamicPerTokenScaledFp8Quant, "dynamic_per_token_scaled_fp8_quant function", - py::arg("out"), py::arg("input"), py::arg("scales"), py::arg("scale_ub")); - m.def("decode_mla_write_cache", &DecodeMLAWriteCacheKernel, "decode_mla_write_cache function"); + py::arg("out"), + py::arg("input"), + py::arg("scales"), + py::arg("scale_ub")); + m.def("decode_mla_write_cache", + &DecodeMLAWriteCacheKernel, + "decode_mla_write_cache function"); - m.def("prefill_mla_write_cache", &PrefillMLAWriteCacheKernel, "prefill_mla_write_cache function"); + m.def("prefill_mla_write_cache", + &PrefillMLAWriteCacheKernel, + "prefill_mla_write_cache function"); - m.def("fused_rotary_position_encoding", &FusedRotaryPositionEncoding, "fused_rotary_position_encoding function"); + m.def("fused_rotary_position_encoding", + 
&FusedRotaryPositionEncoding, + "fused_rotary_position_encoding function"); - m.def("multi_head_latent_attention", &MultiHeadLatentAttention, "multi_head_latent_attention function"); + m.def("multi_head_latent_attention", + &MultiHeadLatentAttention, + "multi_head_latent_attention function"); - m.def("noaux_tc",&NoauxTc, "noaux_tc for Deepseekv3 MoE compute"); + m.def("noaux_tc", &NoauxTc, "noaux_tc for Deepseekv3 MoE compute"); #ifdef ENABLE_FP8 - m.def("cutlass_fp8_fp8_half_gemm_fused", &cutlass_fp8_fp8_half_gemm_func, - py::arg("x"), py::arg("y"), py::arg("bias"), py::arg("transpose_x"), - py::arg("transpose_y"), py::arg("scale"), py::arg("output_dtype"), - py::arg("activation_type"), "cutlass_fp8_fp8_half_gemm_fused function"); - m.def("moe_fused_hadamard_quant_fp8", &MoeFusedHadamardQuantFp8Func, - py::arg("input"), py::arg("scale"), py::arg("topk_ids"), - py::arg("top_k"), py::arg("intermediate_size"), py::arg("tiled"), "moe_fused_hadamard_quant_fp8 function"); - m.def("fused_hadamard_quant_fp8", &FusedHadamardQuantFp8Func, - py::arg("input"), py::arg("scale"), "fused_hadamard_quant_fp8 function"); + m.def("cutlass_fp8_fp8_half_gemm_fused", + &cutlass_fp8_fp8_half_gemm_func, + py::arg("x"), + py::arg("y"), + py::arg("bias"), + py::arg("transpose_x"), + py::arg("transpose_y"), + py::arg("scale"), + py::arg("output_dtype"), + py::arg("activation_type"), + "cutlass_fp8_fp8_half_gemm_fused function"); + m.def("moe_fused_hadamard_quant_fp8", + &MoeFusedHadamardQuantFp8Func, + py::arg("input"), + py::arg("scale"), + py::arg("topk_ids"), + py::arg("top_k"), + py::arg("intermediate_size"), + py::arg("tiled"), + "moe_fused_hadamard_quant_fp8 function"); + m.def("fused_hadamard_quant_fp8", + &FusedHadamardQuantFp8Func, + py::arg("input"), + py::arg("scale"), + "fused_hadamard_quant_fp8 function"); #endif - m.def("init_custom_all_reduce", &init_custom_all_reduce, "init all reduce class function"); + m.def("init_custom_all_reduce", + &init_custom_all_reduce, + "init all reduce class function"); m.def("all_reduce", &all_reduce, "all reduce function"); @@ -1289,9 +1497,13 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("register_buffer", ®ister_buffer, "register ipc buffer"); - m.def("register_graph_buffers", ®ister_graph_buffers, "register_graph_buffers"); + m.def("register_graph_buffers", + ®ister_graph_buffers, + "register_graph_buffers"); - m.def("allocate_shared_buffer_and_handle", &allocate_shared_buffer_and_handle, "allocate_shared_buffer_and_handle"); + m.def("allocate_shared_buffer_and_handle", + &allocate_shared_buffer_and_handle, + "allocate_shared_buffer_and_handle"); m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer"); @@ -1299,52 +1511,86 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("open_mem_handle", &open_mem_handle, "open_mem_handle"); - m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta"); + m.def("get_graph_buffer_ipc_meta", + &get_graph_buffer_ipc_meta, + "get_graph_buffer_ipc_meta"); // speculative decoding Kernel - m.def("speculate_get_padding_offset", &SpeculateGetPaddingOffset, "speculate_get_padding_offset function"); + m.def("speculate_get_padding_offset", + &SpeculateGetPaddingOffset, + "speculate_get_padding_offset function"); - m.def("speculate_get_seq_lens_output", &SpeculateGetSeqLensOutput, "speculate_get_seq_lens_output function"); + m.def("speculate_get_seq_lens_output", + &SpeculateGetSeqLensOutput, + "speculate_get_seq_lens_output function"); - 
m.def("speculate_get_output_padding_offset",&SpeculateGetOutputPaddingOffset, "speculate_get_output_padding_offset function"); + m.def("speculate_get_output_padding_offset", + &SpeculateGetOutputPaddingOffset, + "speculate_get_output_padding_offset function"); - m.def("speculate_get_token_penalty_multi_scores",&SpecTokenPenaltyMultiScores, "speculate_get_token_penalty_multi_scores function"); + m.def("speculate_get_token_penalty_multi_scores", + &SpecTokenPenaltyMultiScores, + "speculate_get_token_penalty_multi_scores function"); - m.def("speculate_set_stop_value_multi_seqs",&SpecGetStopFlagsMultiSeqs, "speculate_set_stop_value_multi_seqs function"); + m.def("speculate_set_stop_value_multi_seqs", + &SpecGetStopFlagsMultiSeqs, + "speculate_set_stop_value_multi_seqs function"); - m.def("speculate_verify",&SpeculateVerify, "speculate_verify function"); + m.def("speculate_verify", &SpeculateVerify, "speculate_verify function"); - m.def("speculate_update",&SpeculateUpdate, "Speculate Update Kernel"); + m.def("speculate_update", &SpeculateUpdate, "Speculate Update Kernel"); - m.def("speculate_set_value_by_flags_and_idx",&SpeculateSetValueByFlagsAndIdx, "speculate_set_value_by_flags_and_idx function"); + m.def("speculate_set_value_by_flags_and_idx", + &SpeculateSetValueByFlagsAndIdx, + "speculate_set_value_by_flags_and_idx function"); - m.def("speculate_save_output", &SpeculateSaveWithOutputMsgStatic, "speculate_save_output function"); + m.def("speculate_save_output", + &SpeculateSaveWithOutputMsgStatic, + "speculate_save_output function"); - m.def("speculate_clear_accept_nums",&SpeculateClearAcceptNums, "speculate_clear_accept_nums function"); + m.def("speculate_clear_accept_nums", + &SpeculateClearAcceptNums, + "speculate_clear_accept_nums function"); - m.def("speculate_schedule_cache",&SpeculateScheduleCache, "SpeculateScheduleCache function"); + m.def("speculate_schedule_cache", + &SpeculateScheduleCache, + "SpeculateScheduleCache function"); m.def("ngram_match", &NgramMatch, "ngram_match function"); m.def("hybird_mtp_ngram", &HybridMtpNgram, "ngram_match_mixed function"); - m.def("draft_model_postprocess",&DraftModelPostprocess, "draft_model_postprocess function"); + m.def("draft_model_postprocess", + &DraftModelPostprocess, + "draft_model_postprocess function"); - m.def("draft_model_preprocess",&DraftModelPreprocess, "draft_model_preprocess function"); + m.def("draft_model_preprocess", + &DraftModelPreprocess, + "draft_model_preprocess function"); - m.def("draft_model_update",&DraftModelUpdate, "draft_model_update function"); + m.def("draft_model_update", &DraftModelUpdate, "draft_model_update function"); - m.def("eagle_get_hidden_states",&EagleGetHiddenStates, "eagle_get_hidden_states function"); + m.def("eagle_get_hidden_states", + &EagleGetHiddenStates, + "eagle_get_hidden_states function"); - m.def("eagle_get_self_hidden_states", &EagleGetSelfHiddenStates, "eagle_get_self_hidden_states function"); + m.def("eagle_get_self_hidden_states", + &EagleGetSelfHiddenStates, + "eagle_get_self_hidden_states function"); - m.def("mtp_step_paddle",&MTPStepPaddle, "mtp_step_paddle function"); + m.def("mtp_step_paddle", &MTPStepPaddle, "mtp_step_paddle function"); - m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function"); + m.def("speculate_step_paddle", + &SpeculateStepPaddle, + "speculate_step_paddle function"); - m.def("merge_prefill_decode_output", &MergePrefillDecodeOutput, "merge_prefill_decode_output function"); + m.def("merge_prefill_decode_output", + 
&MergePrefillDecodeOutput, + "merge_prefill_decode_output function"); - m.def("rejection_top_p_sampling", &TopPSamplingReject, "rejection_top_p_sampling function"); + m.def("rejection_top_p_sampling", + &TopPSamplingReject, + "rejection_top_p_sampling function"); m.def("top_k_renorm_probs", &TopKRenorm, "top_k_renorm_probs function"); @@ -1352,17 +1598,31 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("save_output", &SaveOutMmsgStatic, "save_output function"); - m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "limit_thinking_content_length_v1 function"); + m.def("limit_thinking_content_length_v1", + &LimitThinkingContentLengthV1, + "limit_thinking_content_length_v1 function"); - m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "limit_thinking_content_length_v2 function"); + m.def("limit_thinking_content_length_v2", + &LimitThinkingContentLengthV2, + "limit_thinking_content_length_v2 function"); - m.def("speculate_limit_thinking_content_length_v1", &SpeculateLimitThinkingContentLengthV1, "speculate limit thinking content length function"); + m.def("speculate_limit_thinking_content_length_v1", + &SpeculateLimitThinkingContentLengthV1, + "speculate limit thinking content length function"); - m.def("speculate_limit_thinking_content_length_v2", &SpeculateLimitThinkingContentLengthV2, "speculate limit thinking content length function"); + m.def("speculate_limit_thinking_content_length_v2", + &SpeculateLimitThinkingContentLengthV2, + "speculate limit thinking content length function"); - m.def("speculate_get_logits", &SpeculateGetLogits, "speculate_get_logits function"); + m.def("speculate_get_logits", + &SpeculateGetLogits, + "speculate_get_logits function"); - m.def("speculate_insert_first_token", &SpeculateInsertFirstToken, "speculate_insert_first_token function"); + m.def("speculate_insert_first_token", + &SpeculateInsertFirstToken, + "speculate_insert_first_token function"); - m.def("speculate_get_target_logits", &SpeculateGetTargetLogits, "speculate_get_target_logits function"); + m.def("speculate_get_target_logits", + &SpeculateGetTargetLogits, + "speculate_get_target_logits function"); } diff --git a/custom_ops/gpu_ops/update_inputs_v1.cu b/custom_ops/gpu_ops/update_inputs_v1.cu index 33076b073ca..a4f72e05ae1 100644 --- a/custom_ops/gpu_ops/update_inputs_v1.cu +++ b/custom_ops/gpu_ops/update_inputs_v1.cu @@ -16,145 +16,174 @@ template __global__ void update_inputs_kernel_v1(bool *not_need_stop, - int *seq_lens_this_time, - int *seq_lens_encoder, - int *seq_lens_decoder, - int *step_seq_lens_decoder, - int64_t *prompt_lens, - int64_t *topk_ids, - int64_t *input_ids, - int *block_tables, - const int64_t *stop_nums, - bool *stop_flags, - bool *is_block_step, - const int64_t *next_tokens, - const int bsz, - const int max_bsz, - const int input_ids_stride, - const int block_num_per_seq, - const int block_size, - bool prefill_one_step_stop) { - int thread_idx = threadIdx.x; - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; + int *seq_lens_this_time, + int *seq_lens_encoder, + int *seq_lens_decoder, + int *step_seq_lens_decoder, + int64_t *prompt_lens, + int64_t *topk_ids, + int64_t *input_ids, + int *block_tables, + const int64_t *stop_nums, + bool *stop_flags, + bool *is_block_step, + const int64_t *next_tokens, + const int bsz, + const int max_bsz, + const int input_ids_stride, + const int block_num_per_seq, + const int block_size, + bool prefill_one_step_stop, + bool is_pooling_task) { + int 
thread_idx = threadIdx.x; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; - bool stop_flag_now = false; - int64_t stop_flag_now_int = 0; - if (thread_idx < max_bsz) { - if (thread_idx < bsz) { - stop_flag_now = stop_flags[thread_idx]; - stop_flag_now_int = static_cast(stop_flag_now); - } else { + bool stop_flag_now = false; + int64_t stop_flag_now_int = 0; + + if (thread_idx < max_bsz) { + if (thread_idx < bsz) { + stop_flag_now = stop_flags[thread_idx]; + stop_flag_now_int = static_cast(stop_flag_now); + } else { + stop_flag_now_int = 1; + } + } + + if (thread_idx < bsz) { + if (stop_flag_now) { + seq_lens_this_time[thread_idx] = 0; + seq_lens_decoder[thread_idx] = 0; + seq_lens_encoder[thread_idx] = 0; + + } else { + if (is_pooling_task) { + if (seq_lens_this_time[thread_idx] > 0) { + int total_processed = + seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx]; + + if (total_processed >= prompt_lens[thread_idx]) { + stop_flags[thread_idx] = true; + seq_lens_encoder[thread_idx] = 0; + seq_lens_decoder[thread_idx] = 0; + seq_lens_this_time[thread_idx] = 0; stop_flag_now_int = 1; + } + } else { + seq_lens_encoder[thread_idx] = 0; } - } - if (thread_idx < bsz) { - if(stop_flag_now) { - seq_lens_this_time[thread_idx] = 0; // stop at next step + } else { + if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= + prompt_lens[thread_idx]) { + if (prefill_one_step_stop) { + stop_flags[thread_idx] = true; + seq_lens_this_time[thread_idx] = 0; seq_lens_decoder[thread_idx] = 0; seq_lens_encoder[thread_idx] = 0; - } else { - if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= prompt_lens[thread_idx]) { - if (prefill_one_step_stop) { - // prefill done, stop - stop_flags[thread_idx] = true; - seq_lens_this_time[thread_idx] = 0; - seq_lens_decoder[thread_idx] = 0; - seq_lens_encoder[thread_idx] = 0; - stop_flag_now_int = 1; - } else{ - // decoding - seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx]; - seq_lens_this_time[thread_idx] = 1; - seq_lens_encoder[thread_idx] = 0; - int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride; - input_ids_now[0] = next_tokens[thread_idx]; + stop_flag_now_int = 1; + } else { + // decoding + seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx]; + seq_lens_this_time[thread_idx] = 1; + seq_lens_encoder[thread_idx] = 0; + int64_t *input_ids_now = input_ids + thread_idx * input_ids_stride; + input_ids_now[0] = next_tokens[thread_idx]; - // to judge whether block is not enough - int *block_table_now = block_tables + thread_idx * block_num_per_seq; - if (seq_lens_this_time[thread_idx] != 0 && block_table_now[seq_lens_decoder[thread_idx] / block_size] == -1) { - // should be scheduled by server - is_block_step[thread_idx] = true; - seq_lens_this_time[thread_idx]= 0; - stop_flags[thread_idx] = true; - step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx]; - seq_lens_decoder[thread_idx] = 0; - stop_flag_now_int = 1; - } - } - } else - { - stop_flags[thread_idx] = true; - seq_lens_this_time[thread_idx] = 0; - seq_lens_decoder[thread_idx] = 0; - seq_lens_encoder[thread_idx] = 0; - topk_ids[thread_idx] = -1; - stop_flag_now_int = 1; + int *block_table_now = + block_tables + thread_idx * block_num_per_seq; + if (seq_lens_this_time[thread_idx] != 0 && + block_table_now[seq_lens_decoder[thread_idx] / block_size] == + -1) { + is_block_step[thread_idx] = true; + seq_lens_this_time[thread_idx] = 0; + stop_flags[thread_idx] = true; + 
step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx]; + seq_lens_decoder[thread_idx] = 0; + stop_flag_now_int = 1; } + } + } else { + stop_flags[thread_idx] = true; + seq_lens_this_time[thread_idx] = 0; + seq_lens_decoder[thread_idx] = 0; + seq_lens_encoder[thread_idx] = 0; + topk_ids[thread_idx] = -1; + stop_flag_now_int = 1; } + } } - __syncthreads(); - int64_t stop_sum = BlockReduce(temp_storage).Sum(stop_flag_now_int); - if (thread_idx == 0) { - not_need_stop[0] = stop_sum < stop_nums[0]; - } + } + + __syncthreads(); + int64_t stop_sum = BlockReduce(temp_storage).Sum(stop_flag_now_int); + if (thread_idx == 0) { + not_need_stop[0] = stop_sum < stop_nums[0]; + } } void UpdateInputesV1(const paddle::Tensor &stop_flags, - const paddle::Tensor ¬_need_stop, // only on cpu - const paddle::Tensor &seq_lens_this_time, - const paddle::Tensor &seq_lens_encoder, - const paddle::Tensor &seq_lens_decoder, - const paddle::Tensor &step_seq_lens_decoder, - const paddle::Tensor &prompt_lens, - const paddle::Tensor &topk_ids, - const paddle::Tensor &input_ids, - const paddle::Tensor &block_tables, - const paddle::Tensor &stop_nums, - const paddle::Tensor &next_tokens, - const paddle::Tensor &is_block_step, - const int block_size) { + const paddle::Tensor ¬_need_stop, // only on cpu + const paddle::Tensor &seq_lens_this_time, + const paddle::Tensor &seq_lens_encoder, + const paddle::Tensor &seq_lens_decoder, + const paddle::Tensor &step_seq_lens_decoder, + const paddle::Tensor &prompt_lens, + const paddle::Tensor &topk_ids, + const paddle::Tensor &input_ids, + const paddle::Tensor &block_tables, + const paddle::Tensor &stop_nums, + const paddle::Tensor &next_tokens, + const paddle::Tensor &is_block_step, + const int block_size, + const bool is_pooling_task) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto dev_ctx = static_cast(paddle::experimental::DeviceContextPool::Instance().Get(input_ids.place())); - auto cu_stream = dev_ctx->stream(); + auto dev_ctx = static_cast( + paddle::experimental::DeviceContextPool::Instance().Get( + input_ids.place())); + auto cu_stream = dev_ctx->stream(); #else - auto cu_stream = input_ids.stream(); + auto cu_stream = input_ids.stream(); #endif - bool prefill_one_step_stop = false; - if (const char *env_p = std::getenv("PREFILL_NODE_ONE_STEP_STOP_V1")) { - if (env_p[0] == '1') { - prefill_one_step_stop = true; - } + bool prefill_one_step_stop = false; + if (const char *env_p = std::getenv("PREFILL_NODE_ONE_STEP_STOP_V1")) { + if (env_p[0] == '1') { + prefill_one_step_stop = true; } - const int max_bsz = stop_flags.shape()[0]; - const int now_bsz = seq_lens_this_time.shape()[0]; - const int input_ids_stride = input_ids.shape()[1]; - const int block_num_per_seq = block_tables.shape()[1]; - auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false); - update_inputs_kernel_v1<1024><<<1, 1024, 0, cu_stream>>>( - const_cast(not_need_stop_gpu.data()), - const_cast(seq_lens_this_time.data()), - const_cast(seq_lens_encoder.data()), - const_cast(seq_lens_decoder.data()), - const_cast(step_seq_lens_decoder.data()), - const_cast(prompt_lens.data()), - const_cast(topk_ids.data()), - const_cast(input_ids.data()), - const_cast(block_tables.data()), - stop_nums.data(), - const_cast(stop_flags.data()), - const_cast(is_block_step.data()), - next_tokens.data(), - now_bsz, - max_bsz, - input_ids_stride, - block_num_per_seq, - block_size, - prefill_one_step_stop); - auto not_need_stop_cpu = - not_need_stop_gpu.copy_to(not_need_stop.place(), false); - bool *not_need_stop_data 
= const_cast(not_need_stop.data()); - not_need_stop_data[0] = not_need_stop_cpu.data()[0]; + } + const int max_bsz = stop_flags.shape()[0]; + const int now_bsz = seq_lens_this_time.shape()[0]; + + const int bsz_to_process = is_pooling_task ? max_bsz : now_bsz; + + const int input_ids_stride = input_ids.shape()[1]; + const int block_num_per_seq = block_tables.shape()[1]; + auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false); + update_inputs_kernel_v1<1024><<<1, 1024, 0, cu_stream>>>( + const_cast(not_need_stop_gpu.data()), + const_cast(seq_lens_this_time.data()), + const_cast(seq_lens_encoder.data()), + const_cast(seq_lens_decoder.data()), + const_cast(step_seq_lens_decoder.data()), + const_cast(prompt_lens.data()), + const_cast(topk_ids.data()), + const_cast(input_ids.data()), + const_cast(block_tables.data()), + stop_nums.data(), + const_cast(stop_flags.data()), + const_cast(is_block_step.data()), + next_tokens.data(), + bsz_to_process, + max_bsz, + input_ids_stride, + block_num_per_seq, + block_size, + prefill_one_step_stop, + is_pooling_task); + auto not_need_stop_cpu = + not_need_stop_gpu.copy_to(not_need_stop.place(), false); + bool *not_need_stop_data = const_cast(not_need_stop.data()); + not_need_stop_data[0] = not_need_stop_cpu.data()[0]; } PD_BUILD_STATIC_OP(update_inputs_v1) @@ -171,7 +200,7 @@ PD_BUILD_STATIC_OP(update_inputs_v1) "stop_nums", "next_tokens", "is_block_step"}) - .Attrs({"block_size: int"}) + .Attrs({"block_size: int", "is_pooling_task:bool"}) .Outputs({"not_need_stop_out", "seq_lens_this_time_out", "seq_lens_encoder_out", diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 42b29d433ea..6848fbdb4a5 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -728,6 +728,7 @@ def _fetch_request(): raise # 2. Schedule requests tasks = self.resource_manager.schedule() + # 3. 
Send to engine if tasks: if self.cfg.scheduler_config.splitwise_role == "decode": @@ -877,24 +878,27 @@ def _zmq_send_generated_tokens(self): for request_id, contents in results.items(): new_contents = [] for content in contents: - decode_type = content.outputs.decode_type - delta_text = "" - if decode_type == 0: - delta_text, token_ids = self._decode_token( - token_ids=content.outputs.token_ids, req_id=request_id, is_end=content.finished - ) + if isinstance(content, RequestOutput): + decode_type = content.outputs.decode_type + delta_text = "" + if decode_type == 0: + delta_text, token_ids = self._decode_token( + token_ids=content.outputs.token_ids, req_id=request_id, is_end=content.finished + ) + else: + token_ids = content.outputs.token_ids + if len(token_ids): + content.outputs.token_ids = token_ids + content.outputs.text = delta_text + new_contents.append(content) + elif content.finished: + new_contents.append(content) + else: + llm_logger.warning( + f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}" + ) else: - token_ids = content.outputs.token_ids - if len(token_ids): - content.outputs.token_ids = token_ids - content.outputs.text = delta_text - new_contents.append(content) - elif content.finished: new_contents.append(content) - else: - llm_logger.warning( - f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}" - ) if len(new_contents): llm_logger.info(f"Send response for request id: {request_id}") self.send_response_server.send_response(request_id, new_contents) diff --git a/fastdeploy/model_executor/layers/pooler.py b/fastdeploy/model_executor/layers/pooler.py index b4e3a6fc9e3..ec9bee1e9f1 100644 --- a/fastdeploy/model_executor/layers/pooler.py +++ b/fastdeploy/model_executor/layers/pooler.py @@ -305,19 +305,6 @@ def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], poolin return pooled_data -def build_output( - all_data: Union[paddle.Tensor, list[paddle.Tensor]], -) -> PoolerOutput: - # Pooling models D2H & synchronize occurs here - if isinstance(all_data, list): - all_data = [d.cpu() for d in all_data] - else: - all_data = all_data.cpu() - - all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data] - return PoolerOutput(outputs=all_outputs) - - class PoolingMethod(nn.Layer, ABC): @staticmethod @@ -473,8 +460,11 @@ def forward( pooling_metadata: PoolingMetadata, ) -> PoolerOutput: pooled_data = self.extract_states(hidden_states, pooling_metadata) - pooled_data = self.head(pooled_data, pooling_metadata) - return build_output(pooled_data) + pooling_params = get_pooling_params(pooling_metadata) + assert len(pooled_data) == len(pooling_params) + + pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)] + return pooled_data class SimplePooler(Pooler): @@ -520,7 +510,7 @@ def forward( ) -> PoolerOutput: pooled_data = self.pooling(hidden_states, pooling_metadata) pooled_data = self.head(pooled_data, pooling_metadata) - return build_output(pooled_data) + return pooled_data class PoolerNormalize(PoolerActivation): @@ -567,7 +557,7 @@ def forward( hidden_states, pooling_metadata[offset : offset + num_items], ) - outputs.extend(group_output.outputs) + outputs.extend(group_output) offset += num_items return PoolerOutput(outputs) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index bddb12b496b..f1158f9be84 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ 
b/fastdeploy/model_executor/pre_and_post_process.py
@@ -15,7 +15,7 @@
 """
 
 import queue
-from typing import Dict, Optional
+from typing import Dict, Optional, Union
 
 import numpy as np
 import paddle
@@ -85,6 +85,7 @@
     speculate_limit_thinking_content_length_v2,
 )
 
+from fastdeploy.output.pooler import PoolerOutput
 from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput
 
@@ -238,17 +239,33 @@ def pre_process(
     )
 
 
-def _build_stream_transfer_data(output_tokens: np.ndarray):
+def _build_stream_transfer_data(output_tokens: np.ndarray, pooler_outputs=None):
     """Split output_tokens and output"""
-    output_tokens = output_tokens.reshape([-1]).numpy()
-    output_tokens_lists = np.split(output_tokens, output_tokens.shape[0])
 
     stream_transfer_datas = []
-    for bid, output_token_per_sample in enumerate(output_tokens_lists):
-        stream_transfer_data = StreamTransferData(
-            decoder_state=DecoderState.TEXT, tokens=output_token_per_sample, batch_id=bid
-        )
-        stream_transfer_datas.append(stream_transfer_data)
+    if output_tokens is not None:
+
+        output_tokens = output_tokens.reshape([-1]).numpy()
+        output_tokens_lists = np.split(output_tokens, output_tokens.shape[0])
+
+        for bid, output_token_per_sample in enumerate(output_tokens_lists):
+            stream_transfer_data = StreamTransferData(
+                decoder_state=DecoderState.TEXT, tokens=output_token_per_sample, batch_id=bid
+            )
+            stream_transfer_datas.append(stream_transfer_data)
+    elif pooler_outputs is not None:
+        for bid, pooler_output in enumerate(pooler_outputs):
+            if pooler_output.dtype == paddle.bfloat16:
+                pooler_output = pooler_output.astype("float32")
+
+            pooler_output = pooler_output.numpy()
+            if pooler_output.dtype != np.float32:
+                pooler_output = pooler_output.astype(np.float32)
+
+            stream_transfer_data = StreamTransferData(
+                decoder_state=DecoderState.TEXT, pooler_output=pooler_output, batch_id=bid
+            )
+            stream_transfer_datas.append(stream_transfer_data)
 
     return stream_transfer_datas
 
@@ -301,6 +318,7 @@ def post_process_normal(
             model_output.stop_token_ids,
             model_output.stop_seqs_len,
             False,
+            is_pooling=False,
         )  # multi ends
     elif current_platform.is_maca():
         set_stop_value_multi_ends(
@@ -314,6 +332,7 @@
             model_output.stop_token_ids,
             model_output.stop_seqs_len,
             False,
+            is_pooling=False,
         )  # multi ends
     else:
         set_stop_value_multi_ends(
@@ -343,6 +362,7 @@
             model_output.next_tokens,
             model_output.is_block_step,
             block_size,
+            False,  # is_pooling_task=False, passed positionally
         )
     else:
         update_inputs(
@@ -458,7 +478,7 @@ def post_process_specualate(
 
 
 def post_process(
-    sampler_output: SamplerOutput,
+    sampler_or_pooler_output: Union[SamplerOutput, PoolerOutput],
     model_output: ModelOutputData,
     share_inputs: Dict[str, paddle.Tensor],
     block_size: int = 64,
@@ -470,28 +490,30 @@
     line_break_id: int = -1,
 ) -> None:
     """Post-processing steps after completing a single token generation."""
-    if speculative_decoding:
-        post_process_specualate(
-            sampler_output,
-            model_output,
-            share_inputs,
-            save_each_rank,
-            skip_save_output,
-            think_end_id,
-            line_break_id,
-        )
-    else:
-        post_process_normal(
-            sampler_output,
+
+    if isinstance(sampler_or_pooler_output, PoolerOutput):
+        post_process_pooling(
+            sampler_or_pooler_output,
             model_output,
             share_inputs,
             block_size,
             save_each_rank,
             skip_save_output,
             async_output_queue,
-            think_end_id,
-            line_break_id,
         )
+    else:
+        if speculative_decoding:
+            
post_process_specualate(sampler_or_pooler_output, model_output, save_each_rank, skip_save_output) + else: + post_process_normal( + sampler_or_pooler_output, + model_output, + share_inputs, + block_size, + save_each_rank, + skip_save_output, + async_output_queue, + ) def step_cuda( @@ -775,3 +797,62 @@ def rebuild_padding( else: raise RuntimeError("Not supported platform") return hidden_states + + +def post_process_pooling( + pooler_output: PoolerOutput, + model_output: ModelOutputData, + share_inputs: Dict[str, paddle.Tensor], + block_size: int = 64, + save_each_rank: bool = False, + skip_save_output: bool = False, + async_output_queue: queue.Queue = None, +) -> None: + + paddle.assign( + paddle.where( + model_output.stop_flags, + model_output.step_idx, + model_output.step_idx + 1, + ), + model_output.step_idx, + ) + length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len) + + paddle.assign( + paddle.logical_or(model_output.stop_flags, length_cond), + model_output.stop_flags, + ) + + with paddle.framework._no_check_dy2st_diff(): + if envs.ENABLE_V1_KVCACHE_SCHEDULER: + dummy_sampled_tokens = paddle.full_like(model_output.next_tokens, -1, dtype="int64") + + paddle.assign( + paddle.ones_like(model_output.stop_flags, dtype="bool"), + model_output.stop_flags, + ) + update_inputs_v1( + model_output.stop_flags, + model_output.not_need_stop, + model_output.seq_lens_this_time, + model_output.seq_lens_encoder, + model_output.seq_lens_decoder, + share_inputs["step_seq_lens_decoder"], + share_inputs["prompt_lens"], + dummy_sampled_tokens, + model_output.input_ids, + share_inputs["block_tables"], + model_output.stop_nums, + model_output.next_tokens, + model_output.is_block_step, + block_size, + True, + ) + + if not skip_save_output: + if envs.FD_USE_GET_SAVE_OUTPUT_V1: + if save_each_rank or model_output.mp_rank == 0: + output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) + + async_output_queue.put(output) diff --git a/fastdeploy/output/stream_transfer_data.py b/fastdeploy/output/stream_transfer_data.py index 6241a28d990..f2d71f9fc2d 100644 --- a/fastdeploy/output/stream_transfer_data.py +++ b/fastdeploy/output/stream_transfer_data.py @@ -35,8 +35,8 @@ class StreamTransferData: """StreamTransferData""" decoder_state: DecoderState - tokens: np.array batch_id: int + tokens: Optional[np.array] = None speculaive_decoding: bool = False logprobs: Optional[np.array] = None accept_tokens: Optional[np.array] = None diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index e3de0803393..8dacb5cad73 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -26,6 +26,7 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig +from fastdeploy.engine.pooling_params import PoolingParams from fastdeploy.engine.request import Request, RequestType from fastdeploy.model_executor.graph_optimization.utils import ( GPUMemoryChecker, @@ -81,7 +82,6 @@ import zmq from fastdeploy import envs -from fastdeploy.engine.pooling_params import PoolingParams from fastdeploy.engine.tasks import PoolingTask from fastdeploy.input.ernie4_5_vl_processor import DataProcessor from fastdeploy.inter_communicator import IPCSignal, ZmqIpcClient @@ -310,9 +310,15 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = req_len = len(req_dicts) has_prefill_task = False has_decode_task = False + + batch_pooling_params = [] for i in range(req_len): 
request = req_dicts[i]
             idx = request.idx
+
+            if hasattr(request, "pooling_params") and request.pooling_params is not None:
+                batch_pooling_params.append(request.pooling_params)
+
             if request.task_type.value == RequestType.PREFILL.value:  # prefill task
                 prefill_start_index = request.prefill_start_index
                 prefill_end_index = request.prefill_end_index
@@ -354,24 +360,36 @@
                         position_ids, request.get("max_tokens", 2048)
                     )
 
-            if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
-                # Enable thinking
-                self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
-                self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
-            else:
-                # Disable thinking
-                self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
-                self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
+            if not self.is_pooling_model:
+                if request.get("enable_thinking", False):
+                    # Enable thinking
+                    req_reasoning_max_tokens = request.get("reasoning_max_tokens")
+                    req_max_tokens = request.get("max_tokens")
+                    final_reasoning_tokens = (
+                        req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens
+                    )
+
+                    self.share_inputs["enable_thinking"][idx : idx + 1] = True
+                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 1
+                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens
+                else:
+                    # Disable thinking
+                    self.share_inputs["enable_thinking"][idx : idx + 1] = False
+                    self.share_inputs["need_think_end"][idx : idx + 1, :] = 0
+                    self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0
 
             if isinstance(request.prompt_token_ids, np.ndarray):
                 prompt_token_ids = request.prompt_token_ids.tolist()
             else:
                 prompt_token_ids = request.prompt_token_ids
             input_ids = prompt_token_ids + request.output_token_ids
+            prompt_len = len(prompt_token_ids)
+            self.share_inputs["prompt_ids"][idx : idx + 1, :prompt_len] = np.array(prompt_token_ids, dtype="int64")
             logger.debug(
                 f"Handle prefill request {request} at idx {idx}, "
                 f"{prefill_start_index=}, {prefill_end_index=}, "
                 f"need_prefilled_token_num={len(input_ids)}"
+                f", prompt_len={prompt_len}"
             )
             self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(
                 input_ids[prefill_start_index:prefill_end_index]
@@ -466,6 +484,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int =
             else:
                 self.share_inputs["stop_seqs_len"][idx : idx + 1, :] = 0
 
+        self.pooling_params = batch_pooling_params
         if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True
         self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
@@ -586,14 +605,23 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests:
             )
             self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
 
-            if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
-                # Enable thinking
-                self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens")
-                self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
-            else:
-                # Disable thinking
-                self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1
-                self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
+            if not self.is_pooling_model:
+                if request.get("enable_thinking", False):
+                    # Enable thinking
+                    req_reasoning_max_tokens = request.get("reasoning_max_tokens")
+                    req_max_tokens = request.get("max_tokens")
+                    final_reasoning_tokens = (
+ req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens + ) + + self.share_inputs["enable_thinking"][idx : idx + 1] = True + self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens + else: + # Disable thinking + self.share_inputs["enable_thinking"][idx : idx + 1] = False + self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) @@ -1384,13 +1412,13 @@ def _dummy_pooler_run( ) -> PoolerOutput: output_size = dict[PoolingTask, float]() for task in self.get_supported_pooling_tasks(): + output = self._dummy_pooler_run_task(hidden_states, task) - output_size[task] = output.get_data_nbytes() + output_size[task] = sum(o.numel() * o.element_size() if hasattr(o, "numel") else 0 for o in output) del output max_task = max(output_size.items(), key=lambda x: x[1])[0] final_output = self._dummy_pooler_run_task(hidden_states, max_task) - return final_output def _dummy_sampler_run( @@ -1476,7 +1504,7 @@ def _dummy_sampler_run( ) post_process( - sampler_output=sampler_output, + sampler_or_pooler_output=sampler_output, model_output=model_output_data, share_inputs=self.share_inputs, block_size=self.cache_config.block_size, @@ -1833,171 +1861,251 @@ class at the server level, which is too granular for ModelRunner. ) if self.use_cudagraph: model_output = model_output[: self.real_token_num] - hidden_states = rebuild_padding( - model_output, - self.share_inputs["cu_seqlens_q"], - self.share_inputs["seq_lens_this_time"], - self.share_inputs["seq_lens_decoder"], - self.share_inputs["seq_lens_encoder"], - (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), - self.model_config.max_model_len, - ) - # 4. Compute logits, Sample - logits = None - if hasattr(self.model, "is_pooling_model") and self.model.is_pooling_model: - # TODO(lizexu123) The execution of the pooling function have not been implemented yet. 
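A note on the `_dummy_pooler_run` hunk above: `output.get_data_nbytes()` is replaced by a per-tensor byte count so the warm-up run can pick the most memory-hungry pooling task before allocating buffers. The following framework-free sketch shows that heuristic; `_FakeTensor` is an illustrative stand-in, not FastDeploy code.

class _FakeTensor:
    """Minimal object exposing the numel()/element_size() pair the probe relies on."""

    def __init__(self, shape, itemsize):
        self.shape, self.itemsize = shape, itemsize

    def numel(self):
        n = 1
        for d in self.shape:
            n *= d
        return n

    def element_size(self):
        return self.itemsize


outputs = [_FakeTensor((4, 1024), 4), object()]  # non-tensor entries count as zero bytes
nbytes = sum(o.numel() * o.element_size() if hasattr(o, "numel") else 0 for o in outputs)
print(nbytes)  # 16384 -> the task with the largest total is chosen for the real dummy run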
- pass - else: - logits = self.model.compute_logits(hidden_states) + hidden_states = model_output + if self.is_pooling_model: + pooler_output = self._pool(hidden_states, num_running_requests) + + model_output_data = ModelOutputData( + next_tokens=self.share_inputs["next_tokens"], + stop_flags=self.share_inputs["stop_flags"], + step_idx=self.share_inputs["step_idx"], + max_dec_len=self.share_inputs["max_dec_len"], + pre_ids=self.share_inputs["pre_ids"], + seq_lens_this_time=self.share_inputs["seq_lens_this_time"], + eos_token_id=self.share_inputs["eos_token_id"], + not_need_stop=self.share_inputs["not_need_stop"], + input_ids=self.share_inputs["input_ids"], + stop_nums=self.share_inputs["stop_nums"], + seq_lens_encoder=self.share_inputs["seq_lens_encoder"], + seq_lens_decoder=self.share_inputs["seq_lens_decoder"], + is_block_step=self.share_inputs["is_block_step"], + full_hidden_states=model_output, + msg_queue_id=self.parallel_config.msg_queue_id, + mp_rank=self.parallel_config.tensor_parallel_rank, + use_ep=self.parallel_config.use_ep, + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + stop_token_ids=self.share_inputs["stop_seqs"], + stop_seqs_len=self.share_inputs["stop_seqs_len"], + prompt_lens=self.share_inputs["prompt_lens"], + ) - if not self.speculative_decoding: - set_value_by_flags_and_idx( - self.share_inputs["pre_ids"], - self.share_inputs["input_ids"], - self.share_inputs["seq_lens_this_time"], - self.share_inputs["seq_lens_encoder"], - self.share_inputs["seq_lens_decoder"], - self.share_inputs["step_idx"], - self.share_inputs["stop_flags"], + post_process( + sampler_or_pooler_output=pooler_output, + model_output=model_output_data, + share_inputs=self.share_inputs, + block_size=self.cache_config.block_size, + save_each_rank=self.parallel_config.use_ep, + speculative_decoding=self.speculative_decoding, + skip_save_output=False, + async_output_queue=self.async_output_queue, ) - sampler_output = self.sampler( - logits, - self.sampling_metadata, - skip_idx_list, + + self.seq_lens_this_time_buffer[:num_running_requests].copy_( + self.share_inputs["seq_lens_this_time"][:num_running_requests], False ) - if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - sampler_output.sampled_token_ids, - self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, - group=self.parallel_config.tp_group, - ) + + return None else: - sampler_output = self.sampler( - logits, - self.sampling_metadata, + hidden_states = rebuild_padding( + model_output, + self.share_inputs["cu_seqlens_q"], + self.share_inputs["seq_lens_this_time"], + self.share_inputs["seq_lens_decoder"], + self.share_inputs["seq_lens_encoder"], + (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), self.model_config.max_model_len, - self.share_inputs, ) - if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], - self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, - group=self.parallel_config.tp_group, - ) - paddle.distributed.broadcast( - self.share_inputs["accept_num"], - self.parallel_config.data_parallel_rank * 
self.parallel_config.tensor_parallel_size, - group=self.parallel_config.tp_group, - ) - paddle.distributed.broadcast( + + # 4. Compute logits, Sample + logits = self.model.compute_logits(hidden_states) + + if not self.speculative_decoding: + set_value_by_flags_and_idx( + self.share_inputs["pre_ids"], + self.share_inputs["input_ids"], + self.share_inputs["seq_lens_this_time"], + self.share_inputs["seq_lens_encoder"], + self.share_inputs["seq_lens_decoder"], self.share_inputs["step_idx"], - self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, - group=self.parallel_config.tp_group, - ) - paddle.distributed.broadcast( self.share_inputs["stop_flags"], - self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, - group=self.parallel_config.tp_group, ) + sampler_output = self.sampler( + logits, + self.sampling_metadata, + skip_idx_list, + ) + if self.parallel_config.tensor_parallel_size > 1: + paddle.distributed.broadcast( + sampler_output.sampled_token_ids, + self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, + group=self.parallel_config.tp_group, + ) + else: + sampler_output = self.sampler( + logits, + self.sampling_metadata, + self.model_config.max_model_len, + self.share_inputs, + ) + if self.parallel_config.tensor_parallel_size > 1: + paddle.distributed.broadcast( + self.share_inputs["accept_tokens"], + self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, + group=self.parallel_config.tp_group, + ) + paddle.distributed.broadcast( + self.share_inputs["accept_num"], + self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, + group=self.parallel_config.tp_group, + ) + paddle.distributed.broadcast( + self.share_inputs["step_idx"], + self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, + group=self.parallel_config.tp_group, + ) + paddle.distributed.broadcast( + self.share_inputs["stop_flags"], + self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, + group=self.parallel_config.tp_group, + ) - # 5. 
Post Process - model_output_data = ModelOutputData( - next_tokens=self.share_inputs["next_tokens"], - stop_flags=self.share_inputs["stop_flags"], - step_idx=self.share_inputs["step_idx"], - max_dec_len=self.share_inputs["max_dec_len"], - pre_ids=self.share_inputs["pre_ids"], - seq_lens_this_time=self.share_inputs["seq_lens_this_time"], - eos_token_id=self.share_inputs["eos_token_id"], - not_need_stop=self.share_inputs["not_need_stop"], - input_ids=self.share_inputs["input_ids"], - stop_nums=self.share_inputs["stop_nums"], - seq_lens_encoder=self.share_inputs["seq_lens_encoder"], - seq_lens_decoder=self.share_inputs["seq_lens_decoder"], - is_block_step=self.share_inputs["is_block_step"], - full_hidden_states=model_output, - msg_queue_id=self.parallel_config.msg_queue_id, - mp_rank=self.parallel_config.tensor_parallel_rank, - use_ep=self.parallel_config.use_ep, - draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), - actual_draft_token_num=( - self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None - ), - accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), - accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - stop_token_ids=self.share_inputs["stop_seqs"], - stop_seqs_len=self.share_inputs["stop_seqs_len"], - prompt_lens=self.share_inputs["prompt_lens"], - ) + # 5. Post Process + model_output_data = ModelOutputData( + next_tokens=self.share_inputs["next_tokens"], + stop_flags=self.share_inputs["stop_flags"], + step_idx=self.share_inputs["step_idx"], + max_dec_len=self.share_inputs["max_dec_len"], + pre_ids=self.share_inputs["pre_ids"], + seq_lens_this_time=self.share_inputs["seq_lens_this_time"], + eos_token_id=self.share_inputs["eos_token_id"], + not_need_stop=self.share_inputs["not_need_stop"], + input_ids=self.share_inputs["input_ids"], + stop_nums=self.share_inputs["stop_nums"], + seq_lens_encoder=self.share_inputs["seq_lens_encoder"], + seq_lens_decoder=self.share_inputs["seq_lens_decoder"], + is_block_step=self.share_inputs["is_block_step"], + full_hidden_states=model_output, + msg_queue_id=self.parallel_config.msg_queue_id, + mp_rank=self.parallel_config.tensor_parallel_rank, + use_ep=self.parallel_config.use_ep, + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + stop_token_ids=self.share_inputs["stop_seqs"], + stop_seqs_len=self.share_inputs["stop_seqs_len"], + prompt_lens=self.share_inputs["prompt_lens"], + ) - if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill": - skip_save_output = True - else: - skip_save_output = False - post_process( - sampler_output=sampler_output, - model_output=model_output_data, - share_inputs=self.share_inputs, - block_size=self.cache_config.block_size, - save_each_rank=self.parallel_config.use_ep, - speculative_decoding=self.speculative_decoding, - skip_save_output=skip_save_output, - async_output_queue=self.async_output_queue, - think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, - ) - if self.guided_backend is not None and sampler_output is not None: - 
self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) + if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill": + skip_save_output = True + else: + skip_save_output = False + + post_process( + sampler_or_pooler_output=sampler_output, + model_output=model_output_data, + share_inputs=self.share_inputs, + block_size=self.cache_config.block_size, + save_each_rank=self.parallel_config.use_ep, + speculative_decoding=self.speculative_decoding, + skip_save_output=skip_save_output, + async_output_queue=self.async_output_queue, + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, + ) + if self.guided_backend is not None and sampler_output is not None: + self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) + + # 6. Speculative decode + if self.speculative_decoding: + if self.speculative_method == "mtp": + self.proposer.run( + full_hidden_states=model_output, step_use_cudagraph=self.forward_meta.step_use_cudagraph + ) + else: + self.proposer.run(share_inputs=self.share_inputs) - # 6. Speculative decode - if self.speculative_decoding: - if self.speculative_method == "mtp": - self.proposer.run( - full_hidden_states=model_output, step_use_cudagraph=self.forward_meta.step_use_cudagraph + # 7. Update 'infer_seed' and step_cuda() + self.share_inputs["infer_seed"].add_(self.infer_seed_increment) + self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED + if not envs.ENABLE_V1_KVCACHE_SCHEDULER: + step_cuda( + self.share_inputs, + self.cache_config.block_size, + self.cache_config.enc_dec_block_num, + self.speculative_config, + self.cache_config.enable_prefix_caching, ) - else: - self.proposer.run(share_inputs=self.share_inputs) - # 7. Update 'infer_seed' and step_cuda() - self.share_inputs["infer_seed"].add_(self.infer_seed_increment) - self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - if not envs.ENABLE_V1_KVCACHE_SCHEDULER: - step_cuda( - self.share_inputs, - self.cache_config.block_size, - self.cache_config.enc_dec_block_num, - self.speculative_config, - self.cache_config.enable_prefix_caching, - ) + self._update_chunked_prefill(model_forward_batch) + self._add_cache(model_forward_batch) + elif self.speculative_decoding: + speculate_schedule_cache( + self.share_inputs["draft_tokens"], + self.share_inputs["block_tables"], + self.share_inputs["stop_flags"], + self.share_inputs["prompt_lens"], + self.share_inputs["seq_lens_this_time"], + self.share_inputs["seq_lens_encoder"], + self.share_inputs["seq_lens_decoder"], + self.share_inputs["step_seq_lens_decoder"], + self.share_inputs["step_draft_tokens"], + self.share_inputs["step_seq_lens_this_time"], + self.share_inputs["accept_num"], + self.share_inputs["accept_tokens"], + self.share_inputs["is_block_step"], + self.share_inputs["not_need_stop"], + self.share_inputs["stop_nums"], + self.cache_config.block_size, + self.speculative_config.num_speculative_tokens, + ) - self._update_chunked_prefill(model_forward_batch) - self._add_cache(model_forward_batch) - elif self.speculative_decoding: - speculate_schedule_cache( - self.share_inputs["draft_tokens"], - self.share_inputs["block_tables"], - self.share_inputs["stop_flags"], - self.share_inputs["prompt_lens"], - self.share_inputs["seq_lens_this_time"], - self.share_inputs["seq_lens_encoder"], - self.share_inputs["seq_lens_decoder"], - self.share_inputs["step_seq_lens_decoder"], - self.share_inputs["step_draft_tokens"], - self.share_inputs["step_seq_lens_this_time"], - 
self.share_inputs["accept_num"], - self.share_inputs["accept_tokens"], - self.share_inputs["is_block_step"], - self.share_inputs["not_need_stop"], - self.share_inputs["stop_nums"], - self.cache_config.block_size, - self.speculative_config.num_speculative_tokens, + self.seq_lens_this_time_buffer[:num_running_requests].copy_( + self.share_inputs["seq_lens_this_time"][:num_running_requests], False ) + return None + + def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Optional[ModelRunnerOutput]: + + num_scheduled_tokens = int(self.share_inputs["seq_lens_this_time"][:num_running_requests].sum()) - self.seq_lens_this_time_buffer[:num_running_requests].copy_( - self.share_inputs["seq_lens_this_time"][:num_running_requests], False + hidden_states = hidden_states[:num_scheduled_tokens] + + prompt_lens = self.share_inputs["prompt_lens"][:num_running_requests] + prompt_token_ids = self.share_inputs["prompt_ids"] + + pooling_metadata = PoolingMetadata( + prompt_lens=prompt_lens, + prompt_token_ids=prompt_token_ids, + pooling_params=self.pooling_params, ) - return None + num_scheduled_tokens_list = [ + int(self.share_inputs["seq_lens_this_time"][i]) for i in range(num_running_requests) + ] + device_str = "gpu" if hidden_states.place.is_gpu_place() else "cpu" + pooling_metadata.build_pooling_cursor(num_scheduled_tokens_list, device=device_str) + + raw_pooler_output = self.model.pooler(hidden_states=hidden_states, pooling_metadata=pooling_metadata) + seq_lens_cpu = self.share_inputs["seq_lens_this_time"][:num_running_requests] + pooler_output: list[Optional[paddle.Tensor]] = [] + for raw_output, seq_len, prompt_len in zip(raw_pooler_output, seq_lens_cpu, pooling_metadata.prompt_lens): + output = raw_output.data if int(seq_len) == int(prompt_len) else None + pooler_output.append(output) + + pooler_output = PoolerOutput( + outputs=pooler_output, + ) + + return pooler_output def _add_cache(self, model_forward_batch) -> None: """ diff --git a/tests/pooling/test_Qwen3-Embedding_serving.py b/tests/pooling/test_Qwen3-Embedding_serving.py new file mode 100644 index 00000000000..aad376314ff --- /dev/null +++ b/tests/pooling/test_Qwen3-Embedding_serving.py @@ -0,0 +1,308 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
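Before the new serving test below, it is worth spelling out the rule encoded in the `_pool()` helper added above: a pooled embedding is emitted only for requests whose `seq_lens_this_time` equals their `prompt_lens`, i.e. the whole prompt was scheduled in this step; all other slots report None. A minimal sketch of that selection follows (function and variable names are illustrative, not taken from the patch).

def select_finished(raw_outputs, seq_lens_this_time, prompt_lens):
    # Keep a pooled output only when the full prompt was processed this step;
    # partially prefilled requests yield None and are skipped downstream.
    results = []
    for raw, seq_len, prompt_len in zip(raw_outputs, seq_lens_this_time, prompt_lens):
        results.append(raw if int(seq_len) == int(prompt_len) else None)
    return results


print(select_finished(["emb_a", "emb_b"], [7, 3], [7, 9]))  # ['emb_a', None]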
+ +import json +import os +import signal +import socket +import subprocess +import sys +import time +from typing import List + +import numpy as np +import pytest +import requests + +# Read ports from environment variables +FD_API_PORT = int(os.getenv("FD_API_PORT", 8189)) +FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8134)) +FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8234)) +FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8334)) + +PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT] + + +def is_port_open(host: str, port: int, timeout=1.0): + """Check if a TCP port is open.""" + try: + with socket.create_connection((host, port), timeout): + return True + except Exception: + return False + + +def kill_process_on_port(port: int): + """Kill processes listening on the given port.""" + try: + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() + for pid in output.splitlines(): + os.kill(int(pid), signal.SIGKILL) + print(f"Killed process on port {port}, pid={pid}") + except subprocess.CalledProcessError: + pass + + +def clean_ports(): + """Clean all ports in PORTS_TO_CLEAN.""" + for port in PORTS_TO_CLEAN: + kill_process_on_port(port) + time.sleep(2) + + +@pytest.fixture(scope="session", autouse=True) +def setup_and_run_embedding_server(): + """ + Start embedding model API server for testing. + """ + print("Pre-test port cleanup...") + clean_ports() + + os.environ["ENABLE_V1_KVCACHE_SCHEDULER"] = "1" + os.environ["FD_DISABLE_CHUNKED_PREFILL"] = "1" + os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1" + + base_path = os.getenv("MODEL_PATH") + if base_path: + model_path = os.path.join(base_path, "Qwen3-Embedding-0.6B") + else: + model_path = "./Qwen3-Embedding-0.6B" + + if not os.path.exists(model_path): + pytest.skip(f"Model path not found: {model_path}") + + log_path = "embedding_server.log" + cmd = [ + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "2", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--cache-queue-port", + str(FD_CACHE_QUEUE_PORT), + "--max-model-len", + "8192", + "--max-num-seqs", + "256", + "--runner", + "pooling", + ] + + with open(log_path, "w") as logfile: + process = subprocess.Popen( + cmd, + stdout=logfile, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + + # Wait for server to start (up to 480 seconds) + for _ in range(480): + if is_port_open("127.0.0.1", FD_API_PORT): + print(f"Embedding API server is up on port {FD_API_PORT}") + break + time.sleep(1) + else: + print("Embedding API server failed to start. Cleaning up...") + try: + os.killpg(process.pid, signal.SIGTERM) + except Exception as e: + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"Embedding API server did not start on port {FD_API_PORT}") + + yield + + print("\n===== Post-test embedding server cleanup... 
=====") + try: + os.killpg(process.pid, signal.SIGTERM) + print(f"Embedding API server (pid={process.pid}) terminated") + except Exception as e: + print(f"Failed to terminate embedding API server: {e}") + + +@pytest.fixture(scope="session") +def embedding_api_url(): + """Returns the API endpoint URL for embeddings.""" + return f"http://0.0.0.0:{FD_API_PORT}/v1/embeddings" + + +@pytest.fixture +def headers(): + """Returns common HTTP request headers.""" + return {"Content-Type": "application/json"} + + +# ========================== +# Test Cases +# ========================== + + +@pytest.fixture +def consistent_payload(): + """ + Returns a fixed payload for consistency testing, + including a fixed random seed and temperature. + """ + return { + "messages": [ + { + "role": "user", + "content": "北京天安门在哪里?", + } + ], + "temperature": 0.8, + "top_p": 0, # fix top_p to reduce randomness + "seed": 13, # fixed random seed + } + + +def save_embedding_baseline(embedding: List[float], baseline_file: str): + """ + Save embedding vector to baseline file. + """ + baseline_data = {"embedding": embedding, "dimension": len(embedding)} + with open(baseline_file, "w", encoding="utf-8") as f: + json.dump(baseline_data, f, indent=2) + print(f"Baseline saved to: {baseline_file}") + + +def compare_embeddings(embedding1: List[float], embedding2: List[float], threshold: float = 0.01) -> float: + """ + Compare two embedding vectors using mean absolute difference. + + Returns: + mean_abs_diff: mean absolute difference between two embeddings + """ + arr1 = np.array(embedding1, dtype=np.float32) + arr2 = np.array(embedding2, dtype=np.float32) + + # Mean absolute difference + mean_abs_diff = np.mean(np.abs(arr1 - arr2)) + + print(f"Mean Absolute Difference: {mean_abs_diff:.6f}") + + return mean_abs_diff + + +def check_embedding_against_baseline(embedding: List[float], baseline_file: str, threshold: float = 0.01): + """ + Check embedding against baseline file. + + Args: + embedding: Current embedding vector + baseline_file: Path to baseline file + threshold: Maximum allowed difference rate (1 - cosine_similarity) + """ + try: + with open(baseline_file, "r", encoding="utf-8") as f: + baseline_data = json.load(f) + baseline_embedding = baseline_data["embedding"] + except FileNotFoundError: + raise AssertionError(f"Baseline file not found: {baseline_file}") + + if len(embedding) != len(baseline_embedding): + raise AssertionError( + f"Embedding dimension mismatch: current={len(embedding)}, baseline={len(baseline_embedding)}" + ) + + mean_abs_diff = compare_embeddings(embedding, baseline_embedding, threshold) + + if mean_abs_diff >= threshold: + # Save current embedding for debugging + temp_file = f"{baseline_file}.current" + save_embedding_baseline(embedding, temp_file) + + raise AssertionError( + f"Embedding differs from baseline by too much (mean_abs_diff={mean_abs_diff:.6f} >= {threshold}):\n" + f"Current embedding saved to: {temp_file}\n" + f"Please check the differences." 
+ ) + + +def test_single_text_embedding(embedding_api_url, headers): + """Test embedding generation for a single text input.""" + payload = { + "input": "北京天安门在哪里?", + "model": "Qwen3-Embedding-0.6B", + } + + resp = requests.post(embedding_api_url, headers=headers, json=payload) + assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" + + result = resp.json() + assert "data" in result, "Response missing 'data' field" + assert len(result["data"]) == 1, "Expected single embedding result" + + embedding = result["data"][0]["embedding"] + assert isinstance(embedding, list), "Embedding should be a list" + assert len(embedding) > 0, "Embedding vector should not be empty" + assert all(isinstance(x, (int, float)) for x in embedding), "Embedding values should be numeric" + + print(f"Single text embedding dimension: {len(embedding)}") + + base_path = os.getenv("MODEL_PATH", "") + baseline_filename = "Qwen3-Embedding-0.6B-baseline.json" + + if base_path: + baseline_file = os.path.join(base_path, baseline_filename) + else: + baseline_file = baseline_filename + + if not os.path.exists(baseline_file): + print("Baseline file not found. Saving current embedding as baseline...") + save_embedding_baseline(embedding, baseline_file) + else: + print(f"Comparing with baseline: {baseline_file}") + check_embedding_against_baseline(embedding, baseline_file, threshold=0.01) + + +def test_batch_embeddings(embedding_api_url, headers): + """Test embedding generation for batch inputs.""" + payload = { + "input": [ + "北京天安门在哪里?", + ], + "model": "Qwen3-Embedding-0.6B", + } + + resp = requests.post(embedding_api_url, headers=headers, json=payload) + assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" + + result = resp.json() + assert "data" in result, "Response missing 'data' field" + assert len(result["data"]) == 1, "Expected three embedding results" + + base_path = os.getenv("MODEL_PATH", "") + + for idx, item in enumerate(result["data"]): + embedding = item["embedding"] + + baseline_filename = f"Qwen3-Embedding-0.6B-batch-{idx}-baseline.json" + if base_path: + baseline_file = os.path.join(base_path, baseline_filename) + else: + baseline_file = baseline_filename + + if not os.path.exists(baseline_file): + save_embedding_baseline(embedding, baseline_file) + else: + check_embedding_against_baseline(embedding, baseline_file, threshold=0.01) From 820004045dfb5ed83a82bc610d90c53418aaac10 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Fri, 24 Oct 2025 21:56:08 +0800 Subject: [PATCH 03/20] fix --- fastdeploy/model_executor/layers/embeddings.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index ccdac44b59f..69dbbe371c6 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -236,12 +236,9 @@ def weight_loader(self, param, loaded_weight, shard_id=None): output_dim = getattr(param, "output_dim", None) packed_dim = getattr(param, "packed_dim", None) -<<<<<<< HEAD -======= if not param._is_initialized(): param.initialize() ->>>>>>> 791b101195fa6b07918d62f49b5c2ad41a61d80a loaded_weight = get_tensor(loaded_weight) if param.dtype != loaded_weight.dtype: if loaded_weight.dtype == paddle.int8 and param.dtype == paddle.float8_e4m3fn: @@ -253,11 +250,7 @@ def weight_loader(self, param, loaded_weight, shard_id=None): assert ( param.shape == loaded_weight.shape ), f"Shape mismatch: param {param.shape} vs 
loaded_weight {loaded_weight.shape}" -<<<<<<< HEAD - param.set_value(loaded_weight) -======= param.copy_(loaded_weight, False) ->>>>>>> 791b101195fa6b07918d62f49b5c2ad41a61d80a return start_idx = self.shard_indices.org_vocab_start_index From 58616e4cd1c0c96b7e9dbb0868750f5797f83431 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 27 Oct 2025 14:57:27 +0800 Subject: [PATCH 04/20] fix bug --- fastdeploy/model_executor/pre_and_post_process.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index f1158f9be84..d5b579d321f 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -318,7 +318,6 @@ def post_process_normal( model_output.stop_token_ids, model_output.stop_seqs_len, False, - is_pooling=False, ) # multi ends elif current_platform.is_maca(): set_stop_value_multi_ends( @@ -332,7 +331,6 @@ def post_process_normal( model_output.stop_token_ids, model_output.stop_seqs_len, False, - is_pooling=False, ) # multi ends else: set_stop_value_multi_ends( @@ -362,7 +360,7 @@ def post_process_normal( model_output.next_tokens, model_output.is_block_step, block_size, - False, # ✅ 改为位置参数,表示 is_pooling_task=False + False, ) else: update_inputs( From ad2f7b6960ed2b1bcd4887d906d4a7505a9589e4 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 27 Oct 2025 16:27:20 +0800 Subject: [PATCH 05/20] fix test_return_token_ids.py and update enable_thinking --- fastdeploy/worker/gpu_model_runner.py | 3 ++ tests/pooling/test_Qwen3-Embedding_serving.py | 33 ------------------- 2 files changed, 3 insertions(+), 33 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index cf5c02b943c..2524371c410 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -962,6 +962,9 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["kv_num_blocks_x_cpu"] = None # CPU # Initialize thinking related buffers + self.share_inputs["enable_thinking"] = paddle.full(shape=[max_num_seqs, 1], fill_value=False, dtype="bool") + self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") diff --git a/tests/pooling/test_Qwen3-Embedding_serving.py b/tests/pooling/test_Qwen3-Embedding_serving.py index aad376314ff..3c6cb840e3b 100644 --- a/tests/pooling/test_Qwen3-Embedding_serving.py +++ b/tests/pooling/test_Qwen3-Embedding_serving.py @@ -273,36 +273,3 @@ def test_single_text_embedding(embedding_api_url, headers): else: print(f"Comparing with baseline: {baseline_file}") check_embedding_against_baseline(embedding, baseline_file, threshold=0.01) - - -def test_batch_embeddings(embedding_api_url, headers): - """Test embedding generation for batch inputs.""" - payload = { - "input": [ - "北京天安门在哪里?", - ], - "model": "Qwen3-Embedding-0.6B", - } - - resp = requests.post(embedding_api_url, headers=headers, json=payload) - assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" - - result = resp.json() - assert "data" in result, "Response missing 'data' 
field" - assert len(result["data"]) == 1, "Expected three embedding results" - - base_path = os.getenv("MODEL_PATH", "") - - for idx, item in enumerate(result["data"]): - embedding = item["embedding"] - - baseline_filename = f"Qwen3-Embedding-0.6B-batch-{idx}-baseline.json" - if base_path: - baseline_file = os.path.join(base_path, baseline_filename) - else: - baseline_file = baseline_filename - - if not os.path.exists(baseline_file): - save_embedding_baseline(embedding, baseline_file) - else: - check_embedding_against_baseline(embedding, baseline_file, threshold=0.01) From 955fac1767f09118c74f0a1d4c50540270e3839e Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Mon, 27 Oct 2025 18:00:42 +0800 Subject: [PATCH 06/20] fix mtp dummy_run --- fastdeploy/model_executor/pre_and_post_process.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index d5b579d321f..e7c79230720 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -501,7 +501,15 @@ def post_process( ) else: if speculative_decoding: - post_process_specualate(sampler_or_pooler_output, model_output, save_each_rank, skip_save_output) + post_process_specualate( + sampler_or_pooler_output, + model_output, + share_inputs, + save_each_rank, + skip_save_output, + think_end_id, + line_break_id, + ) else: post_process_normal( sampler_or_pooler_output, @@ -511,6 +519,8 @@ def post_process( save_each_rank, skip_save_output, async_output_queue, + think_end_id, + line_break_id, ) From 0206d427a808ac1c2770ba5baa150b83d139f388 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 28 Oct 2025 13:55:30 +0800 Subject: [PATCH 07/20] merge develop --- fastdeploy/worker/gpu_model_runner.py | 39 +++++++-------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c024d85645f..1698f01ab43 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -519,22 +519,14 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = self._apply_mm_inputs(request, multi_vision_inputs, rope_3d_position_ids) if not self.is_pooling_model: - if request.get("enable_thinking", False): + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None: # Enable thinking - req_reasoning_max_tokens = request.get("reasoning_max_tokens") - req_max_tokens = request.get("max_tokens") - final_reasoning_tokens = ( - req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens - ) - - self.share_inputs["enable_thinking"][idx : idx + 1] = True - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens + self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 else: # Disable thinking - self.share_inputs["enable_thinking"][idx : idx + 1] = False - self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 + self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = 
request.prompt_token_ids.tolist() @@ -779,22 +771,14 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 if not self.is_pooling_model: - if request.get("enable_thinking", False): + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None: # Enable thinking - req_reasoning_max_tokens = request.get("reasoning_max_tokens") - req_max_tokens = request.get("max_tokens") - final_reasoning_tokens = ( - req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens - ) - - self.share_inputs["enable_thinking"][idx : idx + 1] = True - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens + self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 else: # Disable thinking - self.share_inputs["enable_thinking"][idx : idx + 1] = False - self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 + self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) @@ -1102,9 +1086,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["kv_num_blocks_x_cpu"] = None # CPU # Initialize thinking related buffers - self.share_inputs["enable_thinking"] = paddle.full(shape=[max_num_seqs, 1], fill_value=False, dtype="bool") - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") From 30795d21fc3ab15aaf5344aa9c8a6441980b61b0 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 28 Oct 2025 17:04:58 +0800 Subject: [PATCH 08/20] fix np.float32 --- fastdeploy/model_executor/pre_and_post_process.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index e7c79230720..b537655225c 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -259,8 +259,6 @@ def _build_stream_transfer_data(output_tokens: np.ndarray, pooler_outputs: None) pooler_output = pooler_output.astype("float32") pooler_output = pooler_output.numpy() - if pooler_output.dtype != np.float32: - pooler_output = pooler_output.astype(np.float32) stream_transfer_data = StreamTransferData( decoder_state=DecoderState.TEXT, pooler_output=pooler_output, batch_id=bid From 6bc1ed29890687ac81312aad9ddfe1ff1c9d09db Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 28 Oct 2025 17:41:54 +0800 Subject: [PATCH 09/20] delete FD_DISABLE_CHUNKED_PREFILL and FD_USE_GET_SAVE_OUTPUT_V1 --- fastdeploy/model_executor/pre_and_post_process.py | 7 +++---- fastdeploy/worker/gpu_model_runner.py | 7 ++++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py 
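The "fix np.float32" hunk above drops the NumPy-side `astype` because the pooled tensor is already upcast on the Paddle side before `.numpy()` is called; NumPy has no bfloat16 dtype, so the cast has to happen in Paddle. A minimal illustration, assuming a Paddle build with bfloat16 support:

import paddle

emb = paddle.rand([2, 8]).cast("bfloat16")   # stand-in for a pooled embedding
emb_np = emb.astype("float32").numpy()       # upcast first; NumPy cannot represent bfloat16
print(emb_np.dtype)                          # float32, so a second astype would be redundant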
index b537655225c..cb6348a6b8e 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -857,8 +857,7 @@ def post_process_pooling( ) if not skip_save_output: - if envs.FD_USE_GET_SAVE_OUTPUT_V1: - if save_each_rank or model_output.mp_rank == 0: - output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) + if save_each_rank or model_output.mp_rank == 0: + output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) - async_output_queue.put(output) + async_output_queue.put(output) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 1698f01ab43..8ac9cd5528c 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -940,7 +940,7 @@ def get_supported_pooling_tasks(self) -> list[PoolingTask]: if self.cache_config.enable_chunked_prefill and "encode" in supported_tasks: supported_tasks.remove("encode") - logger.warning( + logger.debug( "Chunked prefill is not supported with " "encode task which using ALL pooling. " "Please turn off chunked prefill by export=FD_DISABLE_CHUNKED_PREFILL=1 before using it." @@ -1537,6 +1537,11 @@ def _dummy_pooler_run_task( req_num_tokens = num_tokens // num_reqs + print("num_tokens", num_tokens) + print("max_num_seqs", max_num_seqs) + print("num_reqs", num_reqs) + print("min_tokens_per_req", min_tokens_per_req) + print("num_scheduled_token_list", num_scheduled_tokens_list) dummy_prompt_lens = paddle.to_tensor(num_scheduled_tokens_list, dtype="int64") dummy_token_ids = paddle.zeros( [num_reqs, req_num_tokens], From f439ca23f7ffa53dd6a9c18e29743b2593e11d1a Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 28 Oct 2025 12:10:00 +0000 Subject: [PATCH 10/20] delete and build_stream_transfer_data --- fastdeploy/model_executor/pre_and_post_process.py | 6 +++--- fastdeploy/worker/gpu_model_runner.py | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index cb6348a6b8e..9a4149b82bd 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -15,7 +15,7 @@ """ import queue -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union import numpy as np import paddle @@ -85,7 +85,7 @@ speculate_limit_thinking_content_length_v2, ) -from fastdeploy.output.pooler import PoolerOutput +from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput @@ -239,7 +239,7 @@ def pre_process( ) -def _build_stream_transfer_data(output_tokens: np.ndarray, pooler_outputs: None): +def _build_stream_transfer_data(output_tokens: np.ndarray, pooler_outputs: List[PoolingSequenceGroupOutput] = None): """Split output_tokens and output""" stream_transfer_datas = [] diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8ac9cd5528c..a3a8dc740f0 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1536,12 +1536,6 @@ def _dummy_pooler_run_task( assert len(num_scheduled_tokens_list) == num_reqs req_num_tokens = num_tokens // num_reqs - - print("num_tokens", num_tokens) - print("max_num_seqs", 
max_num_seqs) - print("num_reqs", num_reqs) - print("min_tokens_per_req", min_tokens_per_req) - print("num_scheduled_token_list", num_scheduled_tokens_list) dummy_prompt_lens = paddle.to_tensor(num_scheduled_tokens_list, dtype="int64") dummy_token_ids = paddle.zeros( [num_reqs, req_num_tokens], From 27d686bd076e9b5361ed18a4131d38015ef2be0f Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Tue, 28 Oct 2025 16:12:08 +0000 Subject: [PATCH 11/20] fix test_update_v1: --- fastdeploy/worker/gpu_model_runner.py | 2 +- tests/operators/test_update_inputs_v1.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index a3a8dc740f0..6c46821fc39 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2029,8 +2029,8 @@ class at the server level, which is too granular for ModelRunner. if self.use_cudagraph: model_output = model_output[: self.real_token_num] - hidden_states = model_output if self.is_pooling_model: + hidden_states = model_output pooler_output = self._pool(hidden_states, num_running_requests) model_output_data = ModelOutputData( diff --git a/tests/operators/test_update_inputs_v1.py b/tests/operators/test_update_inputs_v1.py index 857c4b2b642..aee3aa5c622 100644 --- a/tests/operators/test_update_inputs_v1.py +++ b/tests/operators/test_update_inputs_v1.py @@ -103,6 +103,7 @@ def update_inputs_v1_ref( next_tokens, is_block_step, block_size, + is_pooling_task=False, ): max_bsz = stop_flags.shape[0] now_bsz = seq_lens_this_time.shape[0] @@ -173,6 +174,7 @@ def test_update_inputs_v1(self): next_tokens = paddle.to_tensor(next_tokens) is_block_step = paddle.to_tensor(is_block_step) block_size = 1024 + is_pooling_task = False inputs = ( stop_flags, @@ -189,6 +191,7 @@ def test_update_inputs_v1(self): next_tokens, is_block_step, block_size, + is_pooling_task, ) # inplace modify, need to clone inputs inputs_clone = [x.clone() if isinstance(x, paddle.Tensor) else x for x in inputs] From 57e76be1a6bbaf099868ccb28a280957d980dda5 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 29 Oct 2025 09:15:45 +0000 Subject: [PATCH 12/20] fix --- fastdeploy/model_executor/pre_and_post_process.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 9a4149b82bd..300527c6024 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -14,6 +14,7 @@ # limitations under the License. 
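The signature change above types `pooler_outputs` as a list of `PoolingSequenceGroupOutput`, one entry per batch slot, which `_build_stream_transfer_data` then wraps into per-request stream payloads. The sketch below mimics the shape of that packaging with stand-in types; `StreamItem` replaces `StreamTransferData`, and skipping None entries (unfinished requests, see the `_pool` sketch earlier) is an assumption rather than something shown in the patch.

from dataclasses import dataclass
from typing import Any, List, Optional


@dataclass
class StreamItem:                 # illustrative stand-in for StreamTransferData
    decoder_state: str
    pooler_output: Any
    batch_id: int


def build_stream_items(pooler_outputs: List[Optional[Any]]) -> List[StreamItem]:
    # One stream item per finished request, tagged with its batch slot.
    return [
        StreamItem(decoder_state="TEXT", pooler_output=out, batch_id=bid)
        for bid, out in enumerate(pooler_outputs)
        if out is not None
    ]


print(build_stream_items([[0.1, 0.2], None]))  # only slot 0 produces a payload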
""" +import os import queue from typing import Dict, List, Optional, Union @@ -857,7 +858,7 @@ def post_process_pooling( ) if not skip_save_output: + os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1" if save_each_rank or model_output.mp_rank == 0: output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) - async_output_queue.put(output) From eae6db64e6de842fb2833fd0f7e9744f509c12ee Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 29 Oct 2025 09:50:46 +0000 Subject: [PATCH 13/20] fix --- fastdeploy/model_executor/pre_and_post_process.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 300527c6024..51e41ded05c 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -14,7 +14,6 @@ # limitations under the License. """ -import os import queue from typing import Dict, List, Optional, Union @@ -856,9 +855,12 @@ def post_process_pooling( block_size, True, ) + print("model_output.seq_lens_encoder", model_output.seq_lens_encoder) if not skip_save_output: - os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1" - if save_each_rank or model_output.mp_rank == 0: - output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) - async_output_queue.put(output) + if envs.FD_USE_GET_SAVE_OUTPUT_V1: + if save_each_rank or model_output.mp_rank == 0: + output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) + async_output_queue.put(output) + else: + raise RuntimeError("Not supported save_output mode") From 90d5ee1a0a16464dca246bf70150c6bf185db83a Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 29 Oct 2025 11:18:41 +0000 Subject: [PATCH 14/20] update dummy_run post_process --- custom_ops/gpu_ops/cpp_extensions.cc | 3 +- custom_ops/gpu_ops/update_inputs_v1.cu | 98 +++++++------------ .../model_executor/pre_and_post_process.py | 5 +- fastdeploy/worker/gpu_model_runner.py | 49 +++++++++- 4 files changed, 83 insertions(+), 72 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 3f8d424fff9..71600695a2b 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -441,8 +441,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags, const paddle::Tensor& stop_nums, const paddle::Tensor& next_tokens, const paddle::Tensor& is_block_step, - const int block_size, - const bool is_pooling_task); + const int block_size); void RecoverDecodeTask( const paddle::Tensor& stop_flags, diff --git a/custom_ops/gpu_ops/update_inputs_v1.cu b/custom_ops/gpu_ops/update_inputs_v1.cu index 54f8cd45213..64230ae2565 100644 --- a/custom_ops/gpu_ops/update_inputs_v1.cu +++ b/custom_ops/gpu_ops/update_inputs_v1.cu @@ -33,8 +33,7 @@ __global__ void update_inputs_kernel_v1(bool* not_need_stop, const int input_ids_stride, const int block_num_per_seq, const int block_size, - bool prefill_one_step_stop, - bool is_pooling_task) { + bool prefill_one_step_stop) { int thread_idx = threadIdx.x; typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -49,75 +48,53 @@ __global__ void update_inputs_kernel_v1(bool* not_need_stop, stop_flag_now_int = 1; } } - if (thread_idx < bsz) { if (stop_flag_now) { seq_lens_this_time[thread_idx] = 0; // stop at next step seq_lens_decoder[thread_idx] = 
0; seq_lens_encoder[thread_idx] = 0; } else { - if (is_pooling_task) { - if (seq_lens_this_time[thread_idx] > 0) { - int total_processed = - seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx]; - - if (total_processed >= prompt_lens[thread_idx]) { - stop_flags[thread_idx] = true; - seq_lens_encoder[thread_idx] = 0; - seq_lens_decoder[thread_idx] = 0; - seq_lens_this_time[thread_idx] = 0; - stop_flag_now_int = 1; - } - } else { + if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= + prompt_lens[thread_idx]) { + if (prefill_one_step_stop) { + // prefill done, stop + stop_flags[thread_idx] = true; + seq_lens_this_time[thread_idx] = 0; + seq_lens_decoder[thread_idx] = 0; seq_lens_encoder[thread_idx] = 0; stop_flag_now_int = 1; - } - } else { - // Normal generation task logic - if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >= - prompt_lens[thread_idx]) { - if (prefill_one_step_stop) { - // prefill done, stop - stop_flags[thread_idx] = true; + } else { + // decoding + seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx]; + seq_lens_this_time[thread_idx] = 1; + seq_lens_encoder[thread_idx] = 0; + int64_t* input_ids_now = input_ids + thread_idx * input_ids_stride; + input_ids_now[0] = next_tokens[thread_idx]; + + // to judge whether block is not enough + int* block_table_now = block_tables + thread_idx * block_num_per_seq; + if (seq_lens_this_time[thread_idx] != 0 && + block_table_now[seq_lens_decoder[thread_idx] / block_size] == + -1) { + // should be scheduled by server + is_block_step[thread_idx] = true; seq_lens_this_time[thread_idx] = 0; + stop_flags[thread_idx] = true; + step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx]; seq_lens_decoder[thread_idx] = 0; - seq_lens_encoder[thread_idx] = 0; stop_flag_now_int = 1; - } else { - // decoding - seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx]; - seq_lens_this_time[thread_idx] = 1; - seq_lens_encoder[thread_idx] = 0; - int64_t* input_ids_now = input_ids + thread_idx * input_ids_stride; - input_ids_now[0] = next_tokens[thread_idx]; - - // to judge whether block is not enough - int* block_table_now = - block_tables + thread_idx * block_num_per_seq; - if (seq_lens_this_time[thread_idx] != 0 && - block_table_now[seq_lens_decoder[thread_idx] / block_size] == - -1) { - // should be scheduled by server - is_block_step[thread_idx] = true; - seq_lens_this_time[thread_idx] = 0; - stop_flags[thread_idx] = true; - step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx]; - seq_lens_decoder[thread_idx] = 0; - stop_flag_now_int = 1; - } } - } else { - stop_flags[thread_idx] = true; - seq_lens_this_time[thread_idx] = 0; - seq_lens_decoder[thread_idx] = 0; - seq_lens_encoder[thread_idx] = 0; - topk_ids[thread_idx] = -1; - stop_flag_now_int = 1; } + } else { + stop_flags[thread_idx] = true; + seq_lens_this_time[thread_idx] = 0; + seq_lens_decoder[thread_idx] = 0; + seq_lens_encoder[thread_idx] = 0; + topk_ids[thread_idx] = -1; + stop_flag_now_int = 1; } } } - __syncthreads(); int64_t stop_sum = BlockReduce(temp_storage).Sum(stop_flag_now_int); if (thread_idx == 0) { @@ -138,8 +115,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags, const paddle::Tensor& stop_nums, const paddle::Tensor& next_tokens, const paddle::Tensor& is_block_step, - const int block_size, - const bool is_pooling_task) { + const int block_size) { #ifdef PADDLE_WITH_CUSTOM_DEVICE auto dev_ctx = static_cast( paddle::experimental::DeviceContextPool::Instance().Get( @@ -156,7 +132,6 @@ void 
UpdateInputsV1(const paddle::Tensor& stop_flags, } const int max_bsz = stop_flags.shape()[0]; const int now_bsz = seq_lens_this_time.shape()[0]; - const int bsz_to_process = is_pooling_task ? max_bsz : now_bsz; const int input_ids_stride = input_ids.shape()[1]; const int block_num_per_seq = block_tables.shape()[1]; auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false); @@ -174,13 +149,12 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags, const_cast(stop_flags.data()), const_cast(is_block_step.data()), next_tokens.data(), - bsz_to_process, + now_bsz, max_bsz, input_ids_stride, block_num_per_seq, block_size, - prefill_one_step_stop, - is_pooling_task); + prefill_one_step_stop); auto not_need_stop_cpu = not_need_stop_gpu.copy_to(not_need_stop.place(), false); bool* not_need_stop_data = const_cast(not_need_stop.data()); @@ -201,7 +175,7 @@ PD_BUILD_STATIC_OP(update_inputs_v1) "stop_nums", "next_tokens", "is_block_step"}) - .Attrs({"block_size: int", "is_pooling_task: bool"}) + .Attrs({"block_size: int"}) .Outputs({"not_need_stop_out", "seq_lens_this_time_out", "seq_lens_encoder_out", diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 51e41ded05c..0e7b3fb82b4 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -358,7 +358,6 @@ def post_process_normal( model_output.next_tokens, model_output.is_block_step, block_size, - False, ) else: update_inputs( @@ -853,9 +852,7 @@ def post_process_pooling( model_output.next_tokens, model_output.is_block_step, block_size, - True, ) - print("model_output.seq_lens_encoder", model_output.seq_lens_encoder) if not skip_save_output: if envs.FD_USE_GET_SAVE_OUTPUT_V1: @@ -863,4 +860,4 @@ def post_process_pooling( output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs) async_output_queue.put(output) else: - raise RuntimeError("Not supported save_output mode") + raise RuntimeError("Not supported save_output mode,Please set FD_USE_GET_SAVE_OUTPUT_V1=1 ") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 6962f549c07..99233803727 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1579,6 +1579,7 @@ def _dummy_pooler_run_task( def _dummy_pooler_run( self, hidden_states: paddle.Tensor, + model_output: paddle.Tensor, ) -> PoolerOutput: output_size = dict[PoolingTask, float]() for task in self.get_supported_pooling_tasks(): @@ -1588,8 +1589,49 @@ def _dummy_pooler_run( del output max_task = max(output_size.items(), key=lambda x: x[1])[0] - final_output = self._dummy_pooler_run_task(hidden_states, max_task) - return final_output + pooler_output = self._dummy_pooler_run_task(hidden_states, max_task) + + model_output_data = ModelOutputData( + next_tokens=self.share_inputs["next_tokens"], + stop_flags=self.share_inputs["stop_flags"], + step_idx=self.share_inputs["step_idx"], + max_dec_len=self.share_inputs["max_dec_len"], + pre_ids=self.share_inputs["pre_ids"], + seq_lens_this_time=self.share_inputs["seq_lens_this_time"], + eos_token_id=self.share_inputs["eos_token_id"], + not_need_stop=self.share_inputs["not_need_stop"], + input_ids=self.share_inputs["input_ids"], + stop_nums=self.share_inputs["stop_nums"], + seq_lens_encoder=self.share_inputs["seq_lens_encoder"], + seq_lens_decoder=self.share_inputs["seq_lens_decoder"], + is_block_step=self.share_inputs["is_block_step"], + 
full_hidden_states=model_output, + msg_queue_id=self.parallel_config.msg_queue_id, + mp_rank=self.parallel_config.tensor_parallel_rank, + use_ep=self.parallel_config.use_ep, + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + stop_token_ids=self.share_inputs["stop_seqs"], + stop_seqs_len=self.share_inputs["stop_seqs_len"], + prompt_lens=self.share_inputs["prompt_lens"], + ) + + post_process( + sampler_or_pooler_output=pooler_output, + model_output=model_output_data, + share_inputs=self.share_inputs, + block_size=self.cache_config.block_size, + speculative_decoding=self.speculative_decoding, + skip_save_output=True, + async_output_queue=self.async_output_queue, + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, + ) + return pooler_output def _dummy_sampler_run( self, @@ -1718,7 +1760,6 @@ def _dummy_run( accept_all_drafts: Target model will accept all draft tokens reject_all_drafts: Target model will reject all draft tokens """ - input_length_list, max_dec_len_list, block_num = self.get_input_length_list( num_tokens=num_tokens, batch_size=batch_size, @@ -1773,7 +1814,7 @@ def _dummy_run( ) if self.is_pooling_model: - self._dummy_pooler_run(hidden_states) + self._dummy_pooler_run(hidden_states, model_output) break else: self._dummy_sampler_run(hidden_states, model_output, accept_all_drafts, reject_all_drafts) From 1a356918118ef1473aef5191b521fc97dfcaca79 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 29 Oct 2025 11:20:24 +0000 Subject: [PATCH 15/20] delete test_update_v1 --- tests/operators/test_update_inputs_v1.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/operators/test_update_inputs_v1.py b/tests/operators/test_update_inputs_v1.py index aee3aa5c622..857c4b2b642 100644 --- a/tests/operators/test_update_inputs_v1.py +++ b/tests/operators/test_update_inputs_v1.py @@ -103,7 +103,6 @@ def update_inputs_v1_ref( next_tokens, is_block_step, block_size, - is_pooling_task=False, ): max_bsz = stop_flags.shape[0] now_bsz = seq_lens_this_time.shape[0] @@ -174,7 +173,6 @@ def test_update_inputs_v1(self): next_tokens = paddle.to_tensor(next_tokens) is_block_step = paddle.to_tensor(is_block_step) block_size = 1024 - is_pooling_task = False inputs = ( stop_flags, @@ -191,7 +189,6 @@ def test_update_inputs_v1(self): next_tokens, is_block_step, block_size, - is_pooling_task, ) # inplace modify, need to clone inputs inputs_clone = [x.clone() if isinstance(x, paddle.Tensor) else x for x in inputs] From 2fa8733f0ffbfc69caf8cb0564d26f3b0af1c379 Mon Sep 17 00:00:00 2001 From: lizexu <2694294196@qq.com> Date: Wed, 29 Oct 2025 11:32:03 +0000 Subject: [PATCH 16/20] fix --- fastdeploy/config.py | 3 +++ fastdeploy/model_executor/pre_and_post_process.py | 9 +++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 8cd82fbc655..2265ac16499 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -242,6 +242,9 @@ def _post_init(self): self.enable_mm = is_multimodal_model + if self.runner_type == "pooling": + os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1" + if self.runner_type == "generate" and not is_generative_model: if 
             if is_multimodal_model:
                 pass
 
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index 0e7b3fb82b4..ef76d89403b 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -855,9 +855,6 @@ def post_process_pooling(
     )
 
     if not skip_save_output:
-        if envs.FD_USE_GET_SAVE_OUTPUT_V1:
-            if save_each_rank or model_output.mp_rank == 0:
-                output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs)
-                async_output_queue.put(output)
-        else:
-            raise RuntimeError("Not supported save_output mode,Please set FD_USE_GET_SAVE_OUTPUT_V1=1 ")
+        if save_each_rank or model_output.mp_rank == 0:
+            output = _build_stream_transfer_data(output_tokens=None, pooler_outputs=pooler_output.outputs)
+            async_output_queue.put(output)

From 5b12f6fbc58f85569acf4e07d8ab030005a1b985 Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Wed, 29 Oct 2025 13:21:29 +0000
Subject: [PATCH 17/20] fix dummy_run

---
 fastdeploy/worker/gpu_model_runner.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 99233803727..1dfbb6922b0 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1546,11 +1546,8 @@ def _dummy_pooler_run_task(
         assert len(num_scheduled_tokens_list) == num_reqs
         req_num_tokens = num_tokens // num_reqs
 
-        dummy_prompt_lens = paddle.to_tensor(num_scheduled_tokens_list, dtype="int64")
-        dummy_token_ids = paddle.zeros(
-            [num_reqs, req_num_tokens],
-            dtype="int64",
-        )
+        dummy_prompt_lens = paddle.to_tensor(num_scheduled_tokens_list, dtype="int64", place=paddle.CPUPlace())
+        dummy_token_ids = paddle.zeros([num_reqs, req_num_tokens], dtype="int64", device=hidden_states.place)
         model = cast(FdModelForPooling, self.get_model())
         dummy_pooling_params = PoolingParams(task=task)
         to_update = model.pooler.get_pooling_updates(task)

From 7ca73ba1adfd2b55a65969968de3daf1e749729b Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Thu, 30 Oct 2025 08:26:36 +0000
Subject: [PATCH 18/20] fix model_path

---
 tests/pooling/test_Qwen3-Embedding_serving.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/pooling/test_Qwen3-Embedding_serving.py b/tests/pooling/test_Qwen3-Embedding_serving.py
index 3c6cb840e3b..7055c85311e 100644
--- a/tests/pooling/test_Qwen3-Embedding_serving.py
+++ b/tests/pooling/test_Qwen3-Embedding_serving.py
@@ -69,7 +69,6 @@ def setup_and_run_embedding_server():
     print("Pre-test port cleanup...")
     clean_ports()
 
-    os.environ["ENABLE_V1_KVCACHE_SCHEDULER"] = "1"
    os.environ["FD_DISABLE_CHUNKED_PREFILL"] = "1"
    os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"
 
@@ -263,7 +262,7 @@ def test_single_text_embedding(embedding_api_url, headers):
     baseline_filename = "Qwen3-Embedding-0.6B-baseline.json"
 
     if base_path:
-        baseline_file = os.path.join(base_path, baseline_filename)
+        baseline_file = os.path.join(base_path, "torch", baseline_filename)
     else:
         baseline_file = baseline_filename

From 1e3cae55cf51e5f6786e6f1903561e2c3a13b78a Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Thu, 30 Oct 2025 08:27:42 +0000
Subject: [PATCH 19/20] fix model_path

---
 tests/pooling/test_Qwen3-Embedding_serving.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/pooling/test_Qwen3-Embedding_serving.py b/tests/pooling/test_Qwen3-Embedding_serving.py
index 7055c85311e..69e93759386 100644
--- a/tests/pooling/test_Qwen3-Embedding_serving.py
+++ b/tests/pooling/test_Qwen3-Embedding_serving.py
@@ -74,7 +74,7 @@ def setup_and_run_embedding_server():
     base_path = os.getenv("MODEL_PATH")
 
     if base_path:
-        model_path = os.path.join(base_path, "Qwen3-Embedding-0.6B")
+        model_path = os.path.join(base_path, "torch", "Qwen3-Embedding-0.6B")
     else:
         model_path = "./Qwen3-Embedding-0.6B"

From 90ef1141cf781bf98fb079dc07e9218ec6ddc493 Mon Sep 17 00:00:00 2001
From: lizexu <2694294196@qq.com>
Date: Thu, 30 Oct 2025 09:31:05 +0000
Subject: [PATCH 20/20] fix dummy_run

---
 fastdeploy/worker/gpu_model_runner.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 1dfbb6922b0..92e15f7927a 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1798,22 +1798,22 @@ def _dummy_run(
                 if self.use_cudagraph:
                     model_output = model_output[: self.real_token_num]
 
-                hidden_states = rebuild_padding(
-                    model_output,
-                    self.share_inputs["cu_seqlens_q"],
-                    self.share_inputs["seq_lens_this_time"],
-                    self.share_inputs["seq_lens_decoder"],
-                    self.share_inputs["seq_lens_encoder"],
-                    (
-                        self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
-                    ),  # speculative decoding requires
-                    self.model_config.max_model_len,
-                )
-
                 if self.is_pooling_model:
+                    hidden_states = model_output
                     self._dummy_pooler_run(hidden_states, model_output)
                     break
                 else:
+                    hidden_states = rebuild_padding(
+                        model_output,
+                        self.share_inputs["cu_seqlens_q"],
+                        self.share_inputs["seq_lens_this_time"],
+                        self.share_inputs["seq_lens_decoder"],
+                        self.share_inputs["seq_lens_encoder"],
+                        (
+                            self.share_inputs["output_padding_offset"] if self.speculative_decoding else None
+                        ),  # speculative decoding requires
+                        self.model_config.max_model_len,
+                    )
                     self._dummy_sampler_run(hidden_states, model_output, accept_all_drafts, reject_all_drafts)
 
                 # 7. Updata 'infer_seed' and step_cuda()
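
Illustrative appendix (editorial, not part of the patches): the _dummy_pooler_run change in PATCH 14 warms up only the pooling task whose dummy output is largest before wiring the result through post_process. The stand-alone sketch below mirrors that selection rule in plain Python; SUPPORTED_TASKS and estimate_output_size are assumed stand-ins for illustration, not FastDeploy APIs.

# Minimal sketch of the max-output-size task selection (assumed names).
SUPPORTED_TASKS = ("embed", "classify")

def estimate_output_size(task: str) -> float:
    # Toy stand-in: a real runner would execute the task once and measure its output.
    return {"embed": 1024.0, "classify": 2.0}.get(task, 0.0)

def pick_warmup_task(tasks=SUPPORTED_TASKS) -> str:
    output_size = {task: estimate_output_size(task) for task in tasks}
    # Same rule as the patch: keep the task with the largest dummy output,
    # so the warm-up allocates worst-case buffers up front.
    return max(output_size.items(), key=lambda x: x[1])[0]

print(pick_warmup_task())  # embed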
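
Likewise, PATCH 20 settles on a control flow where pooling models consume the raw per-token hidden states while generative models first strip batch padding via rebuild_padding before sampling. The sketch below mirrors that branch under assumed names (dispatch_dummy_run, rebuild_padding_fn, and the callback arguments are illustrative, not the real signatures).

# Minimal sketch of the pooling-vs-generation branch (assumed names).
def dispatch_dummy_run(model_output, is_pooling_model, rebuild_padding_fn, pooler_run, sampler_run):
    if is_pooling_model:
        # Poolers need every token's hidden state, so no padding removal here.
        hidden_states = model_output
        return pooler_run(hidden_states, model_output)
    # Generation path: drop per-sequence padding first, then sample next tokens.
    hidden_states = rebuild_padding_fn(model_output)
    return sampler_run(hidden_states, model_output)

# Example usage with toy callables:
result = dispatch_dummy_run(
    model_output=[[0.1, 0.2], [0.3, 0.4]],
    is_pooling_model=True,
    rebuild_padding_fn=lambda x: x[:1],
    pooler_run=lambda h, m: ("pooled", len(h)),
    sampler_run=lambda h, m: ("sampled", len(h)),
)
print(result)  # ('pooled', 2)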