Merged

31 commits
7716866  support qwen3-embedding (lizexu123, Sep 22, 2025)
5fde033  merge develop (lizexu123, Oct 10, 2025)
001f23d  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 10, 2025)
73141d4  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 24, 2025)
6a2ddaf  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 24, 2025)
85d14ba  support qwen3-embedding-0.6b (lizexu123, Oct 24, 2025)
8200040  fix (lizexu123, Oct 24, 2025)
5832cc4  update (lizexu123, Oct 27, 2025)
58616e4  fix bug (lizexu123, Oct 27, 2025)
ad2f7b6  fix test_return_token_ids.py and update enable_thinking (lizexu123, Oct 27, 2025)
aeddcac  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 27, 2025)
955fac1  fix mtp dummy_run (lizexu123, Oct 27, 2025)
21c20a7  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 28, 2025)
0206d42  merge develop (lizexu123, Oct 28, 2025)
30795d2  fix np.float32 (lizexu123, Oct 28, 2025)
6bc1ed2  delete FD_DISABLE_CHUNKED_PREFILL and FD_USE_GET_SAVE_OUTPUT_V1 (lizexu123, Oct 28, 2025)
f439ca2  delete and build_stream_transfer_data (lizexu123, Oct 28, 2025)
27d686b  fix test_update_v1: (lizexu123, Oct 28, 2025)
a6a9483  update develop (lizexu123, Oct 28, 2025)
15a0df8  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 29, 2025)
57e76be  fix (lizexu123, Oct 29, 2025)
eae6db6  fix (lizexu123, Oct 29, 2025)
90d5ee1  update dummy_run post_process (lizexu123, Oct 29, 2025)
1a35691  delete test_update_v1 (lizexu123, Oct 29, 2025)
2fa8733  fix (lizexu123, Oct 29, 2025)
5b12f6f  fix dummy_run (lizexu123, Oct 29, 2025)
7ca73ba  fix model_path (lizexu123, Oct 30, 2025)
1e3cae5  fix model_path (lizexu123, Oct 30, 2025)
90ef114  fix dummy_run (lizexu123, Oct 30, 2025)
6c20954  Merge branch 'develop' of https://github.com/PaddlePaddle/FastDeploy … (lizexu123, Oct 30, 2025)
c8c3664  merge develop (lizexu123, Oct 31, 2025)
3 changes: 2 additions & 1 deletion custom_ops/gpu_ops/cpp_extensions.cc
@@ -441,7 +441,8 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
const paddle::Tensor& stop_nums,
const paddle::Tensor& next_tokens,
const paddle::Tensor& is_block_step,
const int block_size);
const int block_size,
const bool is_pooling_task);

void RecoverDecodeTask(
const paddle::Tensor& stop_flags,
98 changes: 62 additions & 36 deletions custom_ops/gpu_ops/update_inputs_v1.cu
@@ -33,7 +33,8 @@ __global__ void update_inputs_kernel_v1(bool* not_need_stop,
const int input_ids_stride,
const int block_num_per_seq,
const int block_size,
bool prefill_one_step_stop) {
bool prefill_one_step_stop,
bool is_pooling_task) {
int thread_idx = threadIdx.x;
typedef cub::BlockReduce<int64_t, THREADBLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
@@ -48,53 +49,75 @@ __global__ void update_inputs_kernel_v1(bool* not_need_stop,
stop_flag_now_int = 1;
}
}

if (thread_idx < bsz) {
if (stop_flag_now) {
seq_lens_this_time[thread_idx] = 0; // stop at next step
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
} else {
if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >=
prompt_lens[thread_idx]) {
if (prefill_one_step_stop) {
// prefill done, stop
stop_flags[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
stop_flag_now_int = 1;
} else {
// decoding
seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
seq_lens_this_time[thread_idx] = 1;
seq_lens_encoder[thread_idx] = 0;
int64_t* input_ids_now = input_ids + thread_idx * input_ids_stride;
input_ids_now[0] = next_tokens[thread_idx];
if (is_pooling_task) {
Review comment (Collaborator): What logic change was made to this operator?

Reply (lizexu123, Oct 28, 2025): For pooling tasks, every entry of seq_lens_encoder is set to 0 so that exist_prefill evaluates to 0, which fixes a hang.

if (seq_lens_this_time[thread_idx] > 0) {
int total_processed =
seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx];

// to judge whether block is not enough
int* block_table_now = block_tables + thread_idx * block_num_per_seq;
if (seq_lens_this_time[thread_idx] != 0 &&
block_table_now[seq_lens_decoder[thread_idx] / block_size] ==
-1) {
// should be scheduled by server
is_block_step[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
if (total_processed >= prompt_lens[thread_idx]) {
stop_flags[thread_idx] = true;
step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
seq_lens_encoder[thread_idx] = 0;
seq_lens_decoder[thread_idx] = 0;
seq_lens_this_time[thread_idx] = 0;
stop_flag_now_int = 1;
}
} else {
seq_lens_encoder[thread_idx] = 0;
stop_flag_now_int = 1;
}
} else {
stop_flags[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
topk_ids[thread_idx] = -1;
stop_flag_now_int = 1;
// Normal generation task logic
if (seq_lens_this_time[thread_idx] + seq_lens_decoder[thread_idx] >=
prompt_lens[thread_idx]) {
if (prefill_one_step_stop) {
// prefill done, stop
stop_flags[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
stop_flag_now_int = 1;
} else {
// decoding
seq_lens_decoder[thread_idx] += seq_lens_this_time[thread_idx];
seq_lens_this_time[thread_idx] = 1;
seq_lens_encoder[thread_idx] = 0;
int64_t* input_ids_now = input_ids + thread_idx * input_ids_stride;
input_ids_now[0] = next_tokens[thread_idx];

// to judge whether block is not enough
int* block_table_now =
block_tables + thread_idx * block_num_per_seq;
if (seq_lens_this_time[thread_idx] != 0 &&
block_table_now[seq_lens_decoder[thread_idx] / block_size] ==
-1) {
// should be scheduled by server
is_block_step[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
stop_flags[thread_idx] = true;
step_seq_lens_decoder[thread_idx] = seq_lens_decoder[thread_idx];
seq_lens_decoder[thread_idx] = 0;
stop_flag_now_int = 1;
}
}
} else {
stop_flags[thread_idx] = true;
seq_lens_this_time[thread_idx] = 0;
seq_lens_decoder[thread_idx] = 0;
seq_lens_encoder[thread_idx] = 0;
topk_ids[thread_idx] = -1;
stop_flag_now_int = 1;
}
}
}
}

__syncthreads();
int64_t stop_sum = BlockReduce(temp_storage).Sum(stop_flag_now_int);
if (thread_idx == 0) {
@@ -115,7 +138,8 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
const paddle::Tensor& stop_nums,
const paddle::Tensor& next_tokens,
const paddle::Tensor& is_block_step,
const int block_size) {
const int block_size,
const bool is_pooling_task) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_ctx = static_cast<const phi::CustomContext*>(
paddle::experimental::DeviceContextPool::Instance().Get(
@@ -132,6 +156,7 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
}
const int max_bsz = stop_flags.shape()[0];
const int now_bsz = seq_lens_this_time.shape()[0];
const int bsz_to_process = is_pooling_task ? max_bsz : now_bsz;
const int input_ids_stride = input_ids.shape()[1];
const int block_num_per_seq = block_tables.shape()[1];
auto not_need_stop_gpu = not_need_stop.copy_to(stop_flags.place(), false);
@@ -149,12 +174,13 @@ void UpdateInputsV1(const paddle::Tensor& stop_flags,
const_cast<bool*>(stop_flags.data<bool>()),
const_cast<bool*>(is_block_step.data<bool>()),
next_tokens.data<int64_t>(),
now_bsz,
bsz_to_process,
Review comment (Collaborator): Why does this need to change to max_bsz?

Reply (lizexu123, Author): All max-num-seqs entries of seq_lens_encoder need to be set to 0 here.

max_bsz,
input_ids_stride,
block_num_per_seq,
block_size,
prefill_one_step_stop);
prefill_one_step_stop,
is_pooling_task);
auto not_need_stop_cpu =
not_need_stop_gpu.copy_to(not_need_stop.place(), false);
bool* not_need_stop_data = const_cast<bool*>(not_need_stop.data<bool>());
@@ -175,7 +201,7 @@ PD_BUILD_STATIC_OP(update_inputs_v1)
"stop_nums",
"next_tokens",
"is_block_step"})
.Attrs({"block_size: int"})
.Attrs({"block_size: int", "is_pooling_task: bool"})
.Outputs({"not_need_stop_out",
"seq_lens_this_time_out",
"seq_lens_encoder_out",
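Taken together, the two review threads in this file's diff explain the intent: pooling requests have no decode phase, so the kernel must both stop them once the prompt is consumed and clear seq_lens_encoder across all max_bsz slots (hence bsz_to_process), so that no stale value keeps reporting a pending prefill. Below is a hedged NumPy sketch of that per-slot logic; it is an illustration only, not FastDeploy's host code, with array names mirroring the kernel.

```python
import numpy as np

def update_inputs_pooling(stop_flags, seq_lens_this_time, seq_lens_encoder,
                          seq_lens_decoder, step_seq_lens_decoder,
                          prompt_lens, max_bsz):
    for i in range(max_bsz):  # iterate over all slots, not just the active batch
        if stop_flags[i]:
            seq_lens_this_time[i] = 0
            seq_lens_decoder[i] = 0
            seq_lens_encoder[i] = 0
        elif seq_lens_this_time[i] > 0:
            total = seq_lens_this_time[i] + seq_lens_decoder[i]
            if total >= prompt_lens[i]:
                # Pooling has no decode phase: once the whole prompt has
                # been processed, the request simply stops.
                stop_flags[i] = True
                step_seq_lens_decoder[i] = seq_lens_decoder[i]
                seq_lens_encoder[i] = 0
                seq_lens_decoder[i] = 0
                seq_lens_this_time[i] = 0
        else:
            # Inactive slot: clear the encoder length too, so a check like
            # exist_prefill = (seq_lens_encoder > 0).any() cannot keep
            # seeing a phantom prefill and hang the engine loop.
            seq_lens_encoder[i] = 0

# Tiny usage example: one finished pooling request plus one padding slot.
stop_flags = np.array([False, False])
seq_lens_this_time = np.array([4, 0])
seq_lens_encoder = np.array([4, 7])   # stale value in the padding slot
seq_lens_decoder = np.array([0, 0])
step_seq_lens_decoder = np.array([0, 0])
prompt_lens = np.array([4, 0])
update_inputs_pooling(stop_flags, seq_lens_this_time, seq_lens_encoder,
                      seq_lens_decoder, step_seq_lens_decoder,
                      prompt_lens, max_bsz=2)
assert seq_lens_encoder.tolist() == [0, 0]  # no phantom prefill remains
```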
36 changes: 20 additions & 16 deletions fastdeploy/engine/common_engine.py
@@ -737,6 +737,7 @@ def _fetch_request():
raise
# 2. Schedule requests
tasks = self.resource_manager.schedule()

# 3. Send to engine
if tasks:
if self.cfg.scheduler_config.splitwise_role == "decode":
@@ -886,24 +887,27 @@ def _zmq_send_generated_tokens(self):
for request_id, contents in results.items():
new_contents = []
for content in contents:
decode_type = content.outputs.decode_type
delta_text = ""
if decode_type == 0:
delta_text, token_ids = self._decode_token(
token_ids=content.outputs.token_ids, req_id=request_id, is_end=content.finished
)
if isinstance(content, RequestOutput):
Review comment (Collaborator): State explicitly which type the else branch handles, then add a final else that raises an error.

Reply (lizexu123, Author): Right now everything that is not a generation output takes the else branch. We only have these two output types today; future ones such as reward models will also go through else. This follows Sun Lei's earlier change.

decode_type = content.outputs.decode_type
delta_text = ""
if decode_type == 0:
delta_text, token_ids = self._decode_token(
token_ids=content.outputs.token_ids, req_id=request_id, is_end=content.finished
)
else:
token_ids = content.outputs.token_ids
if len(token_ids):
content.outputs.token_ids = token_ids
content.outputs.text = delta_text
new_contents.append(content)
elif content.finished:
new_contents.append(content)
else:
llm_logger.warning(
f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}"
)
else:
token_ids = content.outputs.token_ids
if len(token_ids):
content.outputs.token_ids = token_ids
content.outputs.text = delta_text
new_contents.append(content)
elif content.finished:
new_contents.append(content)
else:
llm_logger.warning(
f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}"
)
if len(new_contents):
llm_logger.info(f"Send response for request id: {request_id}")
self.send_response_server.send_response(request_id, new_contents)
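For reference, here is a hedged sketch of the dispatch pattern the reviewer asked for. PoolingRequestOutput is a stand-in name, not a class this PR introduces; the merged code keeps a bare else that covers every non-generation output.

```python
# Stand-in classes so the sketch is self-contained; the real types live in
# FastDeploy's request/output modules.
class RequestOutput: ...            # generation output type (real name)
class PoolingRequestOutput: ...     # hypothetical pooling output type

def dispatch(content) -> str:
    if isinstance(content, RequestOutput):
        return "generation"   # incremental detokenization path
    if isinstance(content, PoolingRequestOutput):
        return "pooling"      # embeddings forwarded without detokenization
    raise TypeError(f"unexpected output type: {type(content).__name__}")

assert dispatch(RequestOutput()) == "generation"
assert dispatch(PoolingRequestOutput()) == "pooling"
```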
24 changes: 7 additions & 17 deletions fastdeploy/model_executor/layers/pooler.py
@@ -305,19 +305,6 @@ def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], poolin
return pooled_data


def build_output(
all_data: Union[paddle.Tensor, list[paddle.Tensor]],
) -> PoolerOutput:
# Pooling models D2H & synchronize occurs here
if isinstance(all_data, list):
all_data = [d.cpu() for d in all_data]
else:
all_data = all_data.cpu()

all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
return PoolerOutput(outputs=all_outputs)


class PoolingMethod(nn.Layer, ABC):

@staticmethod
@@ -473,8 +460,11 @@ def forward(
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
pooled_data = self.extract_states(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
pooling_params = get_pooling_params(pooling_metadata)
assert len(pooled_data) == len(pooling_params)

pooled_data = [self.head(d, p) for d, p in zip(pooled_data, pooling_params)]
return pooled_data


class SimplePooler(Pooler):
@@ -520,7 +510,7 @@ def forward(
) -> PoolerOutput:
pooled_data = self.pooling(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
return pooled_data


class PoolerNormalize(PoolerActivation):
@@ -567,7 +557,7 @@ def forward(
hidden_states,
pooling_metadata[offset : offset + num_items],
)
outputs.extend(group_output.outputs)
outputs.extend(group_output)
offset += num_items

return PoolerOutput(outputs)
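The diff above deletes build_output, so pooler heads now return raw per-request tensors and the caller wraps everything once at the end (the final return PoolerOutput(outputs)). A minimal sketch of that assembly, with assumed names; where exactly the device-to-host copy now happens is outside this diff, so the .cpu() call below only mirrors what the removed build_output used to do.

```python
import paddle

def assemble_pooler_output(per_request):
    # Single D2H/synchronize point, analogous to the old build_output,
    # instead of each pooler head copying its own result to host.
    return [t.cpu() for t in per_request]

pooled = [paddle.rand([8]), paddle.rand([8])]  # e.g. two pooled embeddings
outputs = assemble_pooler_output(pooled)
assert len(outputs) == 2
```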