@@ -47,6 +47,7 @@ def __init__(
         mm_processor_kwargs=None,
         reasoning_parser_obj=None,
         tool_parser_obj=None,
+        enable_processor_cache=False,
     ):
         """
         Initialize PaddleOCRVLProcessor instance.
@@ -65,6 +66,7 @@ def __init__(
         processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
         self.processor = DataProcessor(
             model_path=model_name_or_path,
+            enable_processor_cache=enable_processor_cache,
             tokens_per_second=config.vision_config.tokens_per_second,
             tokenizer=self.tokenizer,
             **processor_kwargs,
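
Note: for reference, a hypothetical caller-side sketch of the new flag. The keyword arguments mm_processor_kwargs, reasoning_parser_obj, tool_parser_obj and enable_processor_cache come from the hunk above; model_name_or_path and config are assumptions inferred from how they are used inside __init__, and the values below are placeholders, not the full constructor signature.

processor = PaddleOCRVLProcessor(
    model_name_or_path="path/to/paddleocr-vl",  # placeholder checkpoint path
    config=model_config,                        # assumed config exposing vision_config.tokens_per_second
    mm_processor_kwargs=None,
    reasoning_parser_obj=None,
    tool_parser_obj=None,
    enable_processor_cache=True,  # new flag; defaults to False when omitted
)
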
@@ -252,27 +254,21 @@ def process_request_dict(self, request, max_model_len=None):

         return request

-    def append_generated_tokens(self, outputs, generated_token_ids):
+    def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
         """
-        Append generated tokens to existing outputs.
+        Append generated tokens to the accumulated multimodal inputs.

         Args:
-            outputs: Current model outputs
+            multimodal_inputs: Accumulated multimodal inputs to extend in place
             generated_token_ids: Generated tokens to append
         """
-        out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
-        self.processor._add_text(generated_token_ids, out)
+        num_tokens = len(generated_token_ids)
+        multimodal_inputs["input_ids"].extend(generated_token_ids)
+        multimodal_inputs["token_type_ids"].extend([0] * num_tokens)

-        outputs["input_ids"] = np.concatenate(
-            [outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0
-        )
-        outputs["token_type_ids"] = np.concatenate(
-            [outputs["token_type_ids"], np.array(out["token_type_ids"], dtype=np.int64)], axis=0
-        )
-        outputs["position_ids"] = np.concatenate(
-            [outputs["position_ids"], out["position_ids"][0]], axis=1, dtype=np.int64
-        )
-        outputs["cur_position"] = out["cur_position"]
+        pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
+        multimodal_inputs["position_ids"].append(pos_ids)
+        multimodal_inputs["cur_position"] += num_tokens

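Note: the new code path calls DataProcessor._compute_text_positions, which is not shown in this diff. For orientation, a minimal sketch of what such a helper could return, assuming the usual 3-axis (temporal/height/width) position layout where text tokens advance all axes together; this is an illustrative assumption, not the actual implementation:

import numpy as np

def _compute_text_positions(start_pos, num_tokens):
    # Assumed behavior: plain text advances every rope axis by the same amount,
    # so each of the 3 axes gets the range [start_pos, start_pos + num_tokens).
    text_pos = np.arange(start_pos, start_pos + num_tokens, dtype=np.int64)
    return np.stack([text_pos, text_pos, text_pos], axis=0)  # shape (3, num_tokens)

The (3, num_tokens) shape is at least consistent with pack_outputs below, which concatenates the per-segment arrays along axis=1 and later transposes them to (seq_len, 3).
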
     def pack_outputs(self, outputs):
         """
@@ -284,6 +280,22 @@ def pack_outputs(self, outputs):
         Returns:
             dict: Packed output dictionary with all required fields
         """
+        if not outputs["images"]:
+            outputs["images"] = None  # No images case
+            outputs["grid_thw"] = None  # No spatial dimensions
+            outputs["image_type_ids"] = None  # No type IDs
+        else:
+            outputs["images"] = np.vstack(outputs["images"])  # Stack image features vertically
+            outputs["grid_thw"] = np.vstack(outputs["grid_thw"])  # Stack spatial dimensions
+            outputs["image_type_ids"] = np.array(outputs["image_type_ids"])  # Convert to numpy array
+
+        # Convert all outputs to numpy arrays with appropriate types
+        outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64)  # Token IDs as int64
+        outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64)  # Type IDs as int64
+        outputs["position_ids"] = np.concatenate(
+            outputs["position_ids"], axis=1, dtype=np.int64
+        )  # Concatenate per-segment position IDs along the sequence axis
+
         outputs["image_patch_id"] = self.processor.image_token_id
         outputs["video_patch_id"] = self.processor.video_token_id
         outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)
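
Note: taken together, the new flow accumulates plain Python lists and defers all array packing to pack_outputs. A rough usage sketch follows; the accumulator layout is inferred from the fields accessed in this diff, and the token ids are placeholders.

# processor: a PaddleOCRVLProcessor instance (see the constructor sketch above)
multimodal_inputs = {
    "input_ids": [],        # flat list of token ids
    "token_type_ids": [],   # 0 for text, per the append path above
    "position_ids": [],     # list of (3, n) int64 arrays, one per appended segment
    "cur_position": 0,
    "images": [],
    "grid_thw": [],
    "image_type_ids": [],
}

processor.append_generated_tokens(multimodal_inputs, [101, 2054, 2003])  # placeholder ids
packed = processor.pack_outputs(multimodal_inputs)
# packed["input_ids"] is int64, packed["position_ids"] has shape (seq_len, 3),
# and the image fields are None because no images were accumulated.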