Skip to content

Commit 561b9f3

Browse files
authored
[BugFix] fix paddleocr prefix cache bug (#4625)
* fix paddleocr prefix cache bug
* disable prefix-caching in OCR
1 parent fff5fb5 commit 561b9f3

File tree

5 files changed

+352
-130
lines changed

5 files changed

+352
-130
lines changed

fastdeploy/engine/args_utils.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -452,6 +452,8 @@ def __post_init__(self):
452452

453453
if "PaddleOCR" in get_model_architecture(self.model, self.model_config_name):
454454
envs.FD_ENABLE_MAX_PREFILL = 1
455+
self.enable_prefix_caching = False
456+
self.max_encoder_cache = 0
455457

456458
@staticmethod
457459
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

fastdeploy/input/paddleocr_vl_processor/paddleocr_vl_processor.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,7 @@ def __init__(
4747
mm_processor_kwargs=None,
4848
reasoning_parser_obj=None,
4949
tool_parser_obj=None,
50+
enable_processor_cache=False,
5051
):
5152
"""
5253
Initialize PaddleOCRVLProcessor instance.
@@ -65,6 +66,7 @@ def __init__(
6566
processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs)
6667
self.processor = DataProcessor(
6768
model_path=model_name_or_path,
69+
enable_processor_cache=enable_processor_cache,
6870
tokens_per_second=config.vision_config.tokens_per_second,
6971
tokenizer=self.tokenizer,
7072
**processor_kwargs,
@@ -252,27 +254,21 @@ def process_request_dict(self, request, max_model_len=None):
252254

253255
return request
254256

255-
def append_generated_tokens(self, outputs, generated_token_ids):
257+
def append_generated_tokens(self, multimodal_inputs, generated_token_ids):
256258
"""
257259
Append generated tokens to existing outputs.
258260
259261
Args:
260262
outputs: Current model outputs
261263
generated_token_ids: Generated tokens to append
262264
"""
263-
out = {"input_ids": [], "token_type_ids": [], "position_ids": [], "cur_position": outputs["cur_position"]}
264-
self.processor._add_text(generated_token_ids, out)
265+
num_tokens = len(generated_token_ids)
266+
multimodal_inputs["input_ids"].extend(generated_token_ids)
267+
multimodal_inputs["token_type_ids"].extend([0] * num_tokens)
265268

266-
outputs["input_ids"] = np.concatenate(
267-
[outputs["input_ids"], np.array(out["input_ids"], dtype=np.int64)], axis=0
268-
)
269-
outputs["token_type_ids"] = np.concatenate(
270-
[outputs["token_type_ids"], np.array(out["token_type_ids"], dtype=np.int64)], axis=0
271-
)
272-
outputs["position_ids"] = np.concatenate(
273-
[outputs["position_ids"], out["position_ids"][0]], axis=1, dtype=np.int64
274-
)
275-
outputs["cur_position"] = out["cur_position"]
269+
pos_ids = self.processor._compute_text_positions(multimodal_inputs["cur_position"], num_tokens)
270+
multimodal_inputs["position_ids"].append(pos_ids)
271+
multimodal_inputs["cur_position"] += num_tokens
276272

277273
def pack_outputs(self, outputs):
278274
"""
@@ -284,6 +280,22 @@ def pack_outputs(self, outputs):
284280
Returns:
285281
dict: Packed output dictionary with all required fields
286282
"""
283+
if not outputs["images"]:
284+
outputs["images"] = None # No images case
285+
outputs["grid_thw"] = None # No spatial dimensions
286+
outputs["image_type_ids"] = None # No type IDs
287+
else:
288+
outputs["images"] = np.vstack(outputs["images"]) # Stack image features vertically
289+
outputs["grid_thw"] = np.vstack(outputs["grid_thw"]) # Stack spatial dimensions
290+
outputs["image_type_ids"] = np.array(outputs["image_type_ids"]) # Convert to numpy array
291+
292+
# Convert all outputs to numpy arrays with appropriate types
293+
outputs["input_ids"] = np.array(outputs["input_ids"], dtype=np.int64) # Token IDs as int64
294+
outputs["token_type_ids"] = np.array(outputs["token_type_ids"], dtype=np.int64) # Type IDs as int64
295+
outputs["position_ids"] = np.concatenate(
296+
outputs["position_ids"], axis=1, dtype=np.int64
297+
) # Concatenate position ID
298+
287299
outputs["image_patch_id"] = self.processor.image_token_id
288300
outputs["video_patch_id"] = self.processor.video_token_id
289301
outputs["position_ids"] = outputs["position_ids"].transpose(1, 0)

0 commit comments

Comments (0)