diff --git a/README.md b/README.md index 7635611..f6c9a29 100644 --- a/README.md +++ b/README.md @@ -222,6 +222,16 @@ with GlmOcr() as parser: print(result.json_result) result.save() +# Extract printed page numbers from PP-DocLayoutV3 `number` regions +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") + print(result.to_dict().get("page_metadata", [])) + +# Export image assets with rendered and embedded outputs +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") + result.save() + # Place layout model on CPU (useful when GPU is reserved for OCR) with GlmOcr(layout_device="cpu") as parser: result = parser.parse("image.png") @@ -302,6 +312,12 @@ pipeline: # Result formatting result_formatter: output_format: both # json, markdown, or both + detect_printed_page_numbers: false + enable_image_asset_export: false + markdown_image_preference: embedded # embedded | rendered + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 # Layout model device placement layout: @@ -310,6 +326,40 @@ pipeline: See [config.yaml](glmocr/config.yaml) for all options. 
+Printed page number detection can be enabled in three ways: + +```python +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") +``` + +```powershell +$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true' +``` + +```yaml +pipeline: + result_formatter: + detect_printed_page_numbers: true +``` + +Image asset export can also be enabled from Python or YAML: + +```python +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") +``` + +```yaml +pipeline: + result_formatter: + enable_image_asset_export: true + markdown_image_preference: embedded + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 +``` + ### Output Formats Here are two examples of output formats: @@ -320,6 +370,61 @@ Here are two examples of output formats: [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]] ``` +When printed page number detection is enabled and printed page data is actually found, +the saved `paper.json` is wrapped in a top-level object that includes: + +```json +{ + "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]], + "page_number_candidates": [ + { + "page_index": 1, + "label": "number", + "content": "22", + "layout_index": 0, + "bbox_2d": [92, 26, 120, 41], + "layout_score": 0.77, + "numeric_like": true, + "roman_like": false + } + ], + "document_page_numbering": { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 21, + "candidate_pages": 4 + }, + "page_metadata": [ + { + "page_index": 1, + "printed_page_label": "22", + "printed_page_block_index": 0, + "printed_page_bbox_2d": [92, 26, 120, 41], + "printed_page_confidence": 0.77 + } + ] +} +``` + +When image asset export is enabled, image-like blocks can additionally expose: + +```json +{ + "label": "image", + "image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "rendered_image_path": 
"imgs_rendered/cropped_page2_idx0.jpg", + "embedded_image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "image_asset_source": "embedded" +} +``` + +Behavior summary: +- rendered image assets are written to `imgs_rendered/` +- if `enable_image_asset_export=true`, matched embedded PDF images are also written to `imgs_embedded/` +- `image_path` follows `markdown_image_preference` +- `embedded_image_path` is `null` when no embedded match exists + - Markdown ```markdown diff --git a/README_zh.md b/README_zh.md index 8a2f3fe..c63a66d 100644 --- a/README_zh.md +++ b/README_zh.md @@ -214,6 +214,16 @@ with GlmOcr() as parser: result = parser.parse("image.png") print(result.json_result) result.save() + +# 从 PP-DocLayoutV3 的 `number` 区域提取印刷页码 +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") + print(result.to_dict().get("page_metadata", [])) + +# 导出渲染图像与嵌入式 PDF 图像资产 +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") + result.save() ``` #### Flask 服务 @@ -287,10 +297,50 @@ pipeline: # Result formatting result_formatter: output_format: both # json, markdown, or both + detect_printed_page_numbers: false + enable_image_asset_export: false + markdown_image_preference: embedded # embedded | rendered + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 ``` 更多选项请参考 [config.yaml](glmocr/config.yaml)。 +印刷页码检测支持以下三种启用方式: + +```python +with GlmOcr(detect_printed_page_numbers=True) as parser: + result = parser.parse("document.pdf") +``` + +```powershell +$env:GLMOCR_DETECT_PRINTED_PAGE_NUMBERS = 'true' +``` + +```yaml +pipeline: + result_formatter: + detect_printed_page_numbers: true +``` + +图像资产导出也可以通过 Python 或 YAML 启用: + +```python +with GlmOcr(enable_image_asset_export=True) as parser: + result = parser.parse("document.pdf") +``` + +```yaml +pipeline: + result_formatter: + enable_image_asset_export: true + markdown_image_preference: 
embedded + image_match_iou_threshold: 0.5 + image_match_containment_threshold: 0.8 + rendered_image_dpi: 300 +``` + ### 输出格式 这里给出两种输出格式示例: @@ -301,6 +351,60 @@ pipeline: [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]] ``` +启用印刷页码检测且实际检测到印刷页码数据时,保存的 `paper.json` 会变成顶层对象,并包含: + +```json +{ + "json_result": [[{ "index": 0, "label": "text", "content": "...", "bbox_2d": null }]], + "page_number_candidates": [ + { + "page_index": 1, + "label": "number", + "content": "22", + "layout_index": 0, + "bbox_2d": [92, 26, 120, 41], + "layout_score": 0.77, + "numeric_like": true, + "roman_like": false + } + ], + "document_page_numbering": { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 21, + "candidate_pages": 4 + }, + "page_metadata": [ + { + "page_index": 1, + "printed_page_label": "22", + "printed_page_block_index": 0, + "printed_page_bbox_2d": [92, 26, 120, 41], + "printed_page_confidence": 0.77 + } + ] +} +``` + +启用图像资产导出后,图像类区域还会额外暴露: + +```json +{ + "label": "image", + "image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "rendered_image_path": "imgs_rendered/cropped_page2_idx0.jpg", + "embedded_image_path": "imgs_embedded/embedded_page2_idx2_xref199.jpeg", + "image_asset_source": "embedded" +} +``` + +行为说明: +- 渲染图像资产写入 `imgs_rendered/` +- 当 `enable_image_asset_export=true` 时,匹配成功的嵌入式 PDF 图像还会写入 `imgs_embedded/` +- `image_path` 会根据 `markdown_image_preference` 选择最终引用的资产 +- 若没有嵌入式匹配,`embedded_image_path` 为 `null` + - Markdown ```markdown diff --git a/glmocr/api.py b/glmocr/api.py index dcf5570..59f995f 100644 --- a/glmocr/api.py +++ b/glmocr/api.py @@ -29,8 +29,8 @@ from glmocr.config import load_config from glmocr.parser_result import PipelineResult +from glmocr.utils.image_asset_utils import export_image_assets from glmocr.utils.logging import get_logger, ensure_logging_configured -from glmocr.utils.markdown_utils import resolve_image_regions logger = get_logger(__name__) @@ -84,6 
+84,7 @@ def __init__( ocr_api_port: Optional[int] = None, cuda_visible_devices: Optional[str] = None, layout_device: Optional[str] = None, + detect_printed_page_numbers: Optional[bool] = None, **kwargs: Any, ): """Initialize GlmOcr. @@ -130,6 +131,7 @@ def __init__( ocr_api_port=ocr_api_port, cuda_visible_devices=cuda_visible_devices, layout_device=layout_device, + detect_printed_page_numbers=detect_printed_page_numbers, **kwargs, ) # Apply logging config for API/SDK usage. @@ -441,8 +443,11 @@ def _maas_response_to_pipeline_result( { "index": region.get("index", 0), "label": region.get("label", "text"), + "native_label": region.get("label", "text"), "content": region.get("content", ""), "bbox_2d": bbox, + "layout_index": region.get("index", 0), + "layout_score": float(region.get("score") or 0.0), } ) json_result.append(page_result) @@ -454,18 +459,43 @@ def _maas_response_to_pipeline_result( pages_info, ) - json_result, markdown_result, image_files = resolve_image_regions( + json_result, markdown_result, image_files = export_image_assets( json_result, markdown_result, source, + enable_image_asset_export=self.config_model.pipeline.result_formatter.enable_image_asset_export, + markdown_image_preference=self.config_model.pipeline.result_formatter.markdown_image_preference, + image_match_iou_threshold=self.config_model.pipeline.result_formatter.image_match_iou_threshold, + image_match_containment_threshold=self.config_model.pipeline.result_formatter.image_match_containment_threshold, + rendered_image_dpi=self.config_model.pipeline.result_formatter.rendered_image_dpi, ) + page_metadata = None + page_number_candidates = None + document_page_numbering = None + if self.config_model.pipeline.result_formatter.detect_printed_page_numbers: + from glmocr.postprocess import ResultFormatter + + formatter = ResultFormatter(self.config_model.pipeline.result_formatter) + ( + page_number_candidates, + document_page_numbering, + page_metadata, + ) = 
formatter.extract_printed_page_data(json_result) + + from glmocr.postprocess import ResultFormatter + + ResultFormatter._strip_layout_metadata(json_result) + # Create PipelineResult result = PipelineResult( json_result=json_result, markdown_result=markdown_result, original_images=[source], image_files=image_files or None, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) # Store additional MaaS response data diff --git a/glmocr/config.py b/glmocr/config.py index 789e206..d8123b8 100644 --- a/glmocr/config.py +++ b/glmocr/config.py @@ -4,7 +4,7 @@ import os from pathlib import Path -from typing import Any, Dict, Optional, Union, List +from typing import Any, Dict, Optional, Union, List, Literal import yaml from dotenv import dotenv_values @@ -48,6 +48,13 @@ def _find_dotenv(start: Optional[Path] = None) -> Optional[Path]: "LAYOUT_CUDA_VISIBLE_DEVICES": "pipeline.layout.cuda_visible_devices", # Explicit device for layout model: "cpu", "cuda", "cuda:0", etc. 
"LAYOUT_DEVICE": "pipeline.layout.device", + # Result formatter + "DETECT_PRINTED_PAGE_NUMBERS": "pipeline.result_formatter.detect_printed_page_numbers", + "ENABLE_IMAGE_ASSET_EXPORT": "pipeline.result_formatter.enable_image_asset_export", + "MARKDOWN_IMAGE_PREFERENCE": "pipeline.result_formatter.markdown_image_preference", + "IMAGE_MATCH_IOU_THRESHOLD": "pipeline.result_formatter.image_match_iou_threshold", + "IMAGE_MATCH_CONTAINMENT_THRESHOLD": "pipeline.result_formatter.image_match_containment_threshold", + "RENDERED_IMAGE_DPI": "pipeline.result_formatter.rendered_image_dpi", # Logging "LOG_LEVEL": "logging.level", } @@ -175,8 +182,25 @@ class ResultFormatterConfig(_BaseConfig): enable_merge_formula_numbers: bool = True enable_merge_text_blocks: bool = True enable_format_bullet_points: bool = True + detect_printed_page_numbers: bool = False + enable_image_asset_export: bool = False + markdown_image_preference: Literal["embedded", "rendered"] = "embedded" + image_match_iou_threshold: float = 0.5 + image_match_containment_threshold: float = 0.8 + rendered_image_dpi: int = 300 label_visualization_mapping: Dict[str, Any] = Field(default_factory=dict) + @field_validator("markdown_image_preference") + @classmethod + def _validate_markdown_image_preference( + cls, value: str + ) -> Literal["embedded", "rendered"]: + if value not in ("embedded", "rendered"): + raise ValueError( + "markdown_image_preference must be 'embedded' or 'rendered'" + ) + return value + class LayoutConfig(_BaseConfig): model_dir: Optional[str] = None @@ -260,9 +284,22 @@ def _coerce_env_value(dotted_path: str, raw: str) -> Any: # Boolean fields if dotted_path == "pipeline.maas.enabled": return raw.strip().lower() in ("maas", "true", "1", "yes") + if dotted_path == "pipeline.result_formatter.detect_printed_page_numbers": + return raw.strip().lower() in ("true", "1", "yes", "on") + if dotted_path == "pipeline.result_formatter.enable_image_asset_export": + return raw.strip().lower() in ("true", "1", 
"yes", "on") # Integer fields if dotted_path.endswith((".api_port", ".request_timeout", ".connect_timeout")): return int(raw) + if dotted_path == "pipeline.result_formatter.rendered_image_dpi": + return int(raw) + if dotted_path.endswith( + ( + ".image_match_iou_threshold", + ".image_match_containment_threshold", + ) + ): + return float(raw) return raw @@ -429,6 +466,12 @@ def from_env( "mode": "pipeline.maas.enabled", "timeout": "pipeline.maas.request_timeout", "log_level": "logging.level", + "detect_printed_page_numbers": "pipeline.result_formatter.detect_printed_page_numbers", + "enable_image_asset_export": "pipeline.result_formatter.enable_image_asset_export", + "markdown_image_preference": "pipeline.result_formatter.markdown_image_preference", + "image_match_iou_threshold": "pipeline.result_formatter.image_match_iou_threshold", + "image_match_containment_threshold": "pipeline.result_formatter.image_match_containment_threshold", + "rendered_image_dpi": "pipeline.result_formatter.rendered_image_dpi", # Self-hosted OCR API "ocr_api_host": "pipeline.ocr_api.api_host", "ocr_api_port": "pipeline.ocr_api.api_port", diff --git a/glmocr/config.yaml b/glmocr/config.yaml index 8c287fe..5679570 100644 --- a/glmocr/config.yaml +++ b/glmocr/config.yaml @@ -164,6 +164,7 @@ pipeline: - content - doc_title - figure_title + - number - paragraph_title - reference_content - text @@ -256,6 +257,7 @@ pipeline: - content - doc_title - figure_title + - number - paragraph_title - reference_content - text @@ -274,7 +276,6 @@ pipeline: abandon: - header - footer - - number - footnote - aside_text - reference diff --git a/glmocr/layout/layout_detector.py b/glmocr/layout/layout_detector.py index c4f1680..1e73970 100644 --- a/glmocr/layout/layout_detector.py +++ b/glmocr/layout/layout_detector.py @@ -2,15 +2,19 @@ from __future__ import annotations +from pathlib import Path from typing import TYPE_CHECKING, List, Dict import cv2 import torch import numpy as np from PIL import Image +from 
huggingface_hub import hf_hub_download +from safetensors.torch import load_file from transformers import ( + PPDocLayoutV3Config, PPDocLayoutV3ForObjectDetection, - PPDocLayoutV3ImageProcessorFast, + PPDocLayoutV3ImageProcessor, ) from glmocr.layout.base import BaseLayoutDetector @@ -56,14 +60,55 @@ def __init__(self, config: "LayoutConfig"): self._image_processor = None self._device = None + def _resolve_model_weights_path(self) -> Path: + """Return the local safetensors path for the configured model.""" + model_path = Path(self.model_dir) + if model_path.is_dir(): + return model_path / "model.safetensors" + return Path( + hf_hub_download(repo_id=self.model_dir, filename="model.safetensors") + ) + + def _prepare_pp_doclayout_state_dict(self) -> Dict[str, torch.Tensor]: + """Alias tied PP-DocLayoutV3 detection head weights before model load.""" + state_dict = load_file(str(self._resolve_model_weights_path())) + alias_pairs = { + "model.enc_score_head.weight": "model.decoder.class_embed.weight", + "model.enc_score_head.bias": "model.decoder.class_embed.bias", + "model.enc_bbox_head.layers.0.weight": "model.decoder.bbox_embed.layers.0.weight", + "model.enc_bbox_head.layers.0.bias": "model.decoder.bbox_embed.layers.0.bias", + "model.enc_bbox_head.layers.1.weight": "model.decoder.bbox_embed.layers.1.weight", + "model.enc_bbox_head.layers.1.bias": "model.decoder.bbox_embed.layers.1.bias", + "model.enc_bbox_head.layers.2.weight": "model.decoder.bbox_embed.layers.2.weight", + "model.enc_bbox_head.layers.2.bias": "model.decoder.bbox_embed.layers.2.bias", + } + + remapped = [] + for source_key, target_key in alias_pairs.items(): + if source_key in state_dict and target_key not in state_dict: + state_dict[target_key] = state_dict[source_key].clone() + remapped.append(target_key) + + if remapped: + logger.warning( + "Prepared PP-DocLayoutV3 state dict with decoder head aliases: %s", + ", ".join(remapped), + ) + + return state_dict + def start(self): """Load model and 
processor once in the main process.""" logger.debug("Initializing PP-DocLayoutV3...") - self._image_processor = PPDocLayoutV3ImageProcessorFast.from_pretrained( + self._image_processor = PPDocLayoutV3ImageProcessor.from_pretrained( self.model_dir ) - self._model = PPDocLayoutV3ForObjectDetection.from_pretrained(self.model_dir) + self._model = PPDocLayoutV3ForObjectDetection.from_pretrained( + None, + config=PPDocLayoutV3Config.from_pretrained(self.model_dir), + state_dict=self._prepare_pp_doclayout_state_dict(), + ) self._model.eval() # Device selection priority: diff --git a/glmocr/parser_result/base.py b/glmocr/parser_result/base.py index 0b996c2..5a4b423 100644 --- a/glmocr/parser_result/base.py +++ b/glmocr/parser_result/base.py @@ -30,6 +30,9 @@ def __init__( original_images: Optional[List[str]] = None, image_files: Optional[Dict[str, Any]] = None, raw_json_result: Optional[list] = None, + page_metadata: Optional[List[Dict[str, Any]]] = None, + page_number_candidates: Optional[List[Dict[str, Any]]] = None, + document_page_numbering: Optional[Dict[str, Any]] = None, ): """Initialize. @@ -37,10 +40,13 @@ def __init__( json_result: JSON result (string, dict, or list). markdown_result: Markdown result (optional). original_images: Original image paths. - image_files: Mapping of ``filename`` → PIL Image for image-type - regions, to be saved under ``imgs/`` during :meth:`save`. + image_files: Mapping of relative output path → image asset payload + for image-type regions, to be saved during :meth:`save`. raw_json_result: Raw model output before post-processing; saved as ``{name}_model.json`` alongside the final result. + page_metadata: Derived per-page printed page metadata. + page_number_candidates: Raw printed page-number candidates. + document_page_numbering: Document-level numbering inference. 
""" if isinstance(json_result, str): try: @@ -56,6 +62,9 @@ def __init__( ] self.image_files = image_files self.raw_json_result = raw_json_result + self.page_metadata = page_metadata + self.page_number_candidates = page_number_candidates + self.document_page_numbering = document_page_numbering @abstractmethod def save( @@ -88,6 +97,27 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None: json_data = json.loads(json_data) except json.JSONDecodeError: pass + + has_printed_page_data = ( + bool(self.page_metadata) + or bool(self.page_number_candidates) + or self.document_page_numbering is not None + ) + + if has_printed_page_data: + json_data = { + "json_result": json_data, + "page_metadata": ( + self.page_metadata if self.page_metadata is not None else [] + ), + "page_number_candidates": ( + self.page_number_candidates + if self.page_number_candidates is not None + else [] + ), + "document_page_numbering": self.document_page_numbering, + } + with open(json_file, "w", encoding="utf-8") as f: if isinstance(json_data, (dict, list)): json.dump(json_data, f, ensure_ascii=False, indent=2) @@ -114,13 +144,16 @@ def _save_json_and_markdown(self, output_dir: Union[str, Path]) -> None: # Image files produced by the result formatter if self.image_files: - imgs_dir = output_path / "imgs" - imgs_dir.mkdir(parents=True, exist_ok=True) - for filename, img in self.image_files.items(): + for rel_path, img in self.image_files.items(): try: - img.save(imgs_dir / filename, quality=95) + target = output_path / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + if isinstance(img, (bytes, bytearray)): + target.write_bytes(bytes(img)) + else: + img.save(target, quality=95) except Exception as e: - logger.warning("Failed to save image %s: %s", filename, e) + logger.warning("Failed to save image %s: %s", rel_path, e) self.image_files = None def to_dict(self) -> dict: @@ -134,6 +167,12 @@ def to_dict(self) -> dict: "markdown_result": self.markdown_result or "", 
"original_images": self.original_images, } + if self.page_metadata is not None: + d["page_metadata"] = self.page_metadata + if self.page_number_candidates is not None: + d["page_number_candidates"] = self.page_number_candidates + if self.document_page_numbering is not None: + d["document_page_numbering"] = self.document_page_numbering # Include optional metadata set by MaaS mode. for attr in ("_usage", "_data_info", "_error"): val = getattr(self, attr, None) diff --git a/glmocr/parser_result/pipeline_result.py b/glmocr/parser_result/pipeline_result.py index 800084c..7a9e9d9 100644 --- a/glmocr/parser_result/pipeline_result.py +++ b/glmocr/parser_result/pipeline_result.py @@ -26,6 +26,9 @@ def __init__( image_files: Optional[dict] = None, raw_json_result: Optional[list] = None, layout_vis_images: Optional[Dict[int, Any]] = None, + page_metadata: Optional[List[Dict[str, Any]]] = None, + page_number_candidates: Optional[List[Dict[str, Any]]] = None, + document_page_numbering: Optional[Dict[str, Any]] = None, ): """Initialize. @@ -38,6 +41,9 @@ def __init__( raw_json_result: Raw model output before post-processing (optional). layout_vis_images: Mapping of ``page_idx`` → PIL Image for layout visualization; saved to ``layout_vis/`` during :meth:`save`. + page_metadata: Derived per-page printed page metadata. + page_number_candidates: Raw printed page-number candidates. + document_page_numbering: Document-level numbering inference. 
""" super().__init__( json_result=json_result, @@ -45,6 +51,9 @@ def __init__( original_images=original_images, image_files=image_files, raw_json_result=raw_json_result, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) self.layout_vis_images = layout_vis_images diff --git a/glmocr/pipeline/pipeline.py b/glmocr/pipeline/pipeline.py index 699f48d..0705099 100644 --- a/glmocr/pipeline/pipeline.py +++ b/glmocr/pipeline/pipeline.py @@ -15,6 +15,7 @@ from __future__ import annotations +import json import time import threading from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional @@ -23,6 +24,7 @@ from glmocr.ocr_client import OCRClient from glmocr.parser_result import PipelineResult from glmocr.postprocess import ResultFormatter +from glmocr.utils.image_asset_utils import export_image_assets from glmocr.utils.logging import get_logger from glmocr.pipeline._common import ( @@ -362,6 +364,22 @@ def _emit_results( grouped, cropped_images=cropped_images or None, ) + parsed_json = json.loads(json_u) + parsed_json, md_u, image_files = export_image_assets( + parsed_json, + md_u, + original_inputs[u], + enable_image_asset_export=self.config.result_formatter.enable_image_asset_export, + markdown_image_preference=self.config.result_formatter.markdown_image_preference, + image_match_iou_threshold=self.config.result_formatter.image_match_iou_threshold, + image_match_containment_threshold=self.config.result_formatter.image_match_containment_threshold, + rendered_image_dpi=self.config.result_formatter.rendered_image_dpi, + rendered_images=image_files or None, + ) + json_u = json.dumps(parsed_json, ensure_ascii=False, indent=2) + page_metadata = self.result_formatter.page_metadata + page_number_candidates = self.result_formatter.page_number_candidates + document_page_numbering = self.result_formatter.document_page_numbering vis_images = {} for pi in page_indices: @@ -378,6 +396,9 @@ def 
_emit_results( image_files=image_files or None, raw_json_result=raw_json, layout_vis_images=vis_images or None, + page_metadata=page_metadata, + page_number_candidates=page_number_candidates, + document_page_numbering=document_page_numbering, ) built.add(u) if preserve_order: diff --git a/glmocr/postprocess/result_formatter.py b/glmocr/postprocess/result_formatter.py index 8b31d5d..6797b03 100644 --- a/glmocr/postprocess/result_formatter.py +++ b/glmocr/postprocess/result_formatter.py @@ -12,10 +12,11 @@ from __future__ import annotations +import collections import re import json from copy import deepcopy -from typing import TYPE_CHECKING, List, Dict, Tuple, Any +from typing import TYPE_CHECKING, List, Dict, Tuple, Any, Optional try: # Optional dependency for better English word validation quality. from wordfreq import zipf_frequency @@ -70,6 +71,10 @@ def __init__(self, config: "ResultFormatterConfig"): self.enable_merge_formula_numbers = config.enable_merge_formula_numbers self.enable_merge_text_blocks = config.enable_merge_text_blocks self.enable_format_bullet_points = config.enable_format_bullet_points + self.detect_printed_page_numbers = config.detect_printed_page_numbers + self.page_metadata: Optional[List[Dict[str, Any]]] = None + self.page_number_candidates: Optional[List[Dict[str, Any]]] = None + self.document_page_numbering: Optional[Dict[str, Any]] = None # ========================================================================= # OCR-only mode @@ -158,8 +163,12 @@ def process( Returns: (json_str, markdown_str, image_files) where *image_files* maps - ``filename`` → PIL Image for the caller to persist. + relative output path → PIL Image for the caller to persist. 
""" + self.page_metadata = None + self.page_number_candidates = None + self.document_page_numbering = None + json_final_results = [] with profiler.measure("format_regions"): @@ -173,6 +182,12 @@ def process( for item in sorted_results: result = deepcopy(item) + result["layout_index"] = result.get( + "layout_index", result.get("index", 0) + ) + result["layout_score"] = float( + result.get("layout_score", result.get("score") or 0.0) + ) result["native_label"] = result.get("label", "text") # Map labels @@ -215,6 +230,15 @@ def process( json_final_results.append(json_page_results) + if self.detect_printed_page_numbers: + ( + self.page_number_candidates, + self.document_page_numbering, + self.page_metadata, + ) = self.extract_printed_page_data(json_final_results) + + self._strip_layout_metadata(json_final_results) + # Generate markdown results and resolve image regions image_files: Dict[str, Any] = {} image_counter = 0 @@ -234,9 +258,12 @@ def process( filename = ( f"{image_prefix}_page{page_idx}_idx{image_counter}.jpg" ) - rel_path = f"imgs/{filename}" - image_files[filename] = img + rel_path = f"imgs_rendered/{filename}" + image_files[rel_path] = img result["image_path"] = rel_path + result["rendered_image_path"] = rel_path + result["embedded_image_path"] = None + result["image_asset_source"] = "rendered" markdown_page_results.append( f"![Image {page_idx}-{image_counter}]({rel_path})" ) @@ -251,6 +278,190 @@ def process( return json_str, markdown_str, image_files + def extract_printed_page_data( + self, + pages: List[List[Dict[str, Any]]], + ) -> Tuple[ + List[Dict[str, Any]], + Optional[Dict[str, Any]], + List[Dict[str, Any]], + ]: + """Extract number candidates and derived printed page metadata.""" + candidates = self._extract_page_number_candidates(pages) + document_page_numbering = self._infer_document_page_numbering(candidates) + page_metadata = self._build_printed_page_metadata(candidates) + return candidates, document_page_numbering, page_metadata + + def 
_extract_page_number_candidates( + self, + pages: List[List[Dict[str, Any]]], + ) -> List[Dict[str, Any]]: + """Extract raw `number` candidates for printed page inference.""" + candidates: List[Dict[str, Any]] = [] + for page_index, page_blocks in enumerate(pages): + for block in page_blocks: + candidate = self._build_page_number_candidate(page_index, block) + if candidate is not None: + candidates.append(candidate) + return candidates + + def _build_page_number_candidate( + self, + page_index: int, + block: Dict[str, Any], + ) -> Optional[Dict[str, Any]]: + """Build a normalized page-number candidate from one layout block.""" + if block.get("native_label") != "number": + return None + + bbox = block.get("bbox_2d") + if not isinstance(bbox, list) or len(bbox) != 4: + return None + + label = self._normalize_printed_page_label(block.get("content")) + if label is None: + return None + + x1, y1, x2, y2 = bbox + width = x2 - x1 + height = y2 - y1 + if width <= 0 or height <= 0 or width > 140 or height > 120: + return None + if not self._is_margin_candidate(x1, y1, x2, y2): + return None + + return { + "page_index": page_index, + "label": "number", + "content": label, + "layout_index": block.get("layout_index", block.get("index", 0)), + "bbox_2d": bbox, + "layout_score": float(block.get("layout_score") or 0.0), + "numeric_like": label.isdigit(), + "roman_like": self._is_roman_like(label), + } + + @staticmethod + def _is_margin_candidate(x1: int, y1: int, x2: int, y2: int) -> bool: + """Return whether a candidate lies in a plausible page-margin folio area.""" + in_margin_band = y1 <= 120 or y2 >= 880 + in_outer_margin = x1 <= 180 or x2 >= 820 + return in_margin_band and in_outer_margin + + @staticmethod + def _is_roman_like(content: str) -> bool: + """Check whether a label looks like a Roman numeral folio.""" + return bool(re.fullmatch(r"(?i)[ivxlcdm]+", content)) + + def _infer_document_page_numbering( + self, + candidates: List[Dict[str, Any]], + ) -> Optional[Dict[str, 
Any]]: + """Infer document-level numbering from number-only candidates.""" + if not candidates: + return None + + best_candidates = self._best_candidates_by_page(candidates) + page_count = len(best_candidates) + numeric_candidates = [c for c in best_candidates if c["numeric_like"]] + roman_candidates = [c for c in best_candidates if c["roman_like"]] + + if numeric_candidates: + offsets = collections.Counter( + int(c["content"]) - int(c["page_index"]) for c in numeric_candidates + ) + page_offset, support = offsets.most_common(1)[0] + return { + "strategy": "visual_sequence", + "confidence": round(support / max(1, page_count), 3), + "sequence_type": "arabic", + "page_offset": page_offset, + "candidate_pages": page_count, + } + + if roman_candidates: + return { + "strategy": "visual_sequence", + "confidence": round(len(roman_candidates) / max(1, page_count), 3), + "sequence_type": "roman", + "page_offset": None, + "candidate_pages": len(roman_candidates), + } + + return None + + def _build_printed_page_metadata( + self, + candidates: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + """Build per-page printed page metadata from selected candidates.""" + if not candidates: + return [] + + metadata: List[Dict[str, Any]] = [] + for candidate in self._best_candidates_by_page(candidates): + metadata.append( + { + "page_index": candidate["page_index"], + "printed_page_label": candidate["content"], + "printed_page_block_index": candidate["layout_index"], + "printed_page_bbox_2d": candidate["bbox_2d"], + "printed_page_confidence": candidate["layout_score"], + } + ) + return metadata + + def _best_candidates_by_page( + self, + candidates: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + """Select the strongest candidate per page.""" + by_page: Dict[int, List[Dict[str, Any]]] = collections.defaultdict(list) + for candidate in candidates: + by_page[int(candidate["page_index"])].append(candidate) + return [ + min(by_page[page_index], key=self._candidate_sort_key) + for 
page_index in sorted(by_page) + ] + + @staticmethod + def _candidate_sort_key(block: Dict[str, Any]) -> tuple[int, int, int, int]: + """Prefer blocks nearest to outer top/bottom page margins.""" + bbox = block.get("bbox_2d") or [0, 0, 1000, 1000] + x1, y1, x2, y2 = bbox + top_distance = y1 + bottom_distance = 1000 - y2 + edge_distance = min(top_distance, bottom_distance) + side_distance = min(x1, 1000 - x2) + return ( + edge_distance, + side_distance, + -int(block.get("layout_score", 0) * 1000), + int(block.get("layout_index", block.get("index", 0))), + ) + + @staticmethod + def _normalize_printed_page_label(content: Any) -> Optional[str]: + """Normalize OCR text from a printed page-number candidate.""" + if not isinstance(content, str): + return None + label = content.strip() + if not label or len(label) > 12: + return None + if not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9\-./]*", label): + return None + if not (re.search(r"\d", label) or ResultFormatter._is_roman_like(label)): + return None + return label + + @staticmethod + def _strip_layout_metadata(pages: List[List[Dict[str, Any]]]) -> None: + """Remove broad layout-only metadata from final JSON blocks.""" + for page in pages: + for block in page: + block.pop("layout_index", None) + block.pop("layout_score", None) + # ========================================================================= # Content handling # ========================================================================= diff --git a/glmocr/tests/test_unit.py b/glmocr/tests/test_unit.py index 62ba50d..d2a5473 100644 --- a/glmocr/tests/test_unit.py +++ b/glmocr/tests/test_unit.py @@ -1,6 +1,7 @@ """Unit tests for glmocr (no external services required).""" import json +import tempfile from pathlib import Path from unittest.mock import MagicMock, patch @@ -24,6 +25,25 @@ def test_config_to_dict(self): cfg = load_config().to_dict() assert isinstance(cfg, dict) + def test_default_config_routes_number_to_text_ocr(self): + """Default SDK config preserves 
PP-DocLayoutV3 number regions for OCR.""" + from glmocr.config import load_config + + cfg = load_config() + text_labels = cfg.pipeline.layout.label_task_mapping["text"] + assert "number" in text_labels + + def test_image_asset_export_defaults(self): + """Image asset export defaults remain conservative and opt-in.""" + from glmocr.config import ResultFormatterConfig + + cfg = ResultFormatterConfig() + assert cfg.enable_image_asset_export is False + assert cfg.markdown_image_preference == "embedded" + assert cfg.image_match_iou_threshold == 0.5 + assert cfg.image_match_containment_threshold == 0.8 + assert cfg.rendered_image_dpi == 300 + class TestLayoutDeviceUnit: """Unit tests for layout device selection and config plumbing (mocked).""" @@ -126,9 +146,14 @@ def test_detector_device_selection_explicit_cpu(self): return_value=mock_model, ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() @@ -145,9 +170,14 @@ def test_detector_device_selection_explicit_cuda(self): return_value=mock_model, ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() @@ -166,10 +196,15 @@ def test_detector_device_selection_auto_fallback_cpu(self): return_value=mock_model, ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + 
"glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), patch.object(torch.cuda, "is_available", return_value=False), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() @@ -199,15 +234,67 @@ def test_detector_device_selection_auto_cuda(self): return_value=mock_model, ), patch( - "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessorFast.from_pretrained", + "glmocr.layout.layout_detector.PPDocLayoutV3Config.from_pretrained", + return_value=MagicMock(), + ), + patch( + "glmocr.layout.layout_detector.PPDocLayoutV3ImageProcessor.from_pretrained", return_value=mock_proc, ), patch.object(torch.cuda, "is_available", return_value=True), + patch.object(det, "_prepare_pp_doclayout_state_dict", return_value={}), ): det.start() assert det._device == "cuda:1" + def test_detector_prepares_pp_doclayout_decoder_head_aliases(self): + """State dict preparation aliases tied encoder head weights for load.""" + torch = self._require_layout_runtime() + + det, _, _ = self._mock_detector("cpu") + state_dict = { + "model.enc_score_head.weight": torch.full((3, 4), 1.25), + "model.enc_score_head.bias": torch.full((3,), 2.5), + "model.enc_bbox_head.layers.0.weight": torch.full((4, 4), 1.0), + "model.enc_bbox_head.layers.0.bias": torch.full((4,), 11.0), + "model.enc_bbox_head.layers.1.weight": torch.full((4, 4), 2.0), + "model.enc_bbox_head.layers.1.bias": torch.full((4,), 12.0), + "model.enc_bbox_head.layers.2.weight": torch.full((4, 4), 3.0), + "model.enc_bbox_head.layers.2.bias": torch.full((4,), 13.0), + } + + with ( + patch.object( + det, + "_resolve_model_weights_path", + return_value=Path("dummy-model.safetensors"), + ), + patch( + "glmocr.layout.layout_detector.load_file", + return_value=state_dict.copy(), + ), + ): + prepared = det._prepare_pp_doclayout_state_dict() + + assert 
torch.equal( + prepared["model.decoder.class_embed.weight"], + state_dict["model.enc_score_head.weight"], + ) + assert torch.equal( + prepared["model.decoder.class_embed.bias"], + state_dict["model.enc_score_head.bias"], + ) + for idx in range(3): + assert torch.equal( + prepared[f"model.decoder.bbox_embed.layers.{idx}.weight"], + state_dict[f"model.enc_bbox_head.layers.{idx}.weight"], + ) + assert torch.equal( + prepared[f"model.decoder.bbox_embed.layers.{idx}.bias"], + state_dict[f"model.enc_bbox_head.layers.{idx}.bias"], + ) + class TestPageLoader: """Tests for PageLoader.""" @@ -494,6 +581,164 @@ def test_result_formatter_clean_content(self): cleaned = formatter._clean_content("Hello....World") assert "....." not in cleaned + def test_result_formatter_feature_off_keeps_json_result_lean(self): + """Feature disabled does not leak broad layout metadata into json_result.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter(ResultFormatterConfig()) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [944, 12, 972, 42], + "score": 0.88, + } + ] + ] + + parsed = json.loads(formatter.process(grouped_results)[0]) + assert parsed[0][0]["native_label"] == "number" + assert "layout_index" not in parsed[0][0] + assert "layout_score" not in parsed[0][0] + + def test_result_formatter_extracts_page_number_data(self): + """Formatter extracts printed page data from number blocks.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [944, 12, 972, 42], + "score": 0.88, + } + ] + ] + + formatter.process(grouped_results) + + assert formatter.page_number_candidates[0]["layout_index"] == 7 + assert 
formatter.page_number_candidates[0]["layout_score"] == 0.88 + assert formatter.page_metadata == [ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ] + assert formatter.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ] + assert formatter.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + } + + def test_result_formatter_ignores_non_margin_number_blocks(self): + """Formatter ignores number blocks that are not in page margins.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [400, 400, 428, 430], + "score": 0.88, + } + ] + ] + + formatter.process(grouped_results) + + parsed = json.loads(formatter.process(grouped_results)[0]) + assert "layout_index" not in parsed[0][0] + assert "layout_score" not in parsed[0][0] + assert formatter.page_metadata == [] + assert formatter.page_number_candidates == [] + assert formatter.document_page_numbering is None + + def test_result_formatter_accepts_roman_number_candidates(self): + """Formatter preserves Roman numeral number candidates.""" + from glmocr.postprocess import ResultFormatter + from glmocr.config import ResultFormatterConfig + + formatter = ResultFormatter( + ResultFormatterConfig(detect_printed_page_numbers=True) + ) + grouped_results = [ + [ + { + "index": 7, + "label": "number", + "content": "iv", + "bbox_2d": [944, 12, 972, 42], + "score": 0.75, + } + ] + ] + + 
formatter.process(grouped_results) + + assert formatter.page_number_candidates[0]["layout_index"] == 7 + assert formatter.page_number_candidates[0]["layout_score"] == 0.75 + assert formatter.page_metadata == [ + { + "page_index": 0, + "printed_page_label": "iv", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.75, + } + ] + assert formatter.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "iv", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.75, + "numeric_like": False, + "roman_like": True, + } + ] + assert formatter.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "roman", + "page_offset": None, + "candidate_pages": 1, + } + class TestMaaSClient: """Tests for MaaSClient.""" @@ -845,6 +1090,17 @@ def test_no_env_returns_empty(self, monkeypatch): assert _collect_env_overrides() == {} + def test_detect_printed_page_numbers_env_var(self, monkeypatch): + """Printed page detection can be enabled via environment variable.""" + from glmocr.config import _collect_env_overrides + + monkeypatch.setenv("GLMOCR_DETECT_PRINTED_PAGE_NUMBERS", "true") + overrides = _collect_env_overrides() + assert ( + overrides["pipeline"]["result_formatter"]["detect_printed_page_numbers"] + is True + ) + class TestFromEnv: """Tests for GlmOcrConfig.from_env() – full priority chain.""" @@ -981,6 +1237,127 @@ def test_to_json_unicode_preserved(self): # ensure_ascii=False by default → raw CJK characters assert "中文测试" in s + def test_to_dict_includes_printed_page_fields(self): + r = self._make_result( + page_metadata=[ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ], + page_number_candidates=[ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 
972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ], + document_page_numbering={ + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + }, + ) + d = r.to_dict() + assert d["page_metadata"][0]["printed_page_label"] == "12" + assert d["page_number_candidates"][0]["label"] == "number" + assert d["document_page_numbering"]["page_offset"] == 12 + + def test_save_wraps_json_with_printed_page_fields(self): + r = self._make_result( + original_images=["paper.pdf"], + page_metadata=[ + { + "page_index": 0, + "printed_page_label": "12", + "printed_page_block_index": 7, + "printed_page_bbox_2d": [944, 12, 972, 42], + "printed_page_confidence": 0.88, + } + ], + page_number_candidates=[ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ], + document_page_numbering={ + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + }, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert "json_result" in saved + assert saved["page_metadata"][0]["printed_page_label"] == "12" + assert saved["page_number_candidates"][0]["label"] == "number" + assert saved["document_page_numbering"]["page_offset"] == 12 + + def test_save_keeps_legacy_json_shape_without_printed_page_data(self): + r = self._make_result(original_images=["paper.pdf"]) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert isinstance(saved, list) + + def 
test_save_keeps_legacy_json_shape_when_detection_has_no_hits(self): + r = self._make_result( + original_images=["paper.pdf"], + page_metadata=[], + page_number_candidates=[], + document_page_numbering=None, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved = json.loads(Path(tmp_dir, "paper", "paper.json").read_text("utf-8")) + + assert isinstance(saved, list) + + def test_save_supports_nested_image_asset_paths_and_bytes(self): + from io import BytesIO + + from PIL import Image + + buf = BytesIO() + Image.new("RGB", (4, 4), color=(255, 0, 0)).save(buf, format="PNG") + payload = buf.getvalue() + + r = self._make_result( + original_images=["paper.pdf"], + image_files={"imgs_embedded/test.png": payload}, + ) + + with tempfile.TemporaryDirectory() as tmp_dir: + r.save(output_dir=tmp_dir, save_layout_visualization=False) + saved_path = Path(tmp_dir, "paper", "imgs_embedded", "test.png") + assert saved_path.exists() + assert saved_path.read_bytes() == payload + def test_repr(self): r = self._make_result() assert "PipelineResult" in repr(r) @@ -1251,6 +1628,93 @@ def test_parse_stream_selfhosted_delegates(self): preserve_order=True, ) + def test_maas_response_includes_printed_page_metadata_when_enabled(self): + """MaaS conversion derives printed page data from number blocks.""" + from glmocr.api import GlmOcr + from glmocr.config import GlmOcrConfig, ResultFormatterConfig + + parser = object.__new__(GlmOcr) + parser._use_maas = True + parser._pipeline = None + parser._maas_client = MagicMock() + parser.config_model = GlmOcrConfig() + parser.config_model.pipeline.result_formatter = ResultFormatterConfig( + detect_printed_page_numbers=True + ) + + response = { + "md_results": "", + "layout_details": [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [1926, 32, 1982, 111], + "score": 0.88, + } + ] + ], + "data_info": {"pages": [{"width": 2040, "height": 2640}]}, + } + + result 
= parser._maas_response_to_pipeline_result(response, "paper.pdf") + + assert result.page_number_candidates == [ + { + "page_index": 0, + "label": "number", + "content": "12", + "layout_index": 7, + "bbox_2d": [944, 12, 972, 42], + "layout_score": 0.88, + "numeric_like": True, + "roman_like": False, + } + ] + assert result.document_page_numbering == { + "strategy": "visual_sequence", + "confidence": 1.0, + "sequence_type": "arabic", + "page_offset": 12, + "candidate_pages": 1, + } + assert result.page_metadata[0]["printed_page_label"] == "12" + + def test_maas_response_feature_off_keeps_json_result_lean(self): + """MaaS conversion does not leak broad layout metadata when feature is off.""" + from glmocr.api import GlmOcr + from glmocr.config import GlmOcrConfig + + parser = object.__new__(GlmOcr) + parser._use_maas = True + parser._pipeline = None + parser._maas_client = MagicMock() + parser.config_model = GlmOcrConfig() + + response = { + "md_results": "", + "layout_details": [ + [ + { + "index": 7, + "label": "number", + "content": "12", + "bbox_2d": [1926, 32, 1982, 111], + "score": 0.88, + } + ] + ], + "data_info": {"pages": [{"width": 2040, "height": 2640}]}, + } + + result = parser._maas_response_to_pipeline_result(response, "paper.pdf") + + block = result.json_result[0][0] + assert block["native_label"] == "number" + assert "layout_index" not in block + assert "layout_score" not in block + class TestGlmOcrConstructor: """Tests for GlmOcr.__init__ kwarg handling (config assembly only).""" @@ -1308,6 +1772,658 @@ def test_selfhosted_model_kwarg_is_forwarded_to_ocr_api(self, monkeypatch): assert parser.config_model.pipeline.ocr_api.model == "glm-ocr" parser.close() + def test_detect_printed_page_numbers_kwarg_is_forwarded(self, monkeypatch): + """Public constructor flag enables printed page detection in config.""" + from glmocr.config import _ENV_MAP, ENV_PREFIX + + for suffix in _ENV_MAP: + monkeypatch.delenv(f"{ENV_PREFIX}{suffix}", raising=False) + 
monkeypatch.setattr("glmocr.config._find_dotenv", lambda: None) + + with patch("glmocr.maas_client.MaaSClient") as mock_maas: + mock_maas.return_value.start = MagicMock() + from glmocr.api import GlmOcr + + parser = GlmOcr(api_key="sk-test", detect_printed_page_numbers=True) + assert ( + parser.config_model.pipeline.result_formatter.detect_printed_page_numbers + is True + ) + parser.close() + + +class TestImageAssetExport: + """Tests for SDK-owned image asset export.""" + + def test_export_image_assets_prefers_embedded_pdf_image(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + } + ] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "embedded" + assert block["image_path"].startswith("imgs_embedded/") + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert block["rendered_image_path"].startswith("imgs_rendered/") + assert any(path.startswith("imgs_embedded/") for path in image_files) + assert any(path.startswith("imgs_rendered/") for path in image_files) + assert "imgs_embedded/" in 
updated_md + + def test_export_image_assets_falls_back_to_rendered_when_no_embedded_match(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + bbox = [100, 100, 500, 500] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + } + ] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "rendered" + assert block["image_path"].startswith("imgs_rendered/") + assert block["rendered_image_path"].startswith("imgs_rendered/") + assert block["embedded_image_path"] is None + assert all(path.startswith("imgs_rendered/") for path in image_files) + assert "imgs_rendered/" in updated_md + + def test_export_image_assets_rendered_only_mode_uses_imgs_rendered(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + bbox = [100, 100, 500, 500] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + } + ] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=False, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = 
updated_json[0][0] + assert block["image_asset_source"] == "rendered" + assert block["image_path"].startswith("imgs_rendered/") + assert block["rendered_image_path"] == block["image_path"] + assert block["embedded_image_path"] is None + assert all(path.startswith("imgs_rendered/") for path in image_files) + assert "imgs_rendered/" in updated_md + + def test_export_image_assets_prefers_rendered_when_configured(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [{"index": 0, "label": "image", "bbox_2d": bbox, "content": None}] + ] + markdown = f"![](page=0,bbox={bbox})" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="rendered", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_asset_source"] == "rendered" + assert block["image_path"].startswith("imgs_rendered/") + assert block["rendered_image_path"].startswith("imgs_rendered/") + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert "imgs_rendered/" in updated_md + + def test_preserve_rendered_assets_uses_rendered_path_over_image_path(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + 
Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + rendered = Image.new("RGB", (32, 24), color=(10, 20, 30)) + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_embedded/existing.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/existing.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/existing.png)" + + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=False, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={"rendered_page0_idx0.jpg": rendered}, + ) + + block = updated_json[0][0] + assert block["image_path"] == "imgs_rendered/rendered_page0_idx0.jpg" + assert ( + block["rendered_image_path"] == "imgs_rendered/rendered_page0_idx0.jpg" + ) + assert block["embedded_image_path"] is None + assert "imgs_rendered/rendered_page0_idx0.jpg" in image_files + assert updated_md == "![Image](imgs_rendered/rendered_page0_idx0.jpg)" + + def test_preserve_rendered_assets_missing_key_keeps_block_coherent(self): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_embedded/existing.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/existing.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/existing.png)" + + updated_json, updated_md, image_files = 
export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=False, + markdown_image_preference="embedded", + image_match_iou_threshold=0.5, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert block["image_path"].startswith("imgs_rendered/") + assert ( + block["rendered_image_path"] == "imgs_rendered/rendered_page0_idx0.jpg" + ) + assert block["embedded_image_path"] is None + assert block["image_asset_source"] == "rendered" + assert "imgs_rendered/rendered_page0_idx0.jpg" in image_files + assert "imgs_rendered/rendered_page0_idx0.jpg" in updated_md + + def test_preferred_mode_missing_rendered_key_does_not_leak_helper_fields(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + "image_path": "imgs_embedded/stale.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/stale.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/stale.png)" + + updated_json, _, _ = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + 
rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert "_needs_rendered_export" not in block + assert "_previous_image_path" not in block + + def test_preferred_mode_render_recovery_failure_keeps_existing_asset_state(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + "image_path": "imgs_embedded/stale.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/stale.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/stale.png)" + + with patch( + "glmocr.utils.image_asset_utils.crop_image_region", + side_effect=RuntimeError("crop failed"), + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert block["image_path"].startswith("imgs_embedded/") + assert block["rendered_image_path"] is None + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert block["image_asset_source"] == "embedded" + assert 
"imgs_embedded/" in updated_md + assert all( + not path.startswith("imgs_rendered/rendered_page0_idx0.jpg") + for path in image_files + ) + + def test_preferred_mode_render_failure_without_embedded_match_does_not_advertise_stale_rendered_asset( + self, + ): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": None, + "image_asset_source": "rendered", + "_needs_rendered_export": True, + } + ] + ] + markdown = "![Image 0-0](imgs_rendered/rendered_page0_idx0.jpg)" + + with ( + patch( + "glmocr.utils.image_asset_utils._load_render_pages", + return_value=[Image.new("RGB", (200, 200), color=(255, 255, 255))], + ), + patch( + "glmocr.utils.image_asset_utils.crop_image_region", + side_effect=RuntimeError("crop failed"), + ), + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_path"] is None + assert block["rendered_image_path"] is None + assert block["embedded_image_path"] is None + assert block["image_asset_source"] == "rendered" + assert "imgs_rendered/rendered_page0_idx0.jpg" not in updated_md + assert image_files == {} + + def test_preferred_mode_no_render_pages_does_not_advertise_rendered_asset(self): + from io import BytesIO + + import fitz + from PIL import Image + + from glmocr.utils.image_asset_utils import 
export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + pdf_path = Path(tmp_dir, "sample.pdf") + + img_buf = BytesIO() + Image.new("RGB", (80, 60), color=(0, 128, 255)).save(img_buf, format="PNG") + img_bytes = img_buf.getvalue() + + doc = fitz.open() + page = doc.new_page(width=300, height=300) + rect = fitz.Rect(60, 70, 180, 170) + page.insert_image(rect, stream=img_bytes) + doc.save(pdf_path) + doc.close() + + bbox = [200, 233, 600, 567] + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": bbox, + "content": None, + "image_path": "imgs_embedded/stale.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": "imgs_embedded/stale.png", + "image_asset_source": "embedded", + } + ] + ] + markdown = "![Image](imgs_embedded/stale.png)" + + with patch( + "glmocr.utils.image_asset_utils._load_render_pages", return_value=[] + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(pdf_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + rendered_images={ + "other.jpg": Image.new("RGB", (4, 4), color=(1, 2, 3)) + }, + ) + + block = updated_json[0][0] + assert block["image_path"].startswith("imgs_embedded/") + assert block["rendered_image_path"] is None + assert block["embedded_image_path"].startswith("imgs_embedded/") + assert block["image_asset_source"] == "embedded" + assert "imgs_embedded/embedded_page0_idx0_xref" in updated_md + assert all( + not path.startswith("imgs_rendered/rendered_page0_idx0.jpg") + for path in image_files + ) + + def test_preferred_mode_no_render_pages_and_no_embedded_match_does_not_point_to_stale_rendered_path( + self, + ): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, 
"sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": None, + "image_asset_source": "rendered", + "_needs_rendered_export": True, + } + ] + ] + markdown = "![Image](imgs_rendered/rendered_page0_idx0.jpg)" + + with patch( + "glmocr.utils.image_asset_utils._load_render_pages", return_value=[] + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_path"] is None + assert block["rendered_image_path"] is None + assert block["embedded_image_path"] is None + assert block["image_asset_source"] == "rendered" + assert "imgs_rendered/rendered_page0_idx0.jpg" not in updated_md + assert image_files == {} + + def test_preferred_mode_embedded_markdown_origin_is_removed_when_no_asset_survives( + self, + ): + from PIL import Image + + from glmocr.utils.image_asset_utils import export_image_assets + + with tempfile.TemporaryDirectory() as tmp_dir: + img_path = Path(tmp_dir, "sample.png") + Image.new("RGB", (200, 200), color=(255, 255, 255)).save(img_path) + + json_result = [ + [ + { + "index": 0, + "label": "image", + "bbox_2d": [100, 100, 500, 500], + "content": None, + "image_path": "imgs_embedded/original.png", + "rendered_image_path": "imgs_rendered/rendered_page0_idx0.jpg", + "embedded_image_path": None, + "image_asset_source": "embedded", + "_needs_rendered_export": True, + "_previous_image_path": "imgs_embedded/original.png", + } + ] + ] + markdown = "![Image 0-0](imgs_embedded/original.png)" + + with 
patch( + "glmocr.utils.image_asset_utils._load_render_pages", return_value=[] + ): + updated_json, updated_md, image_files = export_image_assets( + json_result, + markdown, + str(img_path), + enable_image_asset_export=True, + markdown_image_preference="embedded", + image_match_iou_threshold=0.3, + image_match_containment_threshold=0.8, + rendered_image_dpi=300, + ) + + block = updated_json[0][0] + assert block["image_path"] is None + assert block["rendered_image_path"] is None + assert block["embedded_image_path"] is None + assert "imgs_embedded/original.png" not in updated_md + assert "imgs_rendered/rendered_page0_idx0.jpg" not in updated_md + assert image_files == {} + class TestOCRClientOllamaConfig: """Tests for OCRClient initialization with Ollama api_mode.""" diff --git a/glmocr/utils/image_asset_utils.py b/glmocr/utils/image_asset_utils.py new file mode 100644 index 0000000..cbe6f52 --- /dev/null +++ b/glmocr/utils/image_asset_utils.py @@ -0,0 +1,673 @@ +"""Image asset export utilities. + +Optional SDK-owned export of rendered and embedded image assets for layout image +regions. Embedded PDF images are matched geometrically; rendered crops remain the +fallback. 
+""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + +from PIL import Image + +from glmocr.utils.image_utils import crop_image_region, pdf_to_images_pil +from glmocr.utils.logging import get_logger + +logger = get_logger(__name__) + +try: + import fitz +except Exception: # pragma: no cover + fitz = None # type: ignore[assignment] + + +def export_image_assets( + json_result: List[list], + markdown_result: str, + source: str, + *, + enable_image_asset_export: bool, + markdown_image_preference: str, + image_match_iou_threshold: float, + image_match_containment_threshold: float, + rendered_image_dpi: int, + rendered_images: Optional[Dict[str, Any]] = None, +) -> Tuple[List[list], str, Dict[str, Any]]: + """Return updated JSON/Markdown plus exportable image assets. + + When disabled: + - preserve existing rendered assets if already present (self-hosted) + - otherwise create rendered crops under ``imgs_rendered/`` (MaaS / bbox-only path) + + When enabled: + - export rendered crops under ``imgs_rendered/`` + - try to export matched embedded images under ``imgs_embedded/`` + - point JSON/Markdown to the preferred asset + """ + has_images = any( + region.get("label") == "image" + for page in json_result + if isinstance(page, list) + for region in page + if isinstance(region, dict) + ) + if not has_images: + return json_result, markdown_result, rendered_images or {} + + if not enable_image_asset_export: + return _create_rendered_only_assets( + json_result, + markdown_result, + source, + rendered_image_dpi=rendered_image_dpi, + rendered_images=rendered_images, + ) + + return _create_preferred_assets( + json_result, + markdown_result, + source, + markdown_image_preference=markdown_image_preference, + image_match_iou_threshold=image_match_iou_threshold, + image_match_containment_threshold=image_match_containment_threshold, + rendered_image_dpi=rendered_image_dpi, + rendered_images=rendered_images, + ) + + 
+def _create_rendered_only_assets( + json_result: List[list], + markdown_result: str, + source: str, + *, + rendered_image_dpi: int, + rendered_images: Optional[Dict[str, Any]] = None, +) -> Tuple[List[list], str, Dict[str, Any]]: + if rendered_images: + json_result, markdown_result, image_files = _preserve_rendered_assets( + json_result, markdown_result, rendered_images + ) + if not _has_pending_rendered_assets(json_result): + return ( + _strip_internal_image_fields(json_result), + markdown_result, + image_files, + ) + else: + image_files = {} + + loaded_images = _load_render_pages(source, rendered_image_dpi) + if not loaded_images: + return ( + _strip_internal_image_fields(_fill_missing_image_asset_fields(json_result)), + markdown_result, + image_files, + ) + + image_counter = 0 + updated_json: List[list] = [] + + for page_idx, page in enumerate(json_result): + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if ( + not isinstance(region, dict) + or region.get("label") != "image" + or page_idx >= len(loaded_images) + ): + page_copy.append(region) + continue + + region_copy = dict(region) + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + bbox = region.get("bbox_2d") + polygon = region.get("polygon") + previous_image_path = region_copy.get("image_path") + previous_embedded_path = region_copy.get("embedded_image_path") + previous_asset_source = region_copy.get("image_asset_source", "rendered") + if bbox and ( + region_copy.get("_needs_rendered_export") + or not region_copy.get("rendered_image_path") + ): + try: + cropped = crop_image_region(loaded_images[page_idx], bbox, polygon) + rel_path = region_copy.get("rendered_image_path") or ( + f"imgs_rendered/rendered_page{page_idx}_idx{region.get('index', image_counter)}.jpg" + ) + 
image_files[rel_path] = cropped + region_copy["image_path"] = rel_path + region_copy["rendered_image_path"] = rel_path + region_copy["embedded_image_path"] = None + region_copy["image_asset_source"] = "rendered" + replace_region = dict(region) + if region_copy.get("_previous_image_path"): + replace_region["image_path"] = region_copy[ + "_previous_image_path" + ] + markdown_result = _replace_markdown_image_reference( + markdown_result, + replace_region, + page_idx, + bbox, + rel_path, + ) + image_counter += 1 + except Exception as e: + logger.warning( + "Failed to render image asset (page=%d, bbox=%s): %s", + page_idx, + bbox, + e, + ) + region_copy["image_path"] = previous_image_path + region_copy["rendered_image_path"] = None + region_copy["embedded_image_path"] = previous_embedded_path + region_copy["image_asset_source"] = previous_asset_source + region_copy.pop("_needs_rendered_export", None) + region_copy.pop("_previous_image_path", None) + page_copy.append(region_copy) + updated_json.append(page_copy) + + return _strip_internal_image_fields(updated_json), markdown_result, image_files + + +def _create_preferred_assets( + json_result: List[list], + markdown_result: str, + source: str, + *, + markdown_image_preference: str, + image_match_iou_threshold: float, + image_match_containment_threshold: float, + rendered_image_dpi: int, + rendered_images: Optional[Dict[str, Any]] = None, +) -> Tuple[List[list], str, Dict[str, Any]]: + if rendered_images: + rendered_json, markdown_result, image_files = _preserve_rendered_assets( + json_result, markdown_result, rendered_images + ) + if _has_pending_rendered_assets(rendered_json): + rendered_pages = _load_render_pages(source, rendered_image_dpi) + else: + rendered_pages = [] + json_result = rendered_json + else: + rendered_pages = _load_render_pages(source, rendered_image_dpi) + image_files = {} + embedded_by_page = _inspect_embedded_pdf_images(source) + updated_json: List[list] = [] + + for page_idx, page in 
enumerate(json_result): + if not isinstance(page, list): + updated_json.append(page) + continue + + image_regions = [ + region + for region in page + if isinstance(region, dict) and region.get("label") == "image" + ] + matches = _match_embedded_images( + image_regions, + embedded_by_page.get(page_idx, []), + image_match_iou_threshold=image_match_iou_threshold, + image_match_containment_threshold=image_match_containment_threshold, + ) + + page_copy = [] + for region in page: + if not isinstance(region, dict) or region.get("label") != "image": + page_copy.append(region) + continue + + region_copy = dict(region) + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + bbox = region.get("bbox_2d") + polygon = region.get("polygon") + rendered_rel_path = region_copy.get("rendered_image_path") + embedded_rel_path = None + previous_image_path = region_copy.get("image_path") + render_failed = False + rendered_asset_available = bool( + rendered_rel_path and not region_copy.get("_needs_rendered_export") + ) + + if ( + bbox + and ( + rendered_rel_path is None + or region_copy.get("_needs_rendered_export") + ) + and page_idx < len(rendered_pages) + ): + try: + rendered = crop_image_region( + rendered_pages[page_idx], bbox, polygon + ) + rendered_rel_path = rendered_rel_path or ( + f"imgs_rendered/rendered_page{page_idx}_idx{region.get('index', 0)}.jpg" + ) + image_files[rendered_rel_path] = rendered + rendered_asset_available = True + except Exception as e: + logger.warning( + "Failed to render fallback image asset (page=%d, bbox=%s): %s", + page_idx, + bbox, + e, + ) + render_failed = True + rendered_rel_path = None + + match = matches.get(int(region.get("index", 0))) + if match is not None: + embedded_rel_path = f"imgs_embedded/embedded_page{page_idx}_idx{region.get('index', 0)}_xref{match['xref']}.{match['ext']}" + 
image_files[embedded_rel_path] = match["image_bytes"] + + effective_rendered_path = ( + rendered_rel_path if rendered_asset_available else None + ) + chosen_path = _choose_preferred_path( + embedded_rel_path, + effective_rendered_path, + markdown_image_preference=markdown_image_preference, + ) + + if render_failed and embedded_rel_path is None: + chosen_path = None + effective_rendered_path = None + original_markdown_path = ( + region_copy.get("_previous_image_path") or previous_image_path + ) + if bbox and original_markdown_path: + markdown_result = _replace_markdown_image_reference( + markdown_result, + { + "image_path": original_markdown_path, + "index": region.get("index", 0), + }, + page_idx, + bbox, + "", + ) + elif ( + region_copy.get("_needs_rendered_export") + and not rendered_asset_available + and embedded_rel_path is None + ): + chosen_path = None + original_markdown_path = ( + region_copy.get("_previous_image_path") or previous_image_path + ) + if bbox and original_markdown_path: + markdown_result = _replace_markdown_image_reference( + markdown_result, + { + "image_path": original_markdown_path, + "index": region.get("index", 0), + }, + page_idx, + bbox, + "", + ) + + if chosen_path is not None and bbox: + replace_region = dict(region) + if region_copy.get("_previous_image_path"): + replace_region["image_path"] = region_copy["_previous_image_path"] + elif previous_image_path: + replace_region["image_path"] = previous_image_path + markdown_result = _replace_markdown_image_reference( + markdown_result, replace_region, page_idx, bbox, chosen_path + ) + region_copy["image_path"] = chosen_path + if rendered_asset_available and effective_rendered_path is not None: + region_copy["rendered_image_path"] = effective_rendered_path + else: + region_copy["rendered_image_path"] = None + if render_failed and embedded_rel_path is None: + rendered_rel_path = None + region_copy["embedded_image_path"] = None + region_copy["rendered_image_path"] = None + 
region_copy["image_asset_source"] = "rendered" + region_copy["image_path"] = None + else: + region_copy["embedded_image_path"] = embedded_rel_path + if embedded_rel_path is None and not rendered_asset_available: + region_copy["image_asset_source"] = "rendered" + region_copy["image_path"] = None + else: + region_copy["image_asset_source"] = ( + "embedded" if chosen_path == embedded_rel_path else "rendered" + ) + if render_failed and embedded_rel_path is not None: + region_copy["rendered_image_path"] = None + region_copy.pop("_needs_rendered_export", None) + region_copy.pop("_previous_image_path", None) + page_copy.append(region_copy) + + updated_json.append(page_copy) + + return _strip_internal_image_fields(updated_json), markdown_result, image_files + + +def _load_render_pages(source: str, rendered_image_dpi: int) -> List[Image.Image]: + path = Path(source) + try: + if path.suffix.lower() == ".pdf" and path.is_file(): + return pdf_to_images_pil( + str(path), + dpi=rendered_image_dpi, + max_width_or_height=6000, + ) + if path.is_file(): + img = Image.open(str(path)) + if img.mode != "RGB": + img = img.convert("RGB") + return [img] + except Exception as e: + logger.warning("Cannot load source %s for image asset export: %s", source, e) + return [] + + +def _inspect_embedded_pdf_images(source: str) -> Dict[int, List[Dict[str, Any]]]: + if fitz is None: + return {} + path = Path(source) + if path.suffix.lower() != ".pdf" or not path.is_file(): + return {} + + doc = fitz.open(str(path)) + by_page: Dict[int, List[Dict[str, Any]]] = {} + try: + for page_index in range(len(doc)): + page = doc[page_index] + width = float(page.rect.width) or 1.0 + height = float(page.rect.height) or 1.0 + instances: List[Dict[str, Any]] = [] + for image in page.get_images(full=True): + xref = int(image[0]) + try: + extracted = doc.extract_image(xref) + rects = page.get_image_rects(xref, transform=True) + except Exception: + continue + for placement_idx, placement in enumerate(rects): + rect = 
placement[0] if isinstance(placement, tuple) else placement + bbox_norm = [ + rect.x0 / width, + rect.y0 / height, + rect.x1 / width, + rect.y1 / height, + ] + instances.append( + { + "xref": xref, + "ext": extracted.get("ext", "bin"), + "image_bytes": extracted.get("image", b""), + "width": int(extracted.get("width") or image[2] or 0), + "height": int(extracted.get("height") or image[3] or 0), + "bbox_norm": bbox_norm, + "placement_index": placement_idx, + } + ) + if instances: + by_page[page_index] = instances + finally: + doc.close() + return by_page + + +def _match_embedded_images( + image_regions: List[Dict[str, Any]], + embedded_instances: List[Dict[str, Any]], + *, + image_match_iou_threshold: float, + image_match_containment_threshold: float, +) -> Dict[int, Dict[str, Any]]: + candidates: List[Tuple[float, int, int]] = [] + for region in image_regions: + bbox = region.get("bbox_2d") + if not bbox or len(bbox) != 4: + continue + region_idx = int(region.get("index", 0)) + region_bbox = [coord / 1000.0 for coord in bbox] + region_ar = _bbox_aspect_ratio(region_bbox) + for embedded_idx, embedded in enumerate(embedded_instances): + embedded_bbox = embedded["bbox_norm"] + iou = _bbox_iou(region_bbox, embedded_bbox) + containment = _bbox_containment(region_bbox, embedded_bbox) + if ( + iou < image_match_iou_threshold + and containment < image_match_containment_threshold + ): + continue + embedded_ar = _bbox_aspect_ratio(embedded_bbox) + if not _aspect_ratio_plausible(region_ar, embedded_ar): + continue + area_ratio = _bbox_area_ratio(region_bbox, embedded_bbox) + score = max(iou, containment) + area_ratio * 0.1 + candidates.append((score, region_idx, embedded_idx)) + + candidates.sort(reverse=True) + assigned_regions = set() + assigned_embedded = set() + matches: Dict[int, Dict[str, Any]] = {} + for _, region_idx, embedded_idx in candidates: + if region_idx in assigned_regions or embedded_idx in assigned_embedded: + continue + matches[region_idx] = 
embedded_instances[embedded_idx] + assigned_regions.add(region_idx) + assigned_embedded.add(embedded_idx) + return matches + + +def _bbox_iou(a: List[float], b: List[float]) -> float: + ax0, ay0, ax1, ay1 = a + bx0, by0, bx1, by1 = b + inter_x0 = max(ax0, bx0) + inter_y0 = max(ay0, by0) + inter_x1 = min(ax1, bx1) + inter_y1 = min(ay1, by1) + if inter_x1 <= inter_x0 or inter_y1 <= inter_y0: + return 0.0 + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + union = _bbox_area(a) + _bbox_area(b) - inter_area + return inter_area / union if union > 0 else 0.0 + + +def _bbox_containment(a: List[float], b: List[float]) -> float: + ax0, ay0, ax1, ay1 = a + bx0, by0, bx1, by1 = b + inter_x0 = max(ax0, bx0) + inter_y0 = max(ay0, by0) + inter_x1 = min(ax1, bx1) + inter_y1 = min(ay1, by1) + if inter_x1 <= inter_x0 or inter_y1 <= inter_y0: + return 0.0 + inter_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0) + return max(inter_area / _bbox_area(a), inter_area / _bbox_area(b)) + + +def _bbox_area(bbox: List[float]) -> float: + x0, y0, x1, y1 = bbox + return max(0.0, x1 - x0) * max(0.0, y1 - y0) + + +def _bbox_area_ratio(a: List[float], b: List[float]) -> float: + area_a = _bbox_area(a) + area_b = _bbox_area(b) + if area_a <= 0 or area_b <= 0: + return 0.0 + return min(area_a, area_b) / max(area_a, area_b) + + +def _bbox_aspect_ratio(bbox: List[float]) -> float: + x0, y0, x1, y1 = bbox + width = max(1e-9, x1 - x0) + height = max(1e-9, y1 - y0) + return width / height + + +def _aspect_ratio_plausible(a: float, b: float) -> bool: + ratio = max(a, b) / max(min(a, b), 1e-9) + return ratio <= 2.0 + + +def _choose_preferred_path( + embedded_rel_path: Optional[str], + rendered_rel_path: Optional[str], + *, + markdown_image_preference: str, +) -> Optional[str]: + if markdown_image_preference == "rendered": + return rendered_rel_path or embedded_rel_path + if embedded_rel_path: + return embedded_rel_path + return rendered_rel_path + + +def _replace_markdown_image_reference( + 
markdown_result: str, + region: Dict[str, Any], + page_idx: int, + bbox: List[int], + new_path: str, +) -> str: + old_path = region.get("image_path") + if old_path: + if not new_path: + import re + + pattern = re.compile(rf"!\[[^\]]*\]\({re.escape(old_path)}\)") + return pattern.sub("", markdown_result, count=1) + return markdown_result.replace(f"({old_path})", f"({new_path})", 1) + old_tag = f"![](page={page_idx},bbox={bbox})" + if not new_path: + return markdown_result.replace(old_tag, "", 1) + new_tag = f"![Image {page_idx}-{region.get('index', 0)}]({new_path})" + return markdown_result.replace(old_tag, new_tag, 1) + + +def _preserve_rendered_assets( + json_result: List[list], + markdown_result: str, + rendered_images: Dict[str, Any], +) -> Tuple[List[list], str, Dict[str, Any]]: + updated_json: List[list] = [] + normalized_images: Dict[str, Any] = {} + for page_idx, page in enumerate(json_result): + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if not isinstance(region, dict) or region.get("label") != "image": + page_copy.append(region) + continue + region_copy = dict(region) + image_path = region_copy.get("image_path") + rendered_path = region_copy.get("rendered_image_path") or image_path + if rendered_path: + filename = ( + rendered_path.split("/", 1)[-1] + if isinstance(rendered_path, str) + else None + ) + source_key = None + if isinstance(rendered_path, str) and rendered_path in rendered_images: + source_key = rendered_path + elif filename and filename in rendered_images: + source_key = filename + if source_key: + normalized_images[rendered_path] = rendered_images[source_key] + region_copy["image_path"] = rendered_path + region_copy["rendered_image_path"] = rendered_path + region_copy["embedded_image_path"] = None + region_copy["image_asset_source"] = "rendered" + bbox = region_copy.get("bbox_2d") + if bbox: + markdown_result = _replace_markdown_image_reference( + markdown_result, + region, + 
page_idx, + bbox, + rendered_path, + ) + else: + previous_image_path = region_copy.get("image_path") + region_copy["image_path"] = rendered_path + region_copy["rendered_image_path"] = rendered_path + region_copy["embedded_image_path"] = None + region_copy["image_asset_source"] = "rendered" + region_copy["_needs_rendered_export"] = True + region_copy["_previous_image_path"] = previous_image_path + else: + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + page_copy.append(region_copy) + updated_json.append(page_copy) + return updated_json, markdown_result, normalized_images + + +def _has_pending_rendered_assets(json_result: List[list]) -> bool: + for page in json_result: + if not isinstance(page, list): + continue + for region in page: + if isinstance(region, dict) and region.get("_needs_rendered_export"): + return True + return False + + +def _fill_missing_image_asset_fields(json_result: List[list]) -> List[list]: + updated_json: List[list] = [] + for page in json_result: + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if isinstance(region, dict) and region.get("label") == "image": + region_copy = dict(region) + region_copy.setdefault("image_path", None) + region_copy.setdefault("rendered_image_path", None) + region_copy.setdefault("embedded_image_path", None) + region_copy.setdefault("image_asset_source", "rendered") + page_copy.append(region_copy) + else: + page_copy.append(region) + updated_json.append(page_copy) + return updated_json + + +def _strip_internal_image_fields(json_result: List[list]) -> List[list]: + updated_json: List[list] = [] + for page in json_result: + if not isinstance(page, list): + updated_json.append(page) + continue + page_copy = [] + for region in page: + if isinstance(region, dict): + region_copy = dict(region) + 
region_copy.pop("_needs_rendered_export", None) + region_copy.pop("_previous_image_path", None) + page_copy.append(region_copy) + else: + page_copy.append(region) + updated_json.append(page_copy) + return updated_json